From 0a21065de1cde03f29a9cfcdc6829769ccdf6724 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 5 Apr 2017 18:58:54 +0300 Subject: [PATCH] Added example Python grammars & parser --- examples/python2.g | 169 ++++++++++++++++++++++++++++++++++ examples/python3.g | 189 ++++++++++++++++++++++++++++++++++++++ examples/python_parser.py | 85 +++++++++++++++++ 3 files changed, 443 insertions(+) create mode 100644 examples/python2.g create mode 100644 examples/python3.g create mode 100644 examples/python_parser.py diff --git a/examples/python2.g b/examples/python2.g new file mode 100644 index 0000000..e8ebda8 --- /dev/null +++ b/examples/python2.g @@ -0,0 +1,169 @@ +// Python 2 grammar for Lark + +// NOTE: Work in progress!!! (XXX TODO) +// This grammar should parse all python 2.x code successfully, +// but the resulting parse-tree is still not well-organized. + +// Adapted from: https://docs.python.org/2/reference/grammar.html +// Adapted by: Erez Shinan + +// Start symbols for the grammar: +// single_input is a single interactive statement; +// file_input is a module or sequence of commands read from an input file; +// eval_input is the input for the eval() and input() functions. +// NB: compound_stmt in single_input is followed by extra _NEWLINE! +single_input: _NEWLINE | simple_stmt | compound_stmt _NEWLINE +?file_input: (_NEWLINE | stmt)* +eval_input: testlist _NEWLINE? 
+ +decorator: "@" dotted_name [ "(" [arglist] ")" ] _NEWLINE +decorators: decorator+ +decorated: decorators (classdef | funcdef) +funcdef: "def" NAME "(" parameters ")" ":" suite +parameters: [paramlist] +paramlist: param ("," param)* ["," [star_params ["," kw_params] | kw_params]] + | star_params ["," kw_params] + | kw_params +star_params: "*" NAME +kw_params: "**" NAME +param: fpdef ["=" test] +fpdef: NAME | "(" fplist ")" +fplist: fpdef ("," fpdef)* [","] + +?stmt: simple_stmt | compound_stmt +?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE +?small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt + | import_stmt | global_stmt | exec_stmt | assert_stmt) +expr_stmt: testlist augassign (yield_expr|testlist) -> augassign2 + | testlist ("=" (yield_expr|testlist))+ -> assign + | testlist + +augassign: ("+=" | "-=" | "*=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=") +// For normal assignments, additional restrictions enforced by the interpreter +print_stmt: "print" ( [ test ("," test)* [","] ] | ">>" test [ ("," test)+ [","] ] ) +del_stmt: "del" exprlist +pass_stmt: "pass" +?flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt +break_stmt: "break" +continue_stmt: "continue" +return_stmt: "return" [testlist] +yield_stmt: yield_expr +raise_stmt: "raise" [test ["," test ["," test]]] +import_stmt: import_name | import_from +import_name: "import" dotted_as_names +import_from: "from" ("."* dotted_name | "."+) "import" ("*" | "(" import_as_names ")" | import_as_names) +?import_as_name: NAME ["as" NAME] +?dotted_as_name: dotted_name ["as" NAME] +import_as_names: import_as_name ("," import_as_name)* [","] +dotted_as_names: dotted_as_name ("," dotted_as_name)* +dotted_name: NAME ("." 
NAME)* +global_stmt: "global" NAME ("," NAME)* +exec_stmt: "exec" expr ["in" test ["," test]] +assert_stmt: "assert" test ["," test] + +?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated +if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite] +while_stmt: "while" test ":" suite ["else" ":" suite] +for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] +try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite)) +with_stmt: "with" with_item ("," with_item)* ":" suite +with_item: test ["as" expr] +// NB compile.c makes sure that the default except clause is last +except_clause: "except" [test [("as" | ",") test]] +suite: simple_stmt | _NEWLINE _INDENT _NEWLINE? stmt+ _DEDENT _NEWLINE? + +// Backward compatibility cruft to support: +// [ x for x in lambda: True, lambda: False if x() ] +// even while also allowing: +// lambda x: 5 if x else 2 +// (But not a mix of the two) +testlist_safe: old_test [("," old_test)+ [","]] +old_test: or_test | old_lambdef +old_lambdef: "lambda" [paramlist] ":" old_test + +?test: or_test ["if" or_test "else" test] | lambdef +?or_test: and_test ("or" and_test)* +?and_test: not_test ("and" not_test)* +?not_test: "not" not_test | comparison +?comparison: expr (comp_op expr)* +comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" +?expr: xor_expr ("|" xor_expr)* +?xor_expr: and_expr ("^" and_expr)* +?and_expr: shift_expr ("&" shift_expr)* +?shift_expr: arith_expr (("<<"|">>") arith_expr)* +?arith_expr: term (("+"|"-") term)* +?term: factor (("*"|"/"|"%"|"//") factor)* +?factor: ("+"|"-"|"~") factor | power +?power: molecule ["**" factor] +// _trailer: "(" [arglist] ")" | "[" subscriptlist "]" | "." NAME +?molecule: molecule "(" [arglist] ")" -> func_call + | molecule "[" [subscriptlist] "]" -> getitem + | molecule "." 
NAME -> getattr
+ | atom
+?atom: "(" [yield_expr|testlist_comp] ")"
+ | "[" [listmaker] "]"
+ | "{" [dictorsetmaker] "}"
+ | "`" testlist1 "`"
+ | NAME | number | string+
+listmaker: test ( list_for | ("," test)* [","] )
+?testlist_comp: test ( comp_for | ("," test)* [","] )
+lambdef: "lambda" [paramlist] ":" test
+?subscriptlist: subscript ("," subscript)* [","]
+subscript: "." "." "." | test | [test] ":" [test] [sliceop]
+sliceop: ":" [test]
+?exprlist: expr ("," expr)* [","]
+?testlist: test ("," test)* [","]
+dictorsetmaker: ( (test ":" test (comp_for | ("," test ":" test)* [","])) | (test (comp_for | ("," test)* [","])) )
+
+classdef: "class" NAME ["(" [testlist] ")"] ":" suite
+
+arglist: (argument ",")* (argument [","]
+ | star_args ["," kw_args]
+ | kw_args)
+
+star_args: "*" test
+kw_args: "**" test
+
+
+// The reason that keywords are test nodes instead of NAME is that using NAME
+// results in an ambiguity. ast.c makes sure it's a NAME.
+argument: test [comp_for] | test "=" test
+
+list_iter: list_for | list_if
+list_for: "for" exprlist "in" testlist_safe [list_iter]
+list_if: "if" old_test [list_iter]
+
+comp_iter: comp_for | comp_if
+comp_for: "for" exprlist "in" or_test [comp_iter]
+comp_if: "if" old_test [comp_iter]
+
+testlist1: test ("," test)*
+
+yield_expr: "yield" [testlist]
+
+number: DEC_NUMBER | HEX_NUMBER | OCT_NUMBER | FLOAT | IMAG_NUMBER
+string: STRING | LONG_STRING
+// Tokens
+
+COMMENT: /\#[^\n]*/
+_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+
+
+%ignore /[\t \f]+/ // WS
+%ignore /\\\\[\t \f]*\r?\n/ // LINE_CONT
+%ignore COMMENT
+
+
+// NOTE(review): the token lines below were garbled in this copy of the patch
+// (spans between '<' and '>' were stripped, e.g. the regex lookbehinds);
+// reconstructed from the lark python2.g grammar -- verify against the
+// original commit before applying.
+STRING : /(?i)[ub]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/
+LONG_STRING.2: /(?i)[ub]?r?(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/s
+DEC_NUMBER: /(?i)[1-9]\d*l?/
+HEX_NUMBER: /(?i)0x[\da-f]*l?/
+OCT_NUMBER: /(?i)0o?[0-7]*l?/
+%import common.FLOAT -> FLOAT
+%import common.INT -> _INT
+%import common.CNAME -> NAME
+IMAG_NUMBER: (_INT | FLOAT) ("j"|"J")
+
+_DEDENT: "<DEDENT>"
+_INDENT: "<INDENT>"
+
diff --git a/examples/python3.g b/examples/python3.g
new file mode 100644
index 0000000..279c268
--- /dev/null
+++ b/examples/python3.g
@@ -0,0 +1,189 @@
+// Python 3 grammar for Lark
+
+// NOTE: Work in progress!!!
(XXX TODO) +// This grammar should parse all python 3.x code successfully, +// but the resulting parse-tree is still not well-organized. + +// Adapted from: https://docs.python.org/3/reference/grammar.html +// Adapted by: Erez Shinan + +// Start symbols for the grammar: +// single_input is a single interactive statement; +// file_input is a module or sequence of commands read from an input file; +// eval_input is the input for the eval() functions. +// NB: compound_stmt in single_input is followed by extra NEWLINE! +single_input: _NEWLINE | simple_stmt | compound_stmt _NEWLINE +file_input: (_NEWLINE | stmt)* +eval_input: testlist _NEWLINE* + +decorator: "@" dotted_name [ "(" [arguments] ")" ] _NEWLINE +decorators: decorator+ +decorated: decorators (classdef | funcdef | async_funcdef) + +async_funcdef: "async" funcdef +funcdef: "def" NAME "(" parameters? ")" ["->" test] ":" suite + +parameters: paramvalue ("," paramvalue)* ["," [ starparams | kwparams]] + | starparams + | kwparams +starparams: "*" typedparam? 
("," paramvalue)* ["," kwparams] +kwparams: "**" typedparam + +?paramvalue: typedparam ["=" test] +?typedparam: NAME [":" test] + +varargslist: (vfpdef ["=" test] ("," vfpdef ["=" test])* ["," [ "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | "**" vfpdef [","]]] + | "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] + | "**" vfpdef [","]) + +vfpdef: NAME + +?stmt: simple_stmt | compound_stmt +?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE +?small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) +?expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) + | ("=" (yield_expr|testlist_star_expr))*) +annassign: ":" test ["=" test] +?testlist_star_expr: (test|star_expr) ("," (test|star_expr))* [","] +!augassign: ("+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=") +// For normal and annotated assignments, additional restrictions enforced by the interpreter +del_stmt: "del" exprlist +pass_stmt: "pass" +?flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt +break_stmt: "break" +continue_stmt: "continue" +return_stmt: "return" [testlist] +yield_stmt: yield_expr +raise_stmt: "raise" [test ["from" test]] +import_stmt: import_name | import_from +import_name: "import" dotted_as_names +// note below: the ("." | "...") is necessary because "..." is tokenized as ELLIPSIS +import_from: "from" (dots? dotted_name | dots) "import" ("*" | "(" import_as_names ")" | import_as_names) +!dots: "."+ +import_as_name: NAME ["as" NAME] +dotted_as_name: dotted_name ["as" NAME] +import_as_names: import_as_name ("," import_as_name)* [","] +dotted_as_names: dotted_as_name ("," dotted_as_name)* +dotted_name: NAME ("." 
NAME)* +global_stmt: "global" NAME ("," NAME)* +nonlocal_stmt: "nonlocal" NAME ("," NAME)* +assert_stmt: "assert" test ["," test] + +compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt +async_stmt: "async" (funcdef | with_stmt | for_stmt) +if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite] +while_stmt: "while" test ":" suite ["else" ":" suite] +for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] +try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite)) +with_stmt: "with" with_item ("," with_item)* ":" suite +with_item: test ["as" expr] +// NB compile.c makes sure that the default except clause is last +except_clause: "except" [test ["as" NAME]] +suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT + +?test: or_test ["if" or_test "else" test] | lambdef +?test_nocond: or_test | lambdef_nocond +lambdef: "lambda" [varargslist] ":" test +lambdef_nocond: "lambda" [varargslist] ":" test_nocond +?or_test: and_test ("or" and_test)* +?and_test: not_test ("and" not_test)* +?not_test: "not" not_test -> not + | comparison +?comparison: expr (_comp_op expr)* +star_expr: "*" expr +?expr: xor_expr ("|" xor_expr)* +?xor_expr: and_expr ("^" and_expr)* +?and_expr: shift_expr ("&" shift_expr)* +?shift_expr: arith_expr (_shift_op arith_expr)* +?arith_expr: term (_add_op term)* +?term: factor (_mul_op factor)* +?factor: _factor_op factor | power + +!_factor_op: "+"|"-"|"~" +!_add_op: "+"|"-" +!_shift_op: "<<"|">>" +!_mul_op: "*"|"@"|"/"|"%"|"//" +// <> isn't actually a valid comparison operator in Python. It's here for the +// sake of a __future__ import described in PEP 401 (which really works :-) +!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" + +?power: await_expr ["**" factor] +?await_expr: AWAIT? 
atom_expr
+AWAIT: "await"
+
+?atom_expr: atom_expr "(" [arguments] ")" -> funccall
+ | atom_expr "[" subscriptlist "]" -> getitem
+ | atom_expr "." NAME -> getattr
+ | atom
+
+?atom: "(" [yield_expr|testlist_comp] ")"
+ | "[" [testlist_comp] "]"
+ | "{" [dictorsetmaker] "}"
+ | NAME -> var
+ | number | string+ | "..."
+ | "None" -> const_none
+ | "True" -> const_true
+ | "False" -> const_false
+
+testlist_comp: (test|star_expr) ( comp_for | ("," (test|star_expr))* [","] )
+subscriptlist: subscript ("," subscript)* [","]
+subscript: test | [test] ":" [test] [sliceop]
+sliceop: ":" [test]
+exprlist: (expr|star_expr) ("," (expr|star_expr))* [","]
+testlist: test ("," test)* [","]
+dictorsetmaker: ( ((test ":" test | "**" expr) (comp_for | ("," (test ":" test | "**" expr))* [","])) | ((test | star_expr) (comp_for | ("," (test | star_expr))* [","])) )
+
+classdef: "class" NAME ["(" [arguments] ")"] ":" suite
+
+arguments: argvalue ("," argvalue)* ["," [ starargs | kwargs]]
+ | starargs
+ | kwargs
+ | test comp_for
+
+starargs: "*" test ("," argvalue)* ["," kwargs]
+kwargs: "**" test
+
+?argvalue: test ["=" test]
+
+
+
+comp_iter: comp_for | comp_if | async_for
+async_for: "async" "for" exprlist "in" or_test [comp_iter]
+comp_for: "for" exprlist "in" or_test [comp_iter]
+comp_if: "if" test_nocond [comp_iter]
+
+// not used in grammar, but may appear in "node" passed from Parser to Compiler
+encoding_decl: NAME
+
+yield_expr: "yield" [yield_arg]
+yield_arg: "from" test | testlist
+
+
+number: DEC_NUMBER | HEX_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER
+string: STRING | LONG_STRING
+// Tokens
+
+NAME: /[a-zA-Z_]\w*/
+COMMENT: /\#[^\n]*/
+_NEWLINE: /(\r?\n[\t ]*|${COMMENT})+/
+
+%ignore /[\t \f]+/ // WS
+%ignore /\\\\[\t \f]*\r?\n/ // LINE_CONT
+%ignore COMMENT
+
+
+
+// STRING : /[ub]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/i
+// NOTE(review): the end of this file was garbled in this copy of the patch
+// (spans between '<' and '>' were stripped). The definitions of STRING,
+// LONG_STRING and the number tokens (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER,
+// FLOAT_NUMBER, IMAG_NUMBER) referenced by the rules above were lost here;
+// restore them from the original commit before applying this patch.
+
+_DEDENT: "<DEDENT>"
+_INDENT: "<INDENT>"
+
diff --git a/examples/python_parser.py b/examples/python_parser.py
new file mode 100644
index 0000000..671d829
--- /dev/null
+++ b/examples/python_parser.py
@@ -0,0 +1,85 @@
+#
+# This example demonstrates usage of the included Python grammars
+#
+
+import sys
+import os, os.path
+from io import open
+import glob, time
+
+from lark import Lark
+from lark.indenter import Indenter
+
+__path__ = os.path.dirname(__file__)
+
+class PythonIndenter(Indenter):
+    NL_type = '_NEWLINE'
+    OPEN_PAREN_types = ['__LPAR', '__LSQB', '__LBRACE']
+    CLOSE_PAREN_types = ['__RPAR', '__RSQB', '__RBRACE']
+    INDENT_type = '_INDENT'
+    DEDENT_type = '_DEDENT'
+    tab_len = 8
+
+
+grammar2_filename = os.path.join(__path__, 'python2.g')
+grammar3_filename = os.path.join(__path__, 'python3.g')
+with open(grammar2_filename) as f:
+    python_parser2 = Lark(f, parser='lalr', postlex=PythonIndenter(), start='file_input')
+with open(grammar3_filename) as f:
+    python_parser3 = Lark(f, parser='lalr', postlex=PythonIndenter(), start='file_input')
+
+# Earley parser over the same Python 2 grammar, used to cross-check the LALR trees
+with open(grammar2_filename) as f:
+    python_parser2_earley = Lark(f, parser='earley', lexer='standard', postlex=PythonIndenter(), start='file_input')
+
+def _read(fn, *args):
+    kwargs = {'encoding': 'iso-8859-1'}
+    with open(fn, *args, **kwargs) as f:
+        return f.read()
+
+def _get_lib_path():
+    if os.name == 'nt':
+        if 'PyPy' in sys.version:
+            return os.path.join(sys.prefix, 'lib-python', sys.winver)
+        else:
+            return os.path.join(sys.prefix, 'Lib')
+    else:
+        return [x for x in sys.path if x.endswith('%s.%s' % sys.version_info[:2])][0]
+
+def test_python_lib():
+
+    path = _get_lib_path()
+
+    start = time.time()
+    files = glob.glob(path+'/*.py')
+    for f in files:
+        print( f )
+        try:
+            # print list(python_parser.lex(_read(os.path.join(path, f)) + '\n'))
+            try:
+                xrange
+            except NameError:
+                python_parser3.parse(_read(os.path.join(path, f)) + '\n')
+            else:
+                python_parser2.parse(_read(os.path.join(path, f)) + '\n')
+        except:
+            print ('At %s' % f)
+            raise
+
+    end = time.time()
+    print( "test_python_lib (%d files), time: %s secs"%(len(files), end-start) )
+
+def test_earley_equals_lalr():
+    path = _get_lib_path()
+
+    files = glob.glob(path+'/*.py')
+    for f in files:
+        print( f )
+        tree1 = python_parser2.parse(_read(os.path.join(path, f)) + '\n')
+        tree2 = python_parser2_earley.parse(_read(os.path.join(path, f)) + '\n')
+        assert tree1 == tree2
+
+
+if __name__ == '__main__':
+    test_python_lib()
+    # test_earley_equals_lalr()