| @@ -0,0 +1,169 @@ | |||
| // Python 2 grammar for Lark | |||
| // NOTE: Work in progress!!! (XXX TODO) | |||
| // This grammar should parse all python 2.x code successfully, | |||
| // but the resulting parse-tree is still not well-organized. | |||
| // Adapted from: https://docs.python.org/2/reference/grammar.html | |||
| // Adapted by: Erez Shinan | |||
| // Start symbols for the grammar: | |||
| // single_input is a single interactive statement; | |||
| // file_input is a module or sequence of commands read from an input file; | |||
| // eval_input is the input for the eval() and input() functions. | |||
| // NB: compound_stmt in single_input is followed by extra _NEWLINE! | |||
// single_input: an empty line, one simple statement, or one compound
// statement followed by an extra _NEWLINE.
single_input: _NEWLINE | simple_stmt | compound_stmt _NEWLINE
// file_input: any mix of blank lines and statements. The leading "?" asks
// Lark to inline the rule when it ends up with a single child.
?file_input: (_NEWLINE | stmt)*
// eval_input: an expression list, optionally followed by one _NEWLINE.
eval_input: testlist _NEWLINE?
// Decorators and function definitions.
decorator: "@" dotted_name [ "(" [arglist] ")" ] _NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef)
funcdef: "def" NAME "(" parameters ")" ":" suite
// parameters may be empty. paramlist lists plain parameters first, then an
// optional "*name", then an optional "**name".
parameters: [paramlist]
paramlist: param ("," param)* ["," [star_params ["," kw_params] | kw_params]]
    | star_params ["," kw_params]
    | kw_params
star_params: "*" NAME
kw_params: "**" NAME
param: fpdef ["=" test]
// fpdef is a name or a parenthesized (possibly nested) list of fpdefs.
fpdef: NAME | "(" fplist ")"
fplist: fpdef ("," fpdef)* [","]
// Statements. A simple_stmt is one or more small statements separated by ";"
// on one line and terminated by _NEWLINE.
?stmt: simple_stmt | compound_stmt
?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE
?small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt
    | import_stmt | global_stmt | exec_stmt | assert_stmt)
// expr_stmt: an augmented assignment (tree node "augassign2"), a chain of "="
// assignments (tree node "assign"), or a bare expression list.
expr_stmt: testlist augassign (yield_expr|testlist) -> augassign2
    | testlist ("=" (yield_expr|testlist))+ -> assign
    | testlist
augassign: ("+=" | "-=" | "*=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=")
// For normal assignments, additional restrictions enforced by the interpreter
// print_stmt: "print" with an optional expression list, or the "print >> ..." form.
print_stmt: "print" ( [ test ("," test)* [","] ] | ">>" test [ ("," test)+ [","] ] )
del_stmt: "del" exprlist
pass_stmt: "pass"
?flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: "break"
continue_stmt: "continue"
return_stmt: "return" [testlist]
yield_stmt: yield_expr
// raise_stmt: up to three comma-separated expressions.
raise_stmt: "raise" [test ["," test ["," test]]]
import_stmt: import_name | import_from
import_name: "import" dotted_as_names
// import_from: the module part is any number of leading dots followed by a
// dotted name, or one or more dots alone.
import_from: "from" ("."* dotted_name | "."+) "import" ("*" | "(" import_as_names ")" | import_as_names)
?import_as_name: NAME ["as" NAME]
?dotted_as_name: dotted_name ["as" NAME]
import_as_names: import_as_name ("," import_as_name)* [","]
dotted_as_names: dotted_as_name ("," dotted_as_name)*
dotted_name: NAME ("." NAME)*
global_stmt: "global" NAME ("," NAME)*
exec_stmt: "exec" expr ["in" test ["," test]]
assert_stmt: "assert" test ["," test]
// Compound statements: each one owns at least one suite.
?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated
if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite]
while_stmt: "while" test ":" suite ["else" ":" suite]
for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite]
// try_stmt: either one or more except clauses (then optional else and
// finally), or a lone finally.
try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite))
with_stmt: "with" with_item ("," with_item)* ":" suite
with_item: test ["as" expr]
// NB compile.c makes sure that the default except clause is last
// The bound name may be introduced with either "as" or ",".
except_clause: "except" [test [("as" | ",") test]]
// suite: a simple statement on the same line, or an indented block delimited
// by _INDENT/_DEDENT; optional _NEWLINE tokens are tolerated after _INDENT
// and after _DEDENT.
suite: simple_stmt | _NEWLINE _INDENT _NEWLINE? stmt+ _DEDENT _NEWLINE?
// Backward compatibility cruft to support:
// [ x for x in lambda: True, lambda: False if x() ]
// even while also allowing:
// lambda x: 5 if x else 2
// (But not a mix of the two)
// old_test / old_lambdef are the variants without the conditional
// ("x if c else y") form; testlist_safe is built from them.
testlist_safe: old_test [("," old_test)+ [","]]
old_test: or_test | old_lambdef
old_lambdef: "lambda" [paramlist] ":" old_test
// Expression rules, from loosest binding (test) to tightest (factor).
// The "?" prefix asks Lark to inline a rule that has a single child, so a
// plain operand does not get wrapped in one tree node per level.
?test: or_test ["if" or_test "else" test] | lambdef
?or_test: and_test ("or" and_test)*
?and_test: not_test ("and" not_test)*
?not_test: "not" not_test | comparison
?comparison: expr (comp_op expr)*
// "<>" is accepted alongside "!="; "not in" and "is not" are two tokens each.
comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not"
?expr: xor_expr ("|" xor_expr)*
?xor_expr: and_expr ("^" and_expr)*
?and_expr: shift_expr ("&" shift_expr)*
?shift_expr: arith_expr (("<<"|">>") arith_expr)*
?arith_expr: term (("+"|"-") term)*
?term: factor (("*"|"/"|"%"|"//") factor)*
?factor: ("+"|"-"|"~") factor | power
// power: a molecule optionally raised to a factor.
?power: molecule ["**" factor]
// _trailer: "(" [arglist] ")" | "[" subscriptlist "]" | "." NAME
// molecule: an atom followed by any number of trailers, written
// left-recursively so every call / subscript / attribute access gets its own
// tree node (func_call, getitem, getattr).
// The subscript list is mandatory: "x[]" is not valid Python, and the
// _trailer rule quoted above requires it as well.
?molecule: molecule "(" [arglist] ")" -> func_call
    | molecule "[" subscriptlist "]" -> getitem
    | molecule "." NAME -> getattr
    | atom
// atom: parenthesized form, list display, dict/set display, backquoted
// expression list, name, number, or one or more adjacent string literals.
?atom: "(" [yield_expr|testlist_comp] ")"
    | "[" [listmaker] "]"
    | "{" [dictorsetmaker] "}"
    | "`" testlist1 "`"
    | NAME | number | string+
// List displays use list_for/list_if; parenthesized forms, dict/set displays
// and call arguments use comp_for/comp_if.
listmaker: test ( list_for | ("," test)* [","] )
?testlist_comp: test ( comp_for | ("," test)* [","] )
lambdef: "lambda" [paramlist] ":" test
?subscriptlist: subscript ("," subscript)* [","]
// subscript: an ellipsis written as three "." tokens, a plain index, or a
// slice with an optional third part (sliceop).
subscript: "." "." "." | test | [test] ":" [test] [sliceop]
sliceop: ":" [test]
?exprlist: expr ("," expr)* [","]
?testlist: test ("," test)* [","]
// dictorsetmaker: first alternative is key ":" value pairs (dict), second is
// bare items (set); each may instead be a comprehension.
dictorsetmaker: ( (test ":" test (comp_for | ("," test ":" test)* [","])) | (test (comp_for | ("," test)* [","])) )
classdef: "class" NAME ["(" [testlist] ")"] ":" suite
// arglist: any number of "argument ," items, then a final argument, "*args"
// (optionally followed by "**kwargs"), or "**kwargs".
arglist: (argument ",")* (argument [","]
    | star_args ["," kw_args]
    | kw_args)
star_args: "*" test
kw_args: "**" test
// The reason that keywords are test nodes instead of NAME is that using NAME
// results in an ambiguity. ast.c makes sure it's a NAME.
argument: test [comp_for] | test "=" test
// list_* clauses take testlist_safe / old_test (no conditional expressions).
list_iter: list_for | list_if
list_for: "for" exprlist "in" testlist_safe [list_iter]
list_if: "if" old_test [list_iter]
comp_iter: comp_for | comp_if
comp_for: "for" exprlist "in" or_test [comp_iter]
comp_if: "if" old_test [comp_iter]
// testlist1: expression list without a trailing comma (used inside backquotes).
testlist1: test ("," test)*
yield_expr: "yield" [testlist]
number: DEC_NUMBER | HEX_NUMBER | OCT_NUMBER | FLOAT | IMAG_NUMBER
string: STRING | LONG_STRING
// Tokens
COMMENT: /\#[^\n]*/
// _NEWLINE: a run of line breaks (each with any tabs/spaces that follow it)
// and comments, collapsed into a single token.
_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+
%ignore /[\t \f]+/ // WS
%ignore /\\\\[\t \f]*\r?\n/ // LINE_CONT
%ignore COMMENT
// STRING / LONG_STRING: optional u/b prefix and optional r prefix, matched
// case-insensitively via (?i); LONG_STRING adds (?s) so "." spans newlines.
// The (?!"") / (?!'') lookaheads keep STRING from matching the start of a
// triple-quoted literal.
STRING : /(?i)[ub]?r?("(?!"").*?(?<!\\\\)(\\\\\\\\)*?"|'(?!'').*?(?<!\\\\)(\\\\\\\\)*?')/
LONG_STRING: /(?i)(?s)[ub]?r?(""".*?(?<!\\\\)(\\\\\\\\)*?"""|'''.*?(?<!\\\\)(\\\\\\\\)*?''')/
// Integer literals, each with an optional l/L suffix. DEC_NUMBER needs a
// leading 1-9, so a bare "0" is matched by OCT_NUMBER. HEX_NUMBER as written
// also matches "0x" with no digits.
// NOTE(review): there is no token for "0b..." binary literals.
DEC_NUMBER: /(?i)[1-9]\d*l?/
HEX_NUMBER: /(?i)0x[\da-f]*l?/
OCT_NUMBER: /(?i)0o?[0-7]*l?/
%import common.FLOAT -> FLOAT
%import common.INT -> _INT
%import common.CNAME -> NAME
IMAG_NUMBER: (_INT | FLOAT) ("j"|"J")
// NOTE(review): placeholder patterns; presumably the real _INDENT/_DEDENT
// tokens are produced by a post-lexing indenter -- confirm against the caller.
_DEDENT: "<DEDENT>"
_INDENT: "<INDENT>"
| @@ -0,0 +1,189 @@ | |||
| // Python 3 grammar for Lark | |||
| // NOTE: Work in progress!!! (XXX TODO) | |||
| // This grammar should parse all python 3.x code successfully, | |||
| // but the resulting parse-tree is still not well-organized. | |||
| // Adapted from: https://docs.python.org/3/reference/grammar.html | |||
| // Adapted by: Erez Shinan | |||
| // Start symbols for the grammar: | |||
| // single_input is a single interactive statement; | |||
| // file_input is a module or sequence of commands read from an input file; | |||
| // eval_input is the input for the eval() function. | |||
| // NB: compound_stmt in single_input is followed by an extra _NEWLINE! | |||
// single_input: an empty line, one simple statement, or one compound
// statement followed by an extra _NEWLINE.
single_input: _NEWLINE | simple_stmt | compound_stmt _NEWLINE
// file_input: any mix of blank lines and statements.
file_input: (_NEWLINE | stmt)*
// eval_input: an expression list followed by any number of _NEWLINE tokens.
eval_input: testlist _NEWLINE*
// Decorators and function definitions.
decorator: "@" dotted_name [ "(" [arguments] ")" ] _NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)
async_funcdef: "async" funcdef
// funcdef: the parameter list and the "->" return annotation are optional.
funcdef: "def" NAME "(" parameters? ")" ["->" test] ":" suite
// parameters: plain (optionally annotated / defaulted) parameters first,
// then an optional "*" section, then an optional "**" parameter.
parameters: paramvalue ("," paramvalue)* ["," [ starparams | kwparams]]
    | starparams
    | kwparams
// starparams: "*" with an optional name, followed by further parameters
// and an optional "**" parameter.
starparams: "*" typedparam? ("," paramvalue)* ["," kwparams]
kwparams: "**" typedparam
?paramvalue: typedparam ["=" test]
?typedparam: NAME [":" test]
// varargslist: parameter list without annotations; used by the lambda rules.
varargslist: (vfpdef ["=" test] ("," vfpdef ["=" test])* ["," [ "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | "**" vfpdef [","]]]
    | "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]]
    | "**" vfpdef [","])
vfpdef: NAME
// Statements. A simple_stmt is one or more small statements separated by ";"
// on one line and terminated by _NEWLINE.
?stmt: simple_stmt | compound_stmt
?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE
?small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
// expr_stmt: a target list followed by an annotation, an augmented
// assignment, or zero or more "=" assignments (zero means a bare expression).
?expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist)
    | ("=" (yield_expr|testlist_star_expr))*)
annassign: ":" test ["=" test]
?testlist_star_expr: (test|star_expr) ("," (test|star_expr))* [","]
// The leading "!" makes Lark keep the operator token in the tree.
!augassign: ("+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=")
// For normal and annotated assignments, additional restrictions enforced by the interpreter
del_stmt: "del" exprlist
pass_stmt: "pass"
?flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: "break"
continue_stmt: "continue"
return_stmt: "return" [testlist]
yield_stmt: yield_expr
raise_stmt: "raise" [test ["from" test]]
// Import statements.
import_stmt: import_name | import_from
import_name: "import" dotted_as_names
// Relative imports start with dots. "..." is also a literal token of this
// grammar (it is an alternative of atom), so the lexer may deliver three
// consecutive dots as a single "..." token; dots therefore accepts both
// "." and "...".
import_from: "from" (dots? dotted_name | dots) "import" ("*" | "(" import_as_names ")" | import_as_names)
// The leading "!" keeps the dot tokens in the tree so the level can be recovered.
!dots: ("." | "...")+
import_as_name: NAME ["as" NAME]
dotted_as_name: dotted_name ["as" NAME]
import_as_names: import_as_name ("," import_as_name)* [","]
dotted_as_names: dotted_as_name ("," dotted_as_name)*
dotted_name: NAME ("." NAME)*
// Name-declaration and assertion statements.
global_stmt: "global" NAME ("," NAME)*
nonlocal_stmt: "nonlocal" NAME ("," NAME)*
assert_stmt: "assert" test ["," test]
// Compound statements: each one owns at least one suite.
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt
// async_stmt: "async" prefix on a function definition, with or for statement.
async_stmt: "async" (funcdef | with_stmt | for_stmt)
if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite]
while_stmt: "while" test ":" suite ["else" ":" suite]
for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite]
// try_stmt: either one or more except clauses (then optional else and
// finally), or a lone finally.
try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite))
with_stmt: "with" with_item ("," with_item)* ":" suite
with_item: test ["as" expr]
// NB compile.c makes sure that the default except clause is last
except_clause: "except" [test ["as" NAME]]
// suite: a simple statement on the same line, or an indented block delimited
// by _INDENT/_DEDENT.
suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT
// Expression rules, from loosest binding (test) to tightest (atom).
// The "?" prefix asks Lark to inline a rule that has a single child, so a
// plain operand does not get wrapped in one tree node per level.
?test: or_test ["if" or_test "else" test] | lambdef
// test_nocond / lambdef_nocond: variants without the conditional expression
// (used by comp_if).
?test_nocond: or_test | lambdef_nocond
lambdef: "lambda" [varargslist] ":" test
lambdef_nocond: "lambda" [varargslist] ":" test_nocond
?or_test: and_test ("or" and_test)*
?and_test: not_test ("and" not_test)*
?not_test: "not" not_test -> not
    | comparison
?comparison: expr (_comp_op expr)*
star_expr: "*" expr
?expr: xor_expr ("|" xor_expr)*
?xor_expr: and_expr ("^" and_expr)*
?and_expr: shift_expr ("&" shift_expr)*
?shift_expr: arith_expr (_shift_op arith_expr)*
?arith_expr: term (_add_op term)*
?term: factor (_mul_op factor)*
?factor: _factor_op factor | power
// Operator helper rules. "!" keeps the operator tokens; the "_" prefix makes
// Lark splice the rule's children into the parent instead of adding a node.
!_factor_op: "+"|"-"|"~"
!_add_op: "+"|"-"
!_shift_op: "<<"|">>"
!_mul_op: "*"|"@"|"/"|"%"|"//"
// <> isn't actually a valid comparison operator in Python. It's here for the
// sake of a __future__ import described in PEP 401 (which really works :-)
!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not"
?power: await_expr ["**" factor]
?await_expr: AWAIT? atom_expr
AWAIT: "await"
// atom_expr: an atom followed by any number of trailers, written
// left-recursively so every call / subscript / attribute access gets its own
// tree node (funccall, getitem, getattr).
?atom_expr: atom_expr "(" [arguments] ")" -> funccall
    | atom_expr "[" subscriptlist "]" -> getitem
    | atom_expr "." NAME -> getattr
    | atom
// atom: parenthesized form, list display, dict/set display, name (tree node
// "var"), number, adjacent string literals, "...", or one of the constants.
?atom: "(" [yield_expr|testlist_comp] ")"
    | "[" [testlist_comp] "]"
    | "{" [dictorsetmaker] "}"
    | NAME -> var
    | number | string+ | "..."
    | "None" -> const_none
    | "True" -> const_true
    | "False" -> const_false
// testlist_comp: a comprehension, or a comma-separated list that may contain
// starred items.
testlist_comp: (test|star_expr) ( comp_for | ("," (test|star_expr))* [","] )
subscriptlist: subscript ("," subscript)* [","]
// subscript: a plain index, or a slice with an optional third part (sliceop).
subscript: test | [test] ":" [test] [sliceop]
sliceop: ":" [test]
exprlist: (expr|star_expr) ("," (expr|star_expr))* [","]
testlist: test ("," test)* [","]
// dictorsetmaker: first alternative is key ":" value pairs or "**" unpacking
// (dict), second is bare or starred items (set); each may instead be a
// comprehension.
dictorsetmaker: ( ((test ":" test | "**" expr) (comp_for | ("," (test ":" test | "**" expr))* [","])) | ((test | star_expr) (comp_for | ("," (test | star_expr))* [","])) )
classdef: "class" NAME ["(" [arguments] ")"] ":" suite
// arguments: positional/keyword values first, then an optional "*" section,
// then an optional "**" argument; or a single bare generator expression.
arguments: argvalue ("," argvalue)* ["," [ starargs | kwargs]]
    | starargs
    | kwargs
    | test comp_for
starargs: "*" test ("," argvalue)* ["," kwargs]
kwargs: "**" test
?argvalue: test ["=" test]
// Comprehension clauses.
// Every comprehension starts with comp_for (see testlist_comp, dictorsetmaker
// and arguments), so "async for" has to be reachable from comp_for itself;
// otherwise "[x async for x in xs]" cannot be parsed. The "?" on comp_for
// inlines the rule when its only child is async_for, so the tree still shows
// a plain async_for node there, while an ordinary "for" clause (which always
// has at least two children) keeps its comp_for node.
comp_iter: comp_for | comp_if
?comp_for: "for" exprlist "in" or_test [comp_iter]
    | async_for
async_for: "async" "for" exprlist "in" or_test [comp_iter]
comp_if: "if" test_nocond [comp_iter]
// not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME
// yield_expr: bare "yield", "yield from <expr>", or "yield <expression list>".
yield_expr: "yield" [yield_arg]
yield_arg: "from" test | testlist
// Literal wrappers over the terminals defined in the token section.
number: DEC_NUMBER | HEX_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER
string: STRING | LONG_STRING
// Tokens
NAME: /[a-zA-Z_]\w*/
COMMENT: /\#[^\n]*/
// _NEWLINE: a run of line breaks (each with any tabs/spaces that follow it)
// and comments, collapsed into a single token.
// NOTE(review): "${COMMENT}" appears to splice the COMMENT pattern into this
// regexp; the Python 2 grammar expresses the same token by composing
// terminals instead -- confirm the Lark version in use supports "${...}".
_NEWLINE: /(\r?\n[\t ]*|${COMMENT})+/
%ignore /[\t \f]+/ // WS
%ignore /\\\\[\t \f]*\r?\n/ // LINE_CONT
%ignore COMMENT
// Earlier variants of the string patterns (no (?i) flag, fewer backslashes),
// kept for reference:
// STRING : /[ub]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/
// LONG_STRING: /(?s)[ub]?r?(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/
// STRING / LONG_STRING: optional u/b prefix and optional r prefix, matched
// case-insensitively via (?i); LONG_STRING adds (?s) so "." spans newlines.
STRING : /(?i)[ub]?r?("(?!"").*?(?<!\\\\)(\\\\\\\\)*?"|'(?!'').*?(?<!\\\\)(\\\\\\\\)*?')/
LONG_STRING: /(?i)(?s)[ub]?r?(""".*?(?<!\\\\)(\\\\\\\\)*?"""|'''.*?(?<!\\\\)(\\\\\\\\)*?''')/
// Integer literals. DEC_NUMBER needs a leading 1-9, so a bare "0" is matched
// by OCT_NUMBER. HEX_NUMBER as written also matches "0x" with no digits.
// NOTE(review): the optional l/L suffix and a leading-zero octal form are
// accepted here although Python 3 does not allow them; there is no token for
// "0b..." literals or for "_" digit separators.
DEC_NUMBER: /(?i)[1-9]\d*l?/
HEX_NUMBER: /(?i)0x[\da-f]*l?/
OCT_NUMBER: /(?i)0o?[0-7]*l?/
// FLOAT_NUMBER: digits with a decimal point and optional exponent, or digits
// with a mandatory exponent.
FLOAT_NUMBER: /(?i)((\d+\.\d*|\.\d+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/
// IMAG_NUMBER: an integer or a FLOAT_NUMBER followed by j/J.
IMAG_NUMBER: /(?i)\d+j|${FLOAT_NUMBER}j/
// NOTE(review): placeholder patterns; presumably the real _INDENT/_DEDENT
// tokens are produced by a post-lexing indenter -- confirm against the caller.
_DEDENT: "<DEDENT>"
_INDENT: "<INDENT>"
| @@ -0,0 +1,85 @@ | |||
| # | |||
| # This example demonstrates usage of the included Python grammars | |||
| # | |||
| import sys | |||
| import os, os.path | |||
| from io import open | |||
| import glob, time | |||
| from lark import Lark | |||
| from lark.indenter import Indenter | |||
# Directory containing this script; the grammar files are looked up next to it.
__path__ = os.path.dirname(__file__)
class PythonIndenter(Indenter):
    """Indenter settings matching the token names of the Python grammars."""

    # Token types for line breaks and block structure, as named in the grammars.
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    NL_type = '_NEWLINE'

    # Token types of opening and closing brackets.
    # NOTE(review): presumably indentation is not tracked while a bracket is
    # open -- confirm against lark.indenter.Indenter.
    CLOSE_PAREN_types = ['__RPAR', '__RSQB', '__RBRACE']
    OPEN_PAREN_types = ['__LPAR', '__LSQB', '__LBRACE']

    # NOTE(review): presumably the column width a tab character counts for.
    tab_len = 8
def _load_parser(grammar_filename, **options):
    """Build a LALR parser from a grammar file, starting at ``file_input``.

    Extra keyword arguments are forwarded to ``Lark`` unchanged.
    """
    with open(grammar_filename) as grammar_file:
        return Lark(grammar_file, parser='lalr', postlex=PythonIndenter(),
                    start='file_input', **options)


grammar2_filename = os.path.join(__path__, 'python2.g')
grammar3_filename = os.path.join(__path__, 'python3.g')

python_parser2 = _load_parser(grammar2_filename)
python_parser3 = _load_parser(grammar3_filename)
# NOTE(review): despite its name, this parser is also built with
# parser='lalr'; only the lexer option differs from python_parser2.
python_parser2_earley = _load_parser(grammar2_filename, lexer='standard')
def _read(fn, *args):
    """Return the whole content of file ``fn`` decoded as ISO-8859-1.

    Any extra positional arguments are passed straight on to ``open``.
    """
    # ISO-8859-1 maps every byte to a character, so decoding cannot fail
    # whatever encoding the file was actually written in.
    with open(fn, *args, encoding='iso-8859-1') as handle:
        content = handle.read()
    return content
def _get_lib_path():
    """Return the directory that holds the standard library's ``*.py`` files.

    On Windows the path is derived from ``sys.prefix`` (with a separate
    layout for PyPy).  Elsewhere the first ``sys.path`` entry ending in
    ``<major>.<minor>`` is returned; if there is no such entry, the directory
    the ``os`` module was loaded from is used instead of failing with an
    ``IndexError``.
    """
    if os.name == 'nt':
        if 'PyPy' in sys.version:
            return os.path.join(sys.prefix, 'lib-python', sys.winver)
        return os.path.join(sys.prefix, 'Lib')

    version_suffix = '%s.%s' % sys.version_info[:2]
    for entry in sys.path:
        if entry.endswith(version_suffix):
            return entry

    # No versioned entry on sys.path: fall back to where ``os`` lives.
    return os.path.dirname(os.__file__)
def test_python_lib():
    """Parse every top-level ``*.py`` file of the standard library.

    The Python 2 grammar is used when running under Python 2 and the
    Python 3 grammar otherwise.  Each file name is printed before it is
    parsed; the total time is printed at the end.  Any parse error is
    re-raised after printing the offending file.
    """
    path = _get_lib_path()

    # The choice of parser depends only on the running interpreter, so make
    # it once: ``xrange`` exists only on Python 2.
    try:
        xrange
    except NameError:
        parser = python_parser3
    else:
        parser = python_parser2

    start = time.time()
    files = glob.glob(path+'/*.py')
    for f in files:
        print( f )
        try:
            # glob already returns paths that include ``path``; joining them
            # onto ``path`` again would duplicate a relative prefix.
            parser.parse(_read(f) + '\n')
        except:
            # Deliberately broad: only report the file, then re-raise.
            print ('At %s' % f)
            raise
    end = time.time()
    print( "test_python_lib (%d files), time: %s secs"%(len(files), end-start) )
def test_earley_equals_lalr():
    """Check that both Python 2 parsers build equal trees for each stdlib file.

    Every top-level ``*.py`` file of the standard library is parsed with
    ``python_parser2`` and ``python_parser2_earley``; the resulting trees
    must compare equal.

    Raises:
        AssertionError: if the two trees differ for some file.
    """
    path = _get_lib_path()
    files = glob.glob(path+'/*.py')
    for f in files:
        print( f )
        # Read once and feed the same text to both parsers. glob already
        # returns paths that include ``path``, so no re-joining is needed.
        source = _read(f) + '\n'
        tree1 = python_parser2.parse(source)
        tree2 = python_parser2_earley.parse(source)
        assert tree1 == tree2, 'Parse trees differ for %s' % f
if __name__ == '__main__':
    # Run the standard-library parsing check when executed as a script.
    test_python_lib()
    # test_earley_equals_lalr()