From 0a21065de1cde03f29a9cfcdc6829769ccdf6724 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 5 Apr 2017 18:58:54 +0300 Subject: [PATCH] Added example Python grammars & parser --- examples/python2.g | 169 ++++++++++++++++++++++++++++++++++ examples/python3.g | 189 ++++++++++++++++++++++++++++++++++++++ examples/python_parser.py | 85 +++++++++++++++++ 3 files changed, 443 insertions(+) create mode 100644 examples/python2.g create mode 100644 examples/python3.g create mode 100644 examples/python_parser.py diff --git a/examples/python2.g b/examples/python2.g new file mode 100644 index 0000000..e8ebda8 --- /dev/null +++ b/examples/python2.g @@ -0,0 +1,169 @@ +// Python 2 grammar for Lark + +// NOTE: Work in progress!!! (XXX TODO) +// This grammar should parse all python 2.x code successfully, +// but the resulting parse-tree is still not well-organized. + +// Adapted from: https://docs.python.org/2/reference/grammar.html +// Adapted by: Erez Shinan + +// Start symbols for the grammar: +// single_input is a single interactive statement; +// file_input is a module or sequence of commands read from an input file; +// eval_input is the input for the eval() and input() functions. +// NB: compound_stmt in single_input is followed by extra _NEWLINE! +single_input: _NEWLINE | simple_stmt | compound_stmt _NEWLINE +?file_input: (_NEWLINE | stmt)* +eval_input: testlist _NEWLINE? 
+ +decorator: "@" dotted_name [ "(" [arglist] ")" ] _NEWLINE +decorators: decorator+ +decorated: decorators (classdef | funcdef) +funcdef: "def" NAME "(" parameters ")" ":" suite +parameters: [paramlist] +paramlist: param ("," param)* ["," [star_params ["," kw_params] | kw_params]] + | star_params ["," kw_params] + | kw_params +star_params: "*" NAME +kw_params: "**" NAME +param: fpdef ["=" test] +fpdef: NAME | "(" fplist ")" +fplist: fpdef ("," fpdef)* [","] + +?stmt: simple_stmt | compound_stmt +?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE +?small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt + | import_stmt | global_stmt | exec_stmt | assert_stmt) +expr_stmt: testlist augassign (yield_expr|testlist) -> augassign2 + | testlist ("=" (yield_expr|testlist))+ -> assign + | testlist + +augassign: ("+=" | "-=" | "*=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=") +// For normal assignments, additional restrictions enforced by the interpreter +print_stmt: "print" ( [ test ("," test)* [","] ] | ">>" test [ ("," test)+ [","] ] ) +del_stmt: "del" exprlist +pass_stmt: "pass" +?flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt +break_stmt: "break" +continue_stmt: "continue" +return_stmt: "return" [testlist] +yield_stmt: yield_expr +raise_stmt: "raise" [test ["," test ["," test]]] +import_stmt: import_name | import_from +import_name: "import" dotted_as_names +import_from: "from" ("."* dotted_name | "."+) "import" ("*" | "(" import_as_names ")" | import_as_names) +?import_as_name: NAME ["as" NAME] +?dotted_as_name: dotted_name ["as" NAME] +import_as_names: import_as_name ("," import_as_name)* [","] +dotted_as_names: dotted_as_name ("," dotted_as_name)* +dotted_name: NAME ("." 
NAME)* +global_stmt: "global" NAME ("," NAME)* +exec_stmt: "exec" expr ["in" test ["," test]] +assert_stmt: "assert" test ["," test] + +?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated +if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite] +while_stmt: "while" test ":" suite ["else" ":" suite] +for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] +try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite)) +with_stmt: "with" with_item ("," with_item)* ":" suite +with_item: test ["as" expr] +// NB compile.c makes sure that the default except clause is last +except_clause: "except" [test [("as" | ",") test]] +suite: simple_stmt | _NEWLINE _INDENT _NEWLINE? stmt+ _DEDENT _NEWLINE? + +// Backward compatibility cruft to support: +// [ x for x in lambda: True, lambda: False if x() ] +// even while also allowing: +// lambda x: 5 if x else 2 +// (But not a mix of the two) +testlist_safe: old_test [("," old_test)+ [","]] +old_test: or_test | old_lambdef +old_lambdef: "lambda" [paramlist] ":" old_test + +?test: or_test ["if" or_test "else" test] | lambdef +?or_test: and_test ("or" and_test)* +?and_test: not_test ("and" not_test)* +?not_test: "not" not_test | comparison +?comparison: expr (comp_op expr)* +comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" +?expr: xor_expr ("|" xor_expr)* +?xor_expr: and_expr ("^" and_expr)* +?and_expr: shift_expr ("&" shift_expr)* +?shift_expr: arith_expr (("<<"|">>") arith_expr)* +?arith_expr: term (("+"|"-") term)* +?term: factor (("*"|"/"|"%"|"//") factor)* +?factor: ("+"|"-"|"~") factor | power +?power: molecule ["**" factor] +// _trailer: "(" [arglist] ")" | "[" subscriptlist "]" | "." NAME +?molecule: molecule "(" [arglist] ")" -> func_call + | molecule "[" [subscriptlist] "]" -> getitem + | molecule "." 
NAME -> getattr
+ | atom
+?atom: "(" [yield_expr|testlist_comp] ")"
+ | "[" [listmaker] "]"
+ | "{" [dictorsetmaker] "}"
+ | "`" testlist1 "`"
+ | NAME | number | string+
+listmaker: test ( list_for | ("," test)* [","] )
+?testlist_comp: test ( comp_for | ("," test)* [","] )
+lambdef: "lambda" [paramlist] ":" test
+?subscriptlist: subscript ("," subscript)* [","]
+subscript: "." "." "." | test | [test] ":" [test] [sliceop]
+sliceop: ":" [test]
+?exprlist: expr ("," expr)* [","]
+?testlist: test ("," test)* [","]
+dictorsetmaker: ( (test ":" test (comp_for | ("," test ":" test)* [","])) | (test (comp_for | ("," test)* [","])) )
+
+classdef: "class" NAME ["(" [testlist] ")"] ":" suite
+
+arglist: (argument ",")* (argument [","]
+ | star_args ["," kw_args]
+ | kw_args)
+
+star_args: "*" test
+kw_args: "**" test
+
+
+// The reason that keywords are test nodes instead of NAME is that using NAME
+// results in an ambiguity. ast.c makes sure it's a NAME.
+argument: test [comp_for] | test "=" test
+
+list_iter: list_for | list_if
+list_for: "for" exprlist "in" testlist_safe [list_iter]
+list_if: "if" old_test [list_iter]
+
+comp_iter: comp_for | comp_if
+comp_for: "for" exprlist "in" or_test [comp_iter]
+comp_if: "if" old_test [comp_iter]
+
+testlist1: test ("," test)*
+
+yield_expr: "yield" [testlist]
+
+number: DEC_NUMBER | HEX_NUMBER | OCT_NUMBER | FLOAT | IMAG_NUMBER
+string: STRING | LONG_STRING
+// Tokens
+
+COMMENT: /\#[^\n]*/
+_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+
+
+%ignore /[\t \f]+/ // WS
+%ignore /\\\\[\t \f]*\r?\n/ // LINE_CONT
+%ignore COMMENT
+
+
+// NOTE(review): the token lines below were garbled in this copy of the patch
+// (spans between '<' and '>' were stripped, e.g. the regex lookbehinds);
+// reconstructed from the lark python2.g grammar -- verify against the
+// original commit before applying.
+STRING : /(?i)[ub]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/
+LONG_STRING.2: /(?i)[ub]?r?(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/s
+DEC_NUMBER: /(?i)[1-9]\d*l?/
+HEX_NUMBER: /(?i)0x[\da-f]*l?/
+OCT_NUMBER: /(?i)0o?[0-7]*l?/
+%import common.FLOAT -> FLOAT
+%import common.INT -> _INT
+%import common.CNAME -> NAME
+IMAG_NUMBER: (_INT | FLOAT) ("j"|"J")
+
+_DEDENT: "<DEDENT>"
+_INDENT: "<INDENT>"
+
diff --git a/examples/python3.g b/examples/python3.g
new file mode 100644
index 0000000..279c268
--- /dev/null
+++ b/examples/python3.g
@@ -0,0 +1,189 @@
+// Python 3 grammar for Lark
+
+// NOTE: Work in progress!!!
(XXX TODO) +// This grammar should parse all python 3.x code successfully, +// but the resulting parse-tree is still not well-organized. + +// Adapted from: https://docs.python.org/3/reference/grammar.html +// Adapted by: Erez Shinan + +// Start symbols for the grammar: +// single_input is a single interactive statement; +// file_input is a module or sequence of commands read from an input file; +// eval_input is the input for the eval() functions. +// NB: compound_stmt in single_input is followed by extra NEWLINE! +single_input: _NEWLINE | simple_stmt | compound_stmt _NEWLINE +file_input: (_NEWLINE | stmt)* +eval_input: testlist _NEWLINE* + +decorator: "@" dotted_name [ "(" [arguments] ")" ] _NEWLINE +decorators: decorator+ +decorated: decorators (classdef | funcdef | async_funcdef) + +async_funcdef: "async" funcdef +funcdef: "def" NAME "(" parameters? ")" ["->" test] ":" suite + +parameters: paramvalue ("," paramvalue)* ["," [ starparams | kwparams]] + | starparams + | kwparams +starparams: "*" typedparam? 
("," paramvalue)* ["," kwparams] +kwparams: "**" typedparam + +?paramvalue: typedparam ["=" test] +?typedparam: NAME [":" test] + +varargslist: (vfpdef ["=" test] ("," vfpdef ["=" test])* ["," [ "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | "**" vfpdef [","]]] + | "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] + | "**" vfpdef [","]) + +vfpdef: NAME + +?stmt: simple_stmt | compound_stmt +?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE +?small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) +?expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) + | ("=" (yield_expr|testlist_star_expr))*) +annassign: ":" test ["=" test] +?testlist_star_expr: (test|star_expr) ("," (test|star_expr))* [","] +!augassign: ("+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=") +// For normal and annotated assignments, additional restrictions enforced by the interpreter +del_stmt: "del" exprlist +pass_stmt: "pass" +?flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt +break_stmt: "break" +continue_stmt: "continue" +return_stmt: "return" [testlist] +yield_stmt: yield_expr +raise_stmt: "raise" [test ["from" test]] +import_stmt: import_name | import_from +import_name: "import" dotted_as_names +// note below: the ("." | "...") is necessary because "..." is tokenized as ELLIPSIS +import_from: "from" (dots? dotted_name | dots) "import" ("*" | "(" import_as_names ")" | import_as_names) +!dots: "."+ +import_as_name: NAME ["as" NAME] +dotted_as_name: dotted_name ["as" NAME] +import_as_names: import_as_name ("," import_as_name)* [","] +dotted_as_names: dotted_as_name ("," dotted_as_name)* +dotted_name: NAME ("." 
NAME)* +global_stmt: "global" NAME ("," NAME)* +nonlocal_stmt: "nonlocal" NAME ("," NAME)* +assert_stmt: "assert" test ["," test] + +compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt +async_stmt: "async" (funcdef | with_stmt | for_stmt) +if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite] +while_stmt: "while" test ":" suite ["else" ":" suite] +for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] +try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite)) +with_stmt: "with" with_item ("," with_item)* ":" suite +with_item: test ["as" expr] +// NB compile.c makes sure that the default except clause is last +except_clause: "except" [test ["as" NAME]] +suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT + +?test: or_test ["if" or_test "else" test] | lambdef +?test_nocond: or_test | lambdef_nocond +lambdef: "lambda" [varargslist] ":" test +lambdef_nocond: "lambda" [varargslist] ":" test_nocond +?or_test: and_test ("or" and_test)* +?and_test: not_test ("and" not_test)* +?not_test: "not" not_test -> not + | comparison +?comparison: expr (_comp_op expr)* +star_expr: "*" expr +?expr: xor_expr ("|" xor_expr)* +?xor_expr: and_expr ("^" and_expr)* +?and_expr: shift_expr ("&" shift_expr)* +?shift_expr: arith_expr (_shift_op arith_expr)* +?arith_expr: term (_add_op term)* +?term: factor (_mul_op factor)* +?factor: _factor_op factor | power + +!_factor_op: "+"|"-"|"~" +!_add_op: "+"|"-" +!_shift_op: "<<"|">>" +!_mul_op: "*"|"@"|"/"|"%"|"//" +// <> isn't actually a valid comparison operator in Python. It's here for the +// sake of a __future__ import described in PEP 401 (which really works :-) +!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" + +?power: await_expr ["**" factor] +?await_expr: AWAIT? 
atom_expr
+AWAIT: "await"
+
+?atom_expr: atom_expr "(" [arguments] ")" -> funccall
+ | atom_expr "[" subscriptlist "]" -> getitem
+ | atom_expr "." NAME -> getattr
+ | atom
+
+?atom: "(" [yield_expr|testlist_comp] ")"
+ | "[" [testlist_comp] "]"
+ | "{" [dictorsetmaker] "}"
+ | NAME -> var
+ | number | string+ | "..."
+ | "None" -> const_none
+ | "True" -> const_true
+ | "False" -> const_false
+
+testlist_comp: (test|star_expr) ( comp_for | ("," (test|star_expr))* [","] )
+subscriptlist: subscript ("," subscript)* [","]
+subscript: test | [test] ":" [test] [sliceop]
+sliceop: ":" [test]
+exprlist: (expr|star_expr) ("," (expr|star_expr))* [","]
+testlist: test ("," test)* [","]
+dictorsetmaker: ( ((test ":" test | "**" expr) (comp_for | ("," (test ":" test | "**" expr))* [","])) | ((test | star_expr) (comp_for | ("," (test | star_expr))* [","])) )
+
+classdef: "class" NAME ["(" [arguments] ")"] ":" suite
+
+arguments: argvalue ("," argvalue)* ["," [ starargs | kwargs]]
+ | starargs
+ | kwargs
+ | test comp_for
+
+starargs: "*" test ("," argvalue)* ["," kwargs]
+kwargs: "**" test
+
+?argvalue: test ["=" test]
+
+
+
+comp_iter: comp_for | comp_if | async_for
+async_for: "async" "for" exprlist "in" or_test [comp_iter]
+comp_for: "for" exprlist "in" or_test [comp_iter]
+comp_if: "if" test_nocond [comp_iter]
+
+// not used in grammar, but may appear in "node" passed from Parser to Compiler
+encoding_decl: NAME
+
+yield_expr: "yield" [yield_arg]
+yield_arg: "from" test | testlist
+
+
+number: DEC_NUMBER | HEX_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER
+string: STRING | LONG_STRING
+// Tokens
+
+NAME: /[a-zA-Z_]\w*/
+COMMENT: /\#[^\n]*/
+_NEWLINE: /(\r?\n[\t ]*|${COMMENT})+/
+
+%ignore /[\t \f]+/ // WS
+%ignore /\\\\[\t \f]*\r?\n/ // LINE_CONT
+%ignore COMMENT
+
+
+
+// STRING : /[ub]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/i
+// NOTE(review): the end of this file was garbled in this copy of the patch
+// (spans between '<' and '>' were stripped). The definitions of STRING,
+// LONG_STRING and the number tokens (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER,
+// FLOAT_NUMBER, IMAG_NUMBER) referenced by the rules above were lost here;
+// restore them from the original commit before applying this patch.
+
+_DEDENT: "<DEDENT>"
+_INDENT: "<INDENT>"
+
diff --git a/examples/python_parser.py b/examples/python_parser.py
new file mode 100644
index 0000000..671d829
--- /dev/null
+++ b/examples/python_parser.py
@@ -0,0 +1,85 @@
+#
+# This example demonstrates usage of the included Python grammars
+#
+
+import sys
+import os, os.path
+from io import open
+import glob, time
+
+from lark import Lark
+from lark.indenter import Indenter
+
+__path__ = os.path.dirname(__file__)
+
+class PythonIndenter(Indenter):
+    NL_type = '_NEWLINE'
+    OPEN_PAREN_types = ['__LPAR', '__LSQB', '__LBRACE']
+    CLOSE_PAREN_types = ['__RPAR', '__RSQB', '__RBRACE']
+    INDENT_type = '_INDENT'
+    DEDENT_type = '_DEDENT'
+    tab_len = 8
+
+
+grammar2_filename = os.path.join(__path__, 'python2.g')
+grammar3_filename = os.path.join(__path__, 'python3.g')
+with open(grammar2_filename) as f:
+    python_parser2 = Lark(f, parser='lalr', postlex=PythonIndenter(), start='file_input')
+with open(grammar3_filename) as f:
+    python_parser3 = Lark(f, parser='lalr', postlex=PythonIndenter(), start='file_input')
+
+# Earley parser over the same Python 2 grammar, used to cross-check the LALR trees
+with open(grammar2_filename) as f:
+    python_parser2_earley = Lark(f, parser='earley', lexer='standard', postlex=PythonIndenter(), start='file_input')
+
+def _read(fn, *args):
+    kwargs = {'encoding': 'iso-8859-1'}
+    with open(fn, *args, **kwargs) as f:
+        return f.read()
+
+def _get_lib_path():
+    if os.name == 'nt':
+        if 'PyPy' in sys.version:
+            return os.path.join(sys.prefix, 'lib-python', sys.winver)
+        else:
+            return os.path.join(sys.prefix, 'Lib')
+    else:
+        return [x for x in sys.path if x.endswith('%s.%s' % sys.version_info[:2])][0]
+
+def test_python_lib():
+
+    path = _get_lib_path()
+
+    start = time.time()
+    files = glob.glob(path+'/*.py')
+    for f in files:
+        print( f )
+        try:
+            # print list(python_parser.lex(_read(os.path.join(path, f)) + '\n'))
+            try:
+                xrange
+            except NameError:
+                python_parser3.parse(_read(os.path.join(path, f)) + '\n')
+            else:
+                python_parser2.parse(_read(os.path.join(path, f)) + '\n')
+        except:
+            print ('At %s' % f)
+            raise
+
+    end = time.time()
+    print( "test_python_lib (%d files), time: %s secs"%(len(files), end-start) )
+
+def test_earley_equals_lalr():
+    path = _get_lib_path()
+
+    files = glob.glob(path+'/*.py')
+    for f in files:
+        print( f )
+        tree1 = python_parser2.parse(_read(os.path.join(path, f)) + '\n')
+        tree2 = python_parser2_earley.parse(_read(os.path.join(path, f)) + '\n')
+        assert tree1 == tree2
+
+
+if __name__ == '__main__':
+    test_python_lib()
+    # test_earley_equals_lalr()