Browse Source

Added example Python grammars & parser

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan 7 years ago
parent
commit
0a21065de1
3 changed files with 443 additions and 0 deletions
  1. +169
    -0
      examples/python2.g
  2. +189
    -0
      examples/python3.g
  3. +85
    -0
      examples/python_parser.py

+ 169
- 0
examples/python2.g View File

@@ -0,0 +1,169 @@
// Python 2 grammar for Lark

// NOTE: Work in progress!!! (XXX TODO)
// This grammar should parse all python 2.x code successfully,
// but the resulting parse-tree is still not well-organized.

// Adapted from: https://docs.python.org/2/reference/grammar.html
// Adapted by: Erez Shinan

// Start symbols for the grammar:
// single_input is a single interactive statement;
// file_input is a module or sequence of commands read from an input file;
// eval_input is the input for the eval() and input() functions.
// NB: compound_stmt in single_input is followed by extra _NEWLINE!
// examples/python_parser.py builds its parsers with start='file_input'.
single_input: _NEWLINE | simple_stmt | compound_stmt _NEWLINE
// Blank lines (_NEWLINE) may be freely interleaved with statements.
?file_input: (_NEWLINE | stmt)*
// An expression list, optionally followed by one _NEWLINE.
eval_input: testlist _NEWLINE?

// --- Decorators and function definitions ---
// A decorator line: "@" dotted name, optional call parentheses, then _NEWLINE.
decorator: "@" dotted_name [ "(" [arglist] ")" ] _NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef)
funcdef: "def" NAME "(" parameters ")" ":" suite
// The parameter list may be empty, e.g. "def f():".
parameters: [paramlist]
// Ordinary parameters first, then an optional "*name", then an optional "**name".
paramlist: param ("," param)* ["," [star_params ["," kw_params] | kw_params]]
| star_params ["," kw_params]
| kw_params
star_params: "*" NAME
kw_params: "**" NAME
// A parameter with an optional default value.
param: fpdef ["=" test]
// A parameter is a plain name or a parenthesised (possibly nested) list of names.
fpdef: NAME | "(" fplist ")"
fplist: fpdef ("," fpdef)* [","]

// --- Simple statements ---
?stmt: simple_stmt | compound_stmt
// One or more small statements separated by ";" and terminated by _NEWLINE.
?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE
?small_stmt: (expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt
| import_stmt | global_stmt | exec_stmt | assert_stmt)
// Augmented assignment, (possibly chained) plain assignment, or a bare expression list.
expr_stmt: testlist augassign (yield_expr|testlist) -> augassign2
| testlist ("=" (yield_expr|testlist))+ -> assign
| testlist

augassign: ("+=" | "-=" | "*=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=")
// For normal assignments, additional restrictions enforced by the interpreter
// The print statement: either a plain item list or the ">>" target form.
print_stmt: "print" ( [ test ("," test)* [","] ] | ">>" test [ ("," test)+ [","] ] )
del_stmt: "del" exprlist
pass_stmt: "pass"
?flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: "break"
continue_stmt: "continue"
return_stmt: "return" [testlist]
yield_stmt: yield_expr
// "raise" with up to three comma-separated expressions.
raise_stmt: "raise" [test ["," test ["," test]]]
import_stmt: import_name | import_from
import_name: "import" dotted_as_names
// Relative imports: any number of leading dots, with or without a module name.
import_from: "from" ("."* dotted_name | "."+) "import" ("*" | "(" import_as_names ")" | import_as_names)
?import_as_name: NAME ["as" NAME]
?dotted_as_name: dotted_name ["as" NAME]
import_as_names: import_as_name ("," import_as_name)* [","]
dotted_as_names: dotted_as_name ("," dotted_as_name)*
dotted_name: NAME ("." NAME)*
global_stmt: "global" NAME ("," NAME)*
// The exec statement with its optional "in" clause of one or two expressions.
exec_stmt: "exec" expr ["in" test ["," test]]
assert_stmt: "assert" test ["," test]

// --- Compound statements ---
?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated
if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite]
while_stmt: "while" test ":" suite ["else" ":" suite]
for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite]
// Either one or more except clauses (with optional else/finally), or a lone finally.
try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite))
with_stmt: "with" with_item ("," with_item)* ":" suite
with_item: test ["as" expr]
// NB compile.c makes sure that the default except clause is last
// The exception target may be introduced by either "as" or ",".
except_clause: "except" [test [("as" | ",") test]]
// A suite is a simple statement on the same line, or an indented block
// delimited by _INDENT / _DEDENT; stray _NEWLINE tokens are tolerated
// right after _INDENT and right after _DEDENT.
suite: simple_stmt | _NEWLINE _INDENT _NEWLINE? stmt+ _DEDENT _NEWLINE?

// Backward compatibility cruft to support:
// [ x for x in lambda: True, lambda: False if x() ]
// even while also allowing:
// lambda x: 5 if x else 2
// (But not a mix of the two)
// These "old" rules have no conditional-expression alternative; they are used
// by list_for, list_if and comp_if below.
testlist_safe: old_test [("," old_test)+ [","]]
old_test: or_test | old_lambdef
old_lambdef: "lambda" [paramlist] ":" old_test

// --- Expressions ---
// The rules below chain from the loosest-binding construct (conditional
// expression / lambda) down to atoms; each level delegates to the next.
?test: or_test ["if" or_test "else" test] | lambdef
?or_test: and_test ("or" and_test)*
?and_test: not_test ("and" not_test)*
?not_test: "not" not_test | comparison
?comparison: expr (comp_op expr)*
// Includes the "<>" spelling of inequality and the two-word operators.
comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not"
?expr: xor_expr ("|" xor_expr)*
?xor_expr: and_expr ("^" and_expr)*
?and_expr: shift_expr ("&" shift_expr)*
?shift_expr: arith_expr (("<<"|">>") arith_expr)*
?arith_expr: term (("+"|"-") term)*
?term: factor (("*"|"/"|"%"|"//") factor)*
?factor: ("+"|"-"|"~") factor | power
?power: molecule ["**" factor]
// _trailer: "(" [arglist] ")" | "[" subscriptlist "]" | "." NAME
// molecule is left-recursive: calls, subscripts and attribute accesses are
// applied one after another to an atom.
// NOTE(review): the getitem alternative makes subscriptlist optional, so "x[]"
// is accepted here, unlike the commented-out _trailer above -- confirm intent.
?molecule: molecule "(" [arglist] ")" -> func_call
| molecule "[" [subscriptlist] "]" -> getitem
| molecule "." NAME -> getattr
| atom
// Parenthesised forms, list/dict/set displays, backquoted expressions, names,
// numbers, and one or more adjacent string literals.
?atom: "(" [yield_expr|testlist_comp] ")"
| "[" [listmaker] "]"
| "{" [dictorsetmaker] "}"
| "`" testlist1 "`"
| NAME | number | string+
listmaker: test ( list_for | ("," test)* [","] )
?testlist_comp: test ( comp_for | ("," test)* [","] )
lambdef: "lambda" [paramlist] ":" test
?subscriptlist: subscript ("," subscript)* [","]
// An ellipsis (three separate "." tokens), a plain index, or a slice.
subscript: "." "." "." | test | [test] ":" [test] [sliceop]
sliceop: ":" [test]
?exprlist: expr ("," expr)* [","]
?testlist: test ("," test)* [","]
// First alternative: dict display / dict comprehension; second: set display / set comprehension.
dictorsetmaker: ( (test ":" test (comp_for | ("," test ":" test)* [","])) | (test (comp_for | ("," test)* [","])) )

// Class definition with an optional (possibly empty) parenthesised base list.
classdef: "class" NAME ["(" [testlist] ")"] ":" suite

// Call arguments: zero or more "argument ," pairs followed by either a last
// argument (optional trailing comma), a "*" argument, or a "**" argument.
// Further arguments may follow the "*" argument (e.g. f(*args, key=value)),
// as in the reference Python 2 grammar; the "**" argument, if any, is last.
arglist: (argument ",")* (argument [","]
| star_args ("," argument)* ["," kw_args]
| kw_args)

// The "*expr" and "**expr" forms used by arglist.
star_args: "*" test
kw_args: "**" test


// The reason that keywords are test nodes instead of NAME is that using NAME
// results in an ambiguity. ast.c makes sure it's a NAME.
argument: test [comp_for] | test "=" test

// Clauses of a list comprehension ("[" ... "]"); these use the old_test /
// testlist_safe rules.
list_iter: list_for | list_if
list_for: "for" exprlist "in" testlist_safe [list_iter]
list_if: "if" old_test [list_iter]

// Clauses of generator expressions and dict/set comprehensions.
comp_iter: comp_for | comp_if
comp_for: "for" exprlist "in" or_test [comp_iter]
comp_if: "if" old_test [comp_iter]

// Expression list without a trailing comma; used inside backquotes (see atom).
testlist1: test ("," test)*

yield_expr: "yield" [testlist]

// NOTE(review): there is no terminal for binary literals (0b...) among the
// number alternatives -- confirm whether that omission is intended.
number: DEC_NUMBER | HEX_NUMBER | OCT_NUMBER | FLOAT | IMAG_NUMBER
string: STRING | LONG_STRING
// Tokens

COMMENT: /\#[^\n]*/
// A run of line breaks (each with the following line's leading tabs/spaces)
// and/or comments, collapsed into a single token.
_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+

%ignore /[\t \f]+/ // WS
%ignore /\\\\[\t \f]*\r?\n/ // LINE_CONT
%ignore COMMENT


// Single-line and triple-quoted strings; both are case-insensitive ((?i)) and
// allow an optional u/b prefix followed by an optional r. LONG_STRING adds
// (?s) so that "." also matches line breaks.
STRING : /(?i)[ub]?r?("(?!"").*?(?<!\\\\)(\\\\\\\\)*?"|'(?!'').*?(?<!\\\\)(\\\\\\\\)*?')/
LONG_STRING: /(?i)(?s)[ub]?r?(""".*?(?<!\\\\)(\\\\\\\\)*?"""|'''.*?(?<!\\\\)(\\\\\\\\)*?''')/

// Integer literals with an optional "l" suffix (case-insensitive).
// DEC_NUMBER cannot start with 0; a bare "0" is matched by OCT_NUMBER.
DEC_NUMBER: /(?i)[1-9]\d*l?/
HEX_NUMBER: /(?i)0x[\da-f]*l?/
OCT_NUMBER: /(?i)0o?[0-7]*l?/
%import common.FLOAT -> FLOAT
%import common.INT -> _INT
%import common.CNAME -> NAME
IMAG_NUMBER: (_INT | FLOAT) ("j"|"J")

// These names match DEDENT_type / INDENT_type of PythonIndenter in
// examples/python_parser.py. Presumably the literal patterns are placeholders
// and the tokens are produced by the indenter rather than lexed -- TODO confirm.
_DEDENT: "<DEDENT>"
_INDENT: "<INDENT>"

+ 189
- 0
examples/python3.g View File

@@ -0,0 +1,189 @@
// Python 3 grammar for Lark

// NOTE: Work in progress!!! (XXX TODO)
// This grammar should parse all python 3.x code successfully,
// but the resulting parse-tree is still not well-organized.

// Adapted from: https://docs.python.org/3/reference/grammar.html
// Adapted by: Erez Shinan

// Start symbols for the grammar:
// single_input is a single interactive statement;
// file_input is a module or sequence of commands read from an input file;
// eval_input is the input for the eval() function.
// NB: compound_stmt in single_input is followed by extra _NEWLINE!
// examples/python_parser.py builds its parsers with start='file_input'.
single_input: _NEWLINE | simple_stmt | compound_stmt _NEWLINE
// Blank lines (_NEWLINE) may be freely interleaved with statements.
file_input: (_NEWLINE | stmt)*
// An expression list followed by any number of _NEWLINE tokens.
eval_input: testlist _NEWLINE*

// --- Decorators and function definitions ---
// A decorator line: "@" dotted name, optional call parentheses, then _NEWLINE.
decorator: "@" dotted_name [ "(" [arguments] ")" ] _NEWLINE
decorators: decorator+
decorated: decorators (classdef | funcdef | async_funcdef)

async_funcdef: "async" funcdef
// Function definition with an optional "->" return annotation.
funcdef: "def" NAME "(" parameters? ")" ["->" test] ":" suite

// Ordinary parameters first, then an optional "*" section, then an optional "**name".
parameters: paramvalue ("," paramvalue)* ["," [ starparams | kwparams]]
| starparams
| kwparams
// "*" with an optional name, followed by further parameters and an optional "**name".
starparams: "*" typedparam? ("," paramvalue)* ["," kwparams]
kwparams: "**" typedparam

// A parameter with an optional default value.
?paramvalue: typedparam ["=" test]
// A parameter name with an optional ":" annotation.
?typedparam: NAME [":" test]

// Parameter list without annotations (vfpdef is a bare NAME); used by
// lambdef and lambdef_nocond.
varargslist: (vfpdef ["=" test] ("," vfpdef ["=" test])* ["," [ "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | "**" vfpdef [","]]]
| "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]]
| "**" vfpdef [","])

vfpdef: NAME

// --- Simple statements ---
?stmt: simple_stmt | compound_stmt
// One or more small statements separated by ";" and terminated by _NEWLINE.
?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE
?small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt)
// Annotated assignment, augmented assignment, (possibly chained) plain
// assignment, or a bare expression list.
?expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist)
| ("=" (yield_expr|testlist_star_expr))*)
annassign: ":" test ["=" test]
?testlist_star_expr: (test|star_expr) ("," (test|star_expr))* [","]
!augassign: ("+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=")
// For normal and annotated assignments, additional restrictions enforced by the interpreter
del_stmt: "del" exprlist
pass_stmt: "pass"
?flow_stmt: break_stmt | continue_stmt | return_stmt | raise_stmt | yield_stmt
break_stmt: "break"
continue_stmt: "continue"
return_stmt: "return" [testlist]
yield_stmt: yield_expr
// "raise", optionally with an exception and an optional "from" cause.
raise_stmt: "raise" [test ["from" test]]
import_stmt: import_name | import_from
import_name: "import" dotted_as_names
// "from" imports. Three consecutive dots can reach the parser as the single
// "..." token (the same literal used for the Ellipsis atom), so `dots` must
// accept any mix of "." and "..." for relative imports such as
// "from ... import x" or "from ....pkg import y".
import_from: "from" (dots? dotted_name | dots) "import" ("*" | "(" import_as_names ")" | import_as_names)
!dots: ("." | "...")+
// Helpers for import statements: names with an optional "as" alias.
import_as_name: NAME ["as" NAME]
dotted_as_name: dotted_name ["as" NAME]
// A trailing comma is allowed here but not in dotted_as_names.
import_as_names: import_as_name ("," import_as_name)* [","]
dotted_as_names: dotted_as_name ("," dotted_as_name)*
dotted_name: NAME ("." NAME)*
global_stmt: "global" NAME ("," NAME)*
nonlocal_stmt: "nonlocal" NAME ("," NAME)*
assert_stmt: "assert" test ["," test]

// --- Compound statements ---
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt
// "async" may prefix a function definition, a with statement or a for loop.
async_stmt: "async" (funcdef | with_stmt | for_stmt)
if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite]
while_stmt: "while" test ":" suite ["else" ":" suite]
for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite]
// Either one or more except clauses (with optional else/finally), or a lone finally.
try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite))
with_stmt: "with" with_item ("," with_item)* ":" suite
with_item: test ["as" expr]
// NB compile.c makes sure that the default except clause is last
except_clause: "except" [test ["as" NAME]]
// A suite is a simple statement on the same line, or an indented block
// delimited by _INDENT / _DEDENT.
suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT

// --- Expressions ---
// The rules below chain from the loosest-binding construct (conditional
// expression / lambda) down to atoms; each level delegates to the next.
?test: or_test ["if" or_test "else" test] | lambdef
// Variant without the conditional-expression alternative; used by comp_if.
?test_nocond: or_test | lambdef_nocond
lambdef: "lambda" [varargslist] ":" test
lambdef_nocond: "lambda" [varargslist] ":" test_nocond
?or_test: and_test ("or" and_test)*
?and_test: not_test ("and" not_test)*
?not_test: "not" not_test -> not
| comparison
?comparison: expr (_comp_op expr)*
star_expr: "*" expr
?expr: xor_expr ("|" xor_expr)*
?xor_expr: and_expr ("^" and_expr)*
?and_expr: shift_expr ("&" shift_expr)*
?shift_expr: arith_expr (_shift_op arith_expr)*
?arith_expr: term (_add_op term)*
?term: factor (_mul_op factor)*
?factor: _factor_op factor | power

// Operator groups referenced by the expression rules above.
!_factor_op: "+"|"-"|"~"
!_add_op: "+"|"-"
!_shift_op: "<<"|">>"
!_mul_op: "*"|"@"|"/"|"%"|"//"
// <> isn't actually a valid comparison operator in Python. It's here for the
// sake of a __future__ import described in PEP 401 (which really works :-)
!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not"

?power: await_expr ["**" factor]
// An optional AWAIT token in front of an atom expression.
?await_expr: AWAIT? atom_expr
AWAIT: "await"

// atom_expr is left-recursive: calls, subscripts and attribute accesses are
// applied one after another to an atom.
?atom_expr: atom_expr "(" [arguments] ")" -> funccall
| atom_expr "[" subscriptlist "]" -> getitem
| atom_expr "." NAME -> getattr
| atom

// Parenthesised forms, list/dict/set displays, names, numbers, one or more
// adjacent string literals, the "..." literal, and the None/True/False constants.
?atom: "(" [yield_expr|testlist_comp] ")"
| "[" [testlist_comp] "]"
| "{" [dictorsetmaker] "}"
| NAME -> var
| number | string+ | "..."
| "None" -> const_none
| "True" -> const_true
| "False" -> const_false

// Either a comprehension (first item followed by comp_for) or a comma-separated list.
testlist_comp: (test|star_expr) ( comp_for | ("," (test|star_expr))* [","] )
subscriptlist: subscript ("," subscript)* [","]
// A plain index or a slice.
subscript: test | [test] ":" [test] [sliceop]
sliceop: ":" [test]
exprlist: (expr|star_expr) ("," (expr|star_expr))* [","]
testlist: test ("," test)* [","]
// First alternative: dict display / dict comprehension (with "**" unpacking);
// second: set display / set comprehension (with "*" unpacking).
dictorsetmaker: ( ((test ":" test | "**" expr) (comp_for | ("," (test ":" test | "**" expr))* [","])) | ((test | star_expr) (comp_for | ("," (test | star_expr))* [","])) )

// Class definition with an optional (possibly empty) parenthesised argument list.
classdef: "class" NAME ["(" [arguments] ")"] ":" suite

// Call arguments: ordinary arguments, then an optional "*" section, then an
// optional "**" argument; or a single generator expression (test comp_for).
arguments: argvalue ("," argvalue)* ["," [ starargs | kwargs]]
| starargs
| kwargs
| test comp_for

// A "*" argument may be followed by further arguments and an optional "**" argument.
starargs: "*" test ("," argvalue)* ["," kwargs]
kwargs: "**" test

// A positional argument, or "keyword=value" (the keyword is parsed as a test).
?argvalue: test ["=" test]



// Comprehension clauses.
// NOTE(review): async_for is only reachable through comp_iter, while
// testlist_comp, dictorsetmaker and arguments all start with comp_for, so
// "async for" is not accepted as the first clause -- confirm whether intended.
comp_iter: comp_for | comp_if | async_for
async_for: "async" "for" exprlist "in" or_test [comp_iter]
comp_for: "for" exprlist "in" or_test [comp_iter]
comp_if: "if" test_nocond [comp_iter]

// not used in grammar, but may appear in "node" passed from Parser to Compiler
encoding_decl: NAME

yield_expr: "yield" [yield_arg]
// Either "yield from <expr>" or "yield <expression list>".
yield_arg: "from" test | testlist


// NOTE(review): there is no terminal for binary literals (0b...) among the
// number alternatives -- confirm whether that omission is intended.
number: DEC_NUMBER | HEX_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER
string: STRING | LONG_STRING
// Tokens

NAME: /[a-zA-Z_]\w*/
COMMENT: /\#[^\n]*/
// A run of line breaks (each with the following line's leading tabs/spaces)
// and/or comments, collapsed into a single token. ${COMMENT} refers to the
// COMMENT pattern defined above.
_NEWLINE: /(\r?\n[\t ]*|${COMMENT})+/

%ignore /[\t \f]+/ // WS
%ignore /\\\\[\t \f]*\r?\n/ // LINE_CONT
%ignore COMMENT



// The two commented-out definitions below are earlier spellings of the same
// terminals, without the (?i) flag and with different backslash escaping.
// STRING : /[ub]?r?("(?!"").*?(?<!\\)(\\\\)*?"|'(?!'').*?(?<!\\)(\\\\)*?')/
// LONG_STRING: /(?s)[ub]?r?(""".*?(?<!\\)(\\\\)*?"""|'''.*?(?<!\\)(\\\\)*?''')/
// Single-line and triple-quoted strings; both are case-insensitive ((?i)) and
// allow an optional u/b prefix followed by an optional r. LONG_STRING adds
// (?s) so that "." also matches line breaks.
STRING : /(?i)[ub]?r?("(?!"").*?(?<!\\\\)(\\\\\\\\)*?"|'(?!'').*?(?<!\\\\)(\\\\\\\\)*?')/
LONG_STRING: /(?i)(?s)[ub]?r?(""".*?(?<!\\\\)(\\\\\\\\)*?"""|'''.*?(?<!\\\\)(\\\\\\\\)*?''')/

// Integer literals with an optional "l" suffix (case-insensitive).
// DEC_NUMBER cannot start with 0; a bare "0" is matched by OCT_NUMBER.
DEC_NUMBER: /(?i)[1-9]\d*l?/
HEX_NUMBER: /(?i)0x[\da-f]*l?/
OCT_NUMBER: /(?i)0o?[0-7]*l?/
// Floats: digits with a decimal point and optional exponent, or digits with
// a mandatory exponent.
FLOAT_NUMBER: /(?i)((\d+\.\d*|\.\d+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/
// An integer or float immediately followed by "j" (case-insensitive).
IMAG_NUMBER: /(?i)\d+j|${FLOAT_NUMBER}j/

// These names match DEDENT_type / INDENT_type of PythonIndenter in
// examples/python_parser.py. Presumably the literal patterns are placeholders
// and the tokens are produced by the indenter rather than lexed -- TODO confirm.
_DEDENT: "<DEDENT>"
_INDENT: "<INDENT>"


+ 85
- 0
examples/python_parser.py View File

@@ -0,0 +1,85 @@
#
# This example demonstrates usage of the included Python grammars
#

import sys
import os, os.path
from io import open
import glob, time

from lark import Lark
from lark.indenter import Indenter

# Directory containing this script; the grammar file names below are built
# relative to it.
# NOTE(review): __path__ is a dunder name with a special meaning for packages;
# a plain private name might be safer here -- confirm before renaming.
__path__ = os.path.dirname(__file__)

class PythonIndenter(Indenter):
    """Indenter configuration used as the ``postlex`` step of the Python parsers.

    Only class attributes are defined here; all behaviour is inherited from
    ``lark.indenter.Indenter``.
    """

    # Name of the newline token type (the _NEWLINE terminal of the grammars).
    NL_type = '_NEWLINE'
    # Token types of opening / closing brackets.  Presumably indentation is
    # not tracked while a bracket is open -- TODO confirm against Indenter.
    OPEN_PAREN_types = ['__LPAR', '__LSQB', '__LBRACE']
    CLOSE_PAREN_types = ['__RPAR', '__RSQB', '__RBRACE']
    # Token type names for indentation changes; they match the _INDENT and
    # _DEDENT terminals declared in python2.g / python3.g.
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    # Presumably the number of columns a tab character counts for -- TODO confirm.
    tab_len = 8


# The grammar files live in the same directory as this script.
grammar2_filename = os.path.join(__path__, 'python2.g')
grammar3_filename = os.path.join(__path__, 'python3.g')

# LALR parsers for Python 2 and Python 3 source, both starting at file_input
# and both post-processing the token stream with PythonIndenter.
with open(grammar2_filename) as f:
    python_parser2 = Lark(f, parser='lalr', postlex=PythonIndenter(), start='file_input')
with open(grammar3_filename) as f:
    python_parser3 = Lark(f, parser='lalr', postlex=PythonIndenter(), start='file_input')


# NOTE(review): despite its name, this parser is also built with parser='lalr'
# (plus an explicit lexer='standard'), so test_earley_equals_lalr() currently
# compares two LALR parsers.  Confirm whether parser='earley' was intended.
with open(grammar2_filename) as f:
    python_parser2_earley = Lark(f, parser='lalr', lexer='standard', postlex=PythonIndenter(), start='file_input')

def _read(fn, *args):
    """Return the whole content of file *fn*, decoded as ISO-8859-1.

    Any extra positional arguments are passed to ``open()`` right after the
    file name (e.g. a mode string).  The encoding is fixed; presumably
    ISO-8859-1 was chosen so that arbitrary bytes never raise a decoding
    error -- confirm before changing it.
    """
    with open(fn, *args, encoding='iso-8859-1') as f:
        return f.read()

def _get_lib_path():
    """Return the directory whose ``*.py`` files the test functions parse.

    On Windows (``os.name == 'nt'``) the path is built from ``sys.prefix``:
    ``lib-python/<sys.winver>`` under PyPy, ``Lib`` otherwise.  On other
    platforms it is the first ``sys.path`` entry that ends with
    ``<major>.<minor>`` of the running interpreter.

    Raises:
        IndexError: if no ``sys.path`` entry ends with the version suffix.
    """
    if os.name == 'nt':
        if 'PyPy' in sys.version:
            return os.path.join(sys.prefix, 'lib-python', sys.winver)
        return os.path.join(sys.prefix, 'Lib')

    suffix = '%s.%s' % sys.version_info[:2]
    for entry in sys.path:
        if entry.endswith(suffix):
            return entry
    # Same exception type as indexing an empty list, but with an explanation.
    raise IndexError('no sys.path entry ends with %r' % suffix)

def test_python_lib():
    """Parse every ``*.py`` file directly inside the library directory.

    The directory comes from ``_get_lib_path()``.  Each file name is printed
    before it is parsed; when parsing fails the file name is printed again
    and the exception is re-raised.  Finally the number of files and the
    elapsed wall-clock time are printed.
    """
    path = _get_lib_path()
    # Pick the grammar matching the running interpreter once, not per file.
    parser = python_parser3 if sys.version_info[0] >= 3 else python_parser2

    start = time.time()
    files = glob.glob(path+'/*.py')
    for f in files:
        print(f)
        try:
            # glob already returns paths that include `path`, so `f` is
            # used as-is rather than being joined with `path` again.
            parser.parse(_read(f) + '\n')
        except Exception:
            print('At %s' % f)
            raise

    end = time.time()
    print("test_python_lib (%d files), time: %s secs" % (len(files), end - start))

def test_earley_equals_lalr():
    """Assert that both Python 2 parsers build equal trees for every file.

    Each ``*.py`` file directly inside the directory returned by
    ``_get_lib_path()`` is parsed with ``python_parser2`` and with
    ``python_parser2_earley``, and the two resulting trees are compared.
    """
    path = _get_lib_path()

    files = glob.glob(path+'/*.py')
    for f in files:
        print(f)
        # Read the file once and feed the same text to both parsers; glob
        # already returns paths that include `path`.
        text = _read(f) + '\n'
        tree1 = python_parser2.parse(text)
        tree2 = python_parser2_earley.parse(text)
        assert tree1 == tree2, 'parse trees differ for %s' % f


# When run as a script, parse the library files; the parser-comparison check
# is left disabled.
if __name__ == '__main__':
    test_python_lib()
    # test_earley_equals_lalr()

Loading…
Cancel
Save