| @@ -6,7 +6,7 @@ jobs: | |||
| runs-on: ubuntu-latest | |||
| strategy: | |||
| matrix: | |||
| python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9.0-rc - 3.9, pypy2, pypy3] | |||
| python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10.0-rc - 3.10, pypy2, pypy3] | |||
| steps: | |||
| - uses: actions/checkout@v2 | |||
| @@ -1,15 +0,0 @@ | |||
| dist: xenial | |||
| language: python | |||
| python: | |||
| - "2.7" | |||
| - "3.4" | |||
| - "3.5" | |||
| - "3.6" | |||
| - "3.7" | |||
| - "3.8" | |||
| - "3.9-dev" | |||
| - "pypy2.7-6.0" | |||
| - "pypy3.5-6.0" | |||
| install: pip install tox-travis | |||
| script: | |||
| - tox | |||
| @@ -26,7 +26,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h | |||
| - [Documentation @readthedocs](https://lark-parser.readthedocs.io/) | |||
| - [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf) | |||
| - [Online IDE (very basic)](https://lark-parser.github.io/lark/ide/app.html) | |||
| - [Online IDE](https://lark-parser.github.io/ide) | |||
| - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. | |||
| - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) | |||
| - [Gitter chat](https://gitter.im/lark-parser/Lobby) | |||
| @@ -37,7 +37,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h | |||
| Lark has no dependencies. | |||
| [](https://travis-ci.org/lark-parser/lark) | |||
| [](https://github.com/lark-parser/lark/actions/workflows/tests.yml) | |||
| ### Syntax Highlighting | |||
| @@ -51,7 +51,10 @@ Lark provides syntax highlighting for its grammar files (\*.lark): | |||
| ### Clones | |||
| These are implementations of Lark in other languages. They accept Lark grammars, and provide similar utilities. | |||
| - [Lerche (Julia)](https://github.com/jamesrhester/Lerche.jl) - an unofficial clone, written entirely in Julia. | |||
- [Lark.js (Javascript)](https://github.com/lark-parser/lark.js) - a port of the stand-alone LALR(1) parser generator to Javascript.
| ### Hello World | |||
| @@ -143,6 +146,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail | |||
| ### Projects using Lark | |||
| - [Poetry](https://github.com/python-poetry/poetry-core) - A utility for dependency management and packaging | |||
| - [tartiflette](https://github.com/dailymotion/tartiflette) - a GraphQL server by Dailymotion | |||
| - [Hypothesis](https://github.com/HypothesisWorks/hypothesis) - Library for property-based testing | |||
| - [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration | |||
| @@ -66,6 +66,8 @@ UnexpectedInput | |||
| .. autoclass:: lark.exceptions.UnexpectedCharacters | |||
| .. autoclass:: lark.exceptions.UnexpectedEOF | |||
| InteractiveParser | |||
| ----------------- | |||
| @@ -159,14 +159,15 @@ start : (A | B)+ | |||
| A : "a" | "ab" | |||
| B : "b" | |||
| ``` | |||
| We get this behavior: | |||
| We get only one possible derivation, instead of two: | |||
| ```bash | |||
| >>> p = Lark(g, ambiguity="explicit") | |||
| >>> p.parse("ab") | |||
| Tree(start, [Token(A, 'a'), Token(B, 'b')]) | |||
| Tree('start', [Token('A', 'ab')]) | |||
| ``` | |||
| This is happening because Python's regex engine always returns the first matching option. | |||
| This is happening because Python's regex engine always returns the best matching option. There is no way to access the alternatives. | |||
| If you find yourself in this situation, the recommended solution is to use rules instead. | |||
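A minimal sketch of that workaround (illustrative, not taken from the original text): lowering the alternatives from terminals to rules lets Earley keep the ambiguity, so both derivations show up under an `_ambig` node:

```python
from lark import Lark

g = """
start: (a | b)+
a: "a" | "ab"
b: "b"
"""

p = Lark(g, ambiguity="explicit")   # Earley is the default parser
print(p.parse("ab").pretty())       # the tree should now contain an _ambig node with both derivations
```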
| @@ -113,7 +113,7 @@ Resources | |||
| .. _Examples: https://github.com/lark-parser/lark/tree/master/examples | |||
| .. _Third-party examples: https://github.com/ligurio/lark-grammars | |||
| .. _Online IDE: https://lark-parser.github.io/lark/ide/app.html | |||
| .. _Online IDE: https://lark-parser.github.io/ide | |||
| .. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/ | |||
| .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html | |||
| .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf | |||
| @@ -103,7 +103,17 @@ v_args | |||
| .. autofunction:: lark.visitors.v_args | |||
| merge_transformers | |||
| ------------------ | |||
| .. autofunction:: lark.visitors.merge_transformers | |||
| Discard | |||
| ------- | |||
| .. autoclass:: lark.visitors.Discard | |||
| VisitError | |||
| ---------- | |||
| .. autoclass:: lark.exceptions.VisitError | |||
| @@ -15,6 +15,7 @@ from typing import List | |||
| from dataclasses import dataclass | |||
| from lark import Lark, ast_utils, Transformer, v_args | |||
| from lark.tree import Meta | |||
| this_module = sys.modules[__name__] | |||
| @@ -31,7 +32,9 @@ class _Statement(_Ast): | |||
| pass | |||
| @dataclass | |||
| class Value(_Ast): | |||
| class Value(_Ast, ast_utils.WithMeta): | |||
| "Uses WithMeta to include line-number metadata in the meta attribute" | |||
| meta: Meta | |||
| value: object | |||
| @dataclass | |||
| @@ -39,7 +39,7 @@ def name(n): | |||
| """, start='file_input') | |||
| # Remove the 'python3__' prefix that was add to the implicitely imported rules. | |||
| # Remove the 'python3__' prefix that was added to the implicitly imported rules. | |||
| for t in tree.iter_subtrees(): | |||
| t.data = t.data.rsplit('__', 1)[-1] | |||
| @@ -1,8 +1,6 @@ | |||
| // Python 3 grammar for Lark | |||
| // NOTE: Work in progress!!! (XXX TODO) | |||
| // This grammar should parse all python 3.x code successfully, | |||
| // but the resulting parse-tree is still not well-organized. | |||
| // This grammar should parse all python 3.x code successfully. | |||
| // Adapted from: https://docs.python.org/3/reference/grammar.html | |||
| // Adapted by: Erez Shinan | |||
| @@ -21,7 +19,7 @@ decorators: decorator+ | |||
| decorated: decorators (classdef | funcdef | async_funcdef) | |||
| async_funcdef: "async" funcdef | |||
| funcdef: "def" NAME "(" parameters? ")" ["->" test] ":" suite | |||
| funcdef: "def" NAME "(" [parameters] ")" ["->" test] ":" suite | |||
| parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams]] | |||
| | starparams | |||
| @@ -29,25 +27,36 @@ parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams | |||
SLASH: "/" // Otherwise it will completely disappear and be indistinguishable in the result
| starparams: "*" typedparam? ("," paramvalue)* ["," kwparams] | |||
| kwparams: "**" typedparam | |||
| kwparams: "**" typedparam ","? | |||
| ?paramvalue: typedparam ["=" test] | |||
| ?typedparam: NAME [":" test] | |||
| ?paramvalue: typedparam ("=" test)? | |||
| ?typedparam: NAME (":" test)? | |||
| varargslist: (vfpdef ["=" test] ("," vfpdef ["=" test])* ["," [ "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | "**" vfpdef [","]]] | |||
| | "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | |||
| | "**" vfpdef [","]) | |||
| vfpdef: NAME | |||
| lambdef: "lambda" [lambda_params] ":" test | |||
| lambdef_nocond: "lambda" [lambda_params] ":" test_nocond | |||
| lambda_params: lambda_paramvalue ("," lambda_paramvalue)* ["," [lambda_starparams | lambda_kwparams]] | |||
| | lambda_starparams | |||
| | lambda_kwparams | |||
| ?lambda_paramvalue: NAME ("=" test)? | |||
| lambda_starparams: "*" [NAME] ("," lambda_paramvalue)* ["," [lambda_kwparams]] | |||
| lambda_kwparams: "**" NAME ","? | |||
| ?stmt: simple_stmt | compound_stmt | |||
| ?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE | |||
| ?small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) | |||
| ?expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) | |||
| | ("=" (yield_expr|testlist_star_expr))*) | |||
| annassign: ":" test ["=" test] | |||
| ?testlist_star_expr: (test|star_expr) ("," (test|star_expr))* [","] | |||
| !augassign: ("+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=") | |||
| ?small_stmt: (expr_stmt | assign_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) | |||
| expr_stmt: testlist_star_expr | |||
| assign_stmt: annassign | augassign | assign | |||
| annassign: testlist_star_expr ":" test ["=" test] | |||
| assign: testlist_star_expr ("=" (yield_expr|testlist_star_expr))+ | |||
| augassign: testlist_star_expr augassign_op (yield_expr|testlist) | |||
| !augassign_op: "+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=" | |||
| ?testlist_star_expr: test_or_star_expr | |||
| | test_or_star_expr ("," test_or_star_expr)+ ","? -> tuple | |||
| | test_or_star_expr "," -> tuple | |||
| // For normal and annotated assignments, additional restrictions enforced by the interpreter | |||
| del_stmt: "del" exprlist | |||
| pass_stmt: "pass" | |||
| @@ -71,43 +80,52 @@ global_stmt: "global" NAME ("," NAME)* | |||
| nonlocal_stmt: "nonlocal" NAME ("," NAME)* | |||
| assert_stmt: "assert" test ["," test] | |||
| compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt | |||
| ?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt | |||
| async_stmt: "async" (funcdef | with_stmt | for_stmt) | |||
| if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite] | |||
| if_stmt: "if" test ":" suite elifs ["else" ":" suite] | |||
| elifs: elif_* | |||
| elif_: "elif" test ":" suite | |||
| while_stmt: "while" test ":" suite ["else" ":" suite] | |||
| for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] | |||
| try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite)) | |||
| with_stmt: "with" with_item ("," with_item)* ":" suite | |||
| try_stmt: "try" ":" suite except_clauses ["else" ":" suite] [finally] | |||
| | "try" ":" suite finally -> try_finally | |||
| finally: "finally" ":" suite | |||
| except_clauses: except_clause+ | |||
| except_clause: "except" [test ["as" NAME]] ":" suite | |||
| with_stmt: "with" with_items ":" suite | |||
| with_items: with_item ("," with_item)* | |||
| with_item: test ["as" expr] | |||
| // NB compile.c makes sure that the default except clause is last | |||
| except_clause: "except" [test ["as" NAME]] | |||
| suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT | |||
| ?test: or_test ("if" or_test "else" test)? | lambdef | |||
| ?test: or_test ("if" or_test "else" test)? | |||
| | lambdef | |||
| ?test_nocond: or_test | lambdef_nocond | |||
| lambdef: "lambda" [varargslist] ":" test | |||
| lambdef_nocond: "lambda" [varargslist] ":" test_nocond | |||
| ?or_test: and_test ("or" and_test)* | |||
| ?and_test: not_test ("and" not_test)* | |||
| ?not_test: "not" not_test -> not | |||
| ?and_test: not_test_ ("and" not_test_)* | |||
| ?not_test_: "not" not_test_ -> not_test | |||
| | comparison | |||
| ?comparison: expr (_comp_op expr)* | |||
| ?comparison: expr (comp_op expr)* | |||
| star_expr: "*" expr | |||
| ?expr: xor_expr ("|" xor_expr)* | |||
| ?expr: or_expr | |||
| ?or_expr: xor_expr ("|" xor_expr)* | |||
| ?xor_expr: and_expr ("^" and_expr)* | |||
| ?and_expr: shift_expr ("&" shift_expr)* | |||
| ?shift_expr: arith_expr (_shift_op arith_expr)* | |||
| ?arith_expr: term (_add_op term)* | |||
| ?term: factor (_mul_op factor)* | |||
| ?factor: _factor_op factor | power | |||
| ?factor: _unary_op factor | power | |||
| !_factor_op: "+"|"-"|"~" | |||
| !_unary_op: "+"|"-"|"~" | |||
| !_add_op: "+"|"-" | |||
| !_shift_op: "<<"|">>" | |||
| !_mul_op: "*"|"@"|"/"|"%"|"//" | |||
| // <> isn't actually a valid comparison operator in Python. It's here for the | |||
| // sake of a __future__ import described in PEP 401 (which really works :-) | |||
| !_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" | |||
| !comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" | |||
| ?power: await_expr ("**" factor)? | |||
| ?await_expr: AWAIT? atom_expr | |||
| @@ -118,61 +136,75 @@ AWAIT: "await" | |||
| | atom_expr "." NAME -> getattr | |||
| | atom | |||
| ?atom: "(" [yield_expr|tuplelist_comp] ")" -> tuple | |||
| | "[" [testlist_comp] "]" -> list | |||
| | "{" [dict_comp] "}" -> dict | |||
| | "{" set_comp "}" -> set | |||
| ?atom: "(" yield_expr ")" | |||
| | "(" _tuple_inner? ")" -> tuple | |||
| | "(" comprehension{test_or_star_expr} ")" -> tuple_comprehension | |||
| | "[" _testlist_comp? "]" -> list | |||
| | "[" comprehension{test_or_star_expr} "]" -> list_comprehension | |||
| | "{" _dict_exprlist? "}" -> dict | |||
| | "{" comprehension{key_value} "}" -> dict_comprehension | |||
| | "{" _set_exprlist "}" -> set | |||
| | "{" comprehension{test} "}" -> set_comprehension | |||
| | NAME -> var | |||
| | number | string+ | |||
| | number | |||
| | string_concat | |||
| | "(" test ")" | |||
| | "..." -> ellipsis | |||
| | "None" -> const_none | |||
| | "True" -> const_true | |||
| | "False" -> const_false | |||
| ?testlist_comp: test | tuplelist_comp | |||
| tuplelist_comp: (test|star_expr) (comp_for | ("," (test|star_expr))+ [","] | ",") | |||
| ?string_concat: string+ | |||
| _testlist_comp: test | _tuple_inner | |||
| _tuple_inner: test_or_star_expr (("," test_or_star_expr)+ [","] | ",") | |||
| ?test_or_star_expr: test | |||
| | star_expr | |||
| ?subscriptlist: subscript | |||
| | subscript (("," subscript)+ [","] | ",") -> subscript_tuple | |||
| subscript: test | ([test] ":" [test] [sliceop]) -> slice | |||
| ?subscript: test | ([test] ":" [test] [sliceop]) -> slice | |||
| sliceop: ":" [test] | |||
| exprlist: (expr|star_expr) | |||
| | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") -> exprlist_tuple | |||
| testlist: test | testlist_tuple | |||
| ?exprlist: (expr|star_expr) | |||
| | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") | |||
| ?testlist: test | testlist_tuple | |||
| testlist_tuple: test (("," test)+ [","] | ",") | |||
| dict_comp: key_value comp_for | |||
| | (key_value | "**" expr) ("," (key_value | "**" expr))* [","] | |||
| _dict_exprlist: (key_value | "**" expr) ("," (key_value | "**" expr))* [","] | |||
| key_value: test ":" test | |||
| set_comp: test comp_for | |||
| | (test|star_expr) ("," (test | star_expr))* [","] | |||
| _set_exprlist: test_or_star_expr ("," test_or_star_expr)* [","] | |||
| classdef: "class" NAME ["(" [arguments] ")"] ":" suite | |||
| arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])? | |||
| | starargs | |||
| | kwargs | |||
| | test comp_for | |||
| | comprehension{test} | |||
| starargs: "*" test ("," "*" test)* ("," argvalue)* ["," kwargs] | |||
| starargs: stararg ("," stararg)* ("," argvalue)* ["," kwargs] | |||
| stararg: "*" test | |||
| kwargs: "**" test | |||
| ?argvalue: test ("=" test)? | |||
| comp_iter: comp_for | comp_if | async_for | |||
| async_for: "async" "for" exprlist "in" or_test [comp_iter] | |||
| comp_for: "for" exprlist "in" or_test [comp_iter] | |||
| comp_if: "if" test_nocond [comp_iter] | |||
| comprehension{comp_result}: comp_result comp_fors [comp_if] | |||
| comp_fors: comp_for+ | |||
| comp_for: [ASYNC] "for" exprlist "in" or_test | |||
| ASYNC: "async" | |||
| ?comp_if: "if" test_nocond | |||
| // not used in grammar, but may appear in "node" passed from Parser to Compiler | |||
| encoding_decl: NAME | |||
| yield_expr: "yield" [yield_arg] | |||
| yield_arg: "from" test | testlist | |||
| yield_expr: "yield" [testlist] | |||
| | "yield" "from" test -> yield_from | |||
| number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER | |||
| string: STRING | LONG_STRING | |||
| @@ -181,6 +213,7 @@ string: STRING | LONG_STRING | |||
| %import python (NAME, COMMENT, STRING, LONG_STRING) | |||
| %import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER) | |||
| // Other terminals | |||
| _NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+ | |||
| @@ -1,81 +0,0 @@ | |||
| """ | |||
| Compile Python to Bytecode | |||
| ========================== | |||
| A toy example that compiles Python directly to bytecode, without generating an AST. | |||
| It currently only works for very very simple Python code. | |||
| It requires the 'bytecode' library. You can get it using | |||
| :: | |||
| $ pip install bytecode | |||
| """ | |||
| from lark import Lark, Transformer, v_args | |||
| from lark.indenter import Indenter | |||
| from bytecode import Instr, Bytecode | |||
| class PythonIndenter(Indenter): | |||
| NL_type = '_NEWLINE' | |||
| OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE'] | |||
| CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE'] | |||
| INDENT_type = '_INDENT' | |||
| DEDENT_type = '_DEDENT' | |||
| tab_len = 8 | |||
| @v_args(inline=True) | |||
| class Compile(Transformer): | |||
| def number(self, n): | |||
| return [Instr('LOAD_CONST', int(n))] | |||
| def string(self, s): | |||
| return [Instr('LOAD_CONST', s[1:-1])] | |||
| def var(self, n): | |||
| return [Instr('LOAD_NAME', n)] | |||
| def arith_expr(self, a, op, b): | |||
| # TODO support chain arithmetic | |||
| assert op == '+' | |||
| return a + b + [Instr('BINARY_ADD')] | |||
| def arguments(self, args): | |||
| return args | |||
| def funccall(self, name, args): | |||
| return name + args + [Instr('CALL_FUNCTION', 1)] | |||
| @v_args(inline=False) | |||
| def file_input(self, stmts): | |||
| return sum(stmts, []) + [Instr("RETURN_VALUE")] | |||
| def expr_stmt(self, lval, rval): | |||
| # TODO more complicated than that | |||
| name ,= lval | |||
| assert name.name == 'LOAD_NAME' # XXX avoid with another layer of abstraction | |||
| return rval + [Instr("STORE_NAME", name.arg)] | |||
| def __default__(self, *args): | |||
| assert False, args | |||
| python_parser3 = Lark.open('python3.lark', rel_to=__file__, start='file_input', | |||
| parser='lalr', postlex=PythonIndenter(), | |||
| transformer=Compile(), propagate_positions=False) | |||
| def compile_python(s): | |||
| insts = python_parser3.parse(s+"\n") | |||
| return Bytecode(insts).to_code() | |||
| code = compile_python(""" | |||
| a = 3 | |||
| b = 5 | |||
| print("Hello World!") | |||
| print(a+(b+2)) | |||
| print((a+b)+2) | |||
| """) | |||
| exec(code) | |||
| # -- Output -- | |||
| # Hello World! | |||
| # 10 | |||
| # 10 | |||
| @@ -0,0 +1,10 @@ | |||
| Grammar Composition | |||
| =================== | |||
| This example shows how to do grammar composition in Lark, by creating a new | |||
| file format that allows both CSV and JSON to co-exist. | |||
| We show how, by using namespaces, Lark grammars and their transformers can be fully reused - | |||
they don't need to care whether their grammar is used directly or imported, or who is doing the importing.
| See [``main.py``](main.py) for more details. | |||
| @@ -0,0 +1,6 @@ | |||
| {"header": ["this", "is", "json", 1111]} | |||
| # file lines author | |||
| data.json 12 Robin | |||
| data.csv 30 erezsh | |||
| compiler.py 123123 Megalng | |||
| {"footer": "done"} | |||
| @@ -0,0 +1,14 @@ | |||
| start: header _NL row+ | |||
| header: "#" " "? (WORD _SEPARATOR?)+ | |||
| row: (_anything _SEPARATOR?)+ _NL | |||
| _anything: INT | WORD | NON_SEPARATOR_STRING | FLOAT | SIGNED_FLOAT | |||
NON_SEPARATOR_STRING: /[a-zA-Z.;\\\/]+/
| _SEPARATOR: /[ ]+/ | |||
| | "\t" | |||
| | "," | |||
| %import common.NEWLINE -> _NL | |||
| %import common.WORD | |||
| %import common.INT | |||
| %import common.FLOAT | |||
| %import common.SIGNED_FLOAT | |||
| @@ -0,0 +1,26 @@ | |||
| "Transformer for evaluating csv.lark" | |||
| from lark import Transformer | |||
| class CsvTreeToPandasDict(Transformer): | |||
| INT = int | |||
| FLOAT = float | |||
| SIGNED_FLOAT = float | |||
| WORD = str | |||
| NON_SEPARATOR_STRING = str | |||
| def row(self, children): | |||
| return children | |||
| def start(self, children): | |||
| data = {} | |||
| header = children[0].children | |||
| for heading in header: | |||
| data[heading] = [] | |||
| for row in children[1:]: | |||
| for i, element in enumerate(row): | |||
| data[header[i]].append(element) | |||
| return data | |||
| @@ -0,0 +1,17 @@ | |||
| "Transformer for evaluating json.lark" | |||
| from lark import Transformer, v_args | |||
| class JsonTreeToJson(Transformer): | |||
| @v_args(inline=True) | |||
| def string(self, s): | |||
| return s[1:-1].replace('\\"', '"') | |||
| array = list | |||
| pair = tuple | |||
| object = dict | |||
| number = v_args(inline=True)(float) | |||
| null = lambda self, _: None | |||
| true = lambda self, _: True | |||
| false = lambda self, _: False | |||
| @@ -0,0 +1,19 @@ | |||
| ?start: value | |||
| ?value: object | |||
| | array | |||
| | string | |||
| | SIGNED_NUMBER -> number | |||
| | "true" -> true | |||
| | "false" -> false | |||
| | "null" -> null | |||
| array : "[" _WS? [value ("," _WS? value)*] "]" | |||
| object : "{" _WS? [pair ("," _WS? pair)*] "}" | |||
| pair : string ":" _WS value | |||
| string : ESCAPED_STRING | |||
| %import common.ESCAPED_STRING | |||
| %import common.SIGNED_NUMBER | |||
| %import common.WS -> _WS | |||
| @@ -0,0 +1,51 @@ | |||
| """ | |||
| Grammar Composition | |||
| =================== | |||
| This example shows how to do grammar composition in Lark, by creating a new | |||
| file format that allows both CSV and JSON to co-exist. | |||
| 1) We define ``storage.lark``, which imports both ``csv.lark`` and ``json.lark``, | |||
| and allows them to be used one after the other. | |||
In the generated tree, each imported rule/terminal is automatically prefixed (with ``json__`` or ``csv__``),
| which creates an implicit namespace and allows them to coexist without collisions. | |||
| 2) We merge their respective transformers (unaware of each other) into a new base transformer. | |||
| The resulting transformer can evaluate both JSON and CSV in the parse tree. | |||
| The methods of each transformer are renamed into their appropriate namespace, using the given prefix. | |||
This approach allows full re-use: the transformers don't need to care whether their grammar is used directly,
or imported, or who is doing the importing.
| """ | |||
| from pathlib import Path | |||
| from lark import Lark | |||
| from json import dumps | |||
| from lark.visitors import Transformer, merge_transformers | |||
| from eval_csv import CsvTreeToPandasDict | |||
| from eval_json import JsonTreeToJson | |||
| __dir__ = Path(__file__).parent | |||
| class Storage(Transformer): | |||
| def start(self, children): | |||
| return children | |||
| storage_transformer = merge_transformers(Storage(), csv=CsvTreeToPandasDict(), json=JsonTreeToJson()) | |||
| parser = Lark.open("storage.lark", rel_to=__file__) | |||
| def main(): | |||
| json_tree = parser.parse(dumps({"test": "a", "dict": { "list": [1, 1.2] }})) | |||
| res = storage_transformer.transform(json_tree) | |||
| print("Just JSON: ", res) | |||
| csv_json_tree = parser.parse(open(__dir__ / 'combined_csv_and_json.txt').read()) | |||
| res = storage_transformer.transform(csv_json_tree) | |||
| print("JSON + CSV: ", dumps(res, indent=2)) | |||
| if __name__ == "__main__": | |||
| main() | |||
| @@ -0,0 +1,9 @@ | |||
| start: (csv__start | json__start _NL?)+ | |||
| // Renaming of the import variables is required, as they receive the namespace of this file. | |||
| // See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565 | |||
| %import .csv.start -> csv__start | |||
| %import .json.start -> json__start | |||
| %import .csv._NL -> _NL | |||
| @@ -10,7 +10,9 @@ Standalone Parser | |||
| import sys | |||
| from json_parser import Lark_StandAlone, Transformer, inline_args | |||
| from json_parser import Lark_StandAlone, Transformer, v_args | |||
| inline_args = v_args(inline=True) | |||
| class TreeToJson(Transformer): | |||
| @inline_args | |||
| @@ -40,6 +40,9 @@ class Tree: | |||
| def expand_kids_by_index(self, *indices: int) -> None: | |||
| ... | |||
| def expand_kids_by_data(self, *data_values: str) -> bool: | |||
| ... | |||
| def scan_values(self, pred: Callable[[Union[str, Tree]], bool]) -> Iterator[str]: | |||
| ... | |||
| @@ -7,4 +7,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, | |||
| from .lexer import Token | |||
| from .lark import Lark | |||
| __version__ = "0.11.4" | |||
| __version__ = "0.12.0" | |||
| @@ -19,15 +19,17 @@ class AsList(object): | |||
Subclasses will be instantiated with the parse results as a single list, instead of as arguments.
| """ | |||
| def camel_to_snake(name): | |||
| return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower() | |||
| class WithMeta(object): | |||
| """Abstract class | |||
| def _call(func, _data, children, _meta): | |||
| return func(*children) | |||
Subclasses will be instantiated with the Meta instance of the tree. (see ``v_args`` for more detail)
| """ | |||
| pass | |||
| inline = v_args(wrapper=_call) | |||
| def camel_to_snake(name): | |||
| return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower() | |||
| def create_transformer(ast_module, transformer=None): | |||
| def create_transformer(ast_module, transformer=None, decorator_factory=v_args): | |||
| """Collects `Ast` subclasses from the given module, and creates a Lark transformer that builds the AST. | |||
| For each class, we create a corresponding rule in the transformer, with a matching name. | |||
| @@ -36,17 +38,18 @@ def create_transformer(ast_module, transformer=None): | |||
| Classes starting with an underscore (`_`) will be skipped. | |||
| Parameters: | |||
| ast_module - A Python module containing all the subclasses of `ast_utils.Ast` | |||
| transformer (Optional[Transformer]) - An initial transformer. Its attributes may be overwritten. | |||
| ast_module: A Python module containing all the subclasses of ``ast_utils.Ast`` | |||
| transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten. | |||
decorator_factory (Callable): An optional callable accepting two booleans, ``inline`` and ``meta``,
| and returning a decorator for the methods of ``transformer``. (default: ``v_args``). | |||
| """ | |||
| t = transformer or Transformer() | |||
| for name, obj in inspect.getmembers(ast_module): | |||
| if not name.startswith('_') and inspect.isclass(obj): | |||
| if issubclass(obj, Ast): | |||
| if not issubclass(obj, AsList): | |||
| obj = inline(obj).__get__(t) | |||
| wrapper = decorator_factory(inline=not issubclass(obj, AsList), meta=issubclass(obj, WithMeta)) | |||
| obj = wrapper(obj).__get__(t) | |||
| setattr(t, camel_to_snake(name), obj) | |||
| return t | |||
| @@ -36,8 +36,9 @@ class UnexpectedInput(LarkError): | |||
| Used as a base class for the following exceptions: | |||
| - ``UnexpectedToken``: The parser received an unexpected token | |||
| - ``UnexpectedCharacters``: The lexer encountered an unexpected string | |||
| - ``UnexpectedToken``: The parser received an unexpected token | |||
| - ``UnexpectedEOF``: The parser expected a token, but the input ended | |||
| After catching one of these exceptions, you may call the following helper methods to create a nicer error message. | |||
| """ | |||
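# Illustrative sketch (an assumption, not part of this diff): catching the base class
# and using the documented helpers to build a friendlier error message.
#
#     try:
#         parser.parse(text)
#     except UnexpectedInput as u:
#         print(u.get_context(text))   # prints the slice of input around the error position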
| @@ -128,6 +129,9 @@ class UnexpectedInput(LarkError): | |||
| class UnexpectedEOF(ParseError, UnexpectedInput): | |||
| """An exception that is raised by the parser, when the input ends while it still expects a token. | |||
| """ | |||
| def __init__(self, expected, state=None, terminals_by_name=None): | |||
| super(UnexpectedEOF, self).__init__() | |||
| @@ -148,6 +152,10 @@ class UnexpectedEOF(ParseError, UnexpectedInput): | |||
| class UnexpectedCharacters(LexError, UnexpectedInput): | |||
| """An exception that is raised by the lexer, when it cannot match the next | |||
| string of characters to any of its terminals. | |||
| """ | |||
| def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, | |||
| terminals_by_name=None, considered_rules=None): | |||
| super(UnexpectedCharacters, self).__init__() | |||
| @@ -185,10 +193,15 @@ class UnexpectedToken(ParseError, UnexpectedInput): | |||
| """An exception that is raised by the parser, when the token it received | |||
| doesn't match any valid step forward. | |||
| The parser provides an interactive instance through `interactive_parser`, | |||
which is initialized to the point of failure, and can be used for debugging and error handling.
| Parameters: | |||
| token: The mismatched token | |||
| expected: The set of expected tokens | |||
| considered_rules: Which rules were considered, to deduce the expected tokens | |||
| state: A value representing the parser state. Do not rely on its value or type. | |||
interactive_parser: An instance of ``InteractiveParser`` that is initialized to the point of failure,
| and can be used for debugging and error handling. | |||
| see: ``InteractiveParser``. | |||
| Note: These parameters are available as attributes of the instance. | |||
| """ | |||
| def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): | |||
| @@ -197,7 +210,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): | |||
| # TODO considered_rules and expected can be figured out using state | |||
| self.line = getattr(token, 'line', '?') | |||
| self.column = getattr(token, 'column', '?') | |||
| self.pos_in_stream = getattr(token, 'pos_in_stream', None) | |||
| self.pos_in_stream = getattr(token, 'start_pos', None) | |||
| self.state = state | |||
| self.token = token | |||
| @@ -234,14 +247,20 @@ class VisitError(LarkError): | |||
| """VisitError is raised when visitors are interrupted by an exception | |||
| It provides the following attributes for inspection: | |||
| - obj: the tree node or token it was processing when the exception was raised | |||
- orig_exc: the exception that caused it to fail
| Parameters: | |||
| rule: the name of the visit rule that failed | |||
| obj: the tree-node or token that was being processed | |||
orig_exc: the exception that caused it to fail
| Note: These parameters are available as attributes | |||
| """ | |||
| def __init__(self, rule, obj, orig_exc): | |||
| message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) | |||
| super(VisitError, self).__init__(message) | |||
| self.rule = rule | |||
| self.obj = obj | |||
| self.orig_exc = orig_exc | |||
| @@ -102,7 +102,7 @@ class LarkOptions(Serialize): | |||
A list of either paths or loader functions to specify from where grammars are imported
source_path
Override the source from which the grammar was loaded. Useful for relative imports and unconventional grammar loading
| **=== End Options ===** | |||
| **=== End of Options ===** | |||
| """ | |||
| if __doc__: | |||
| __doc__ += OPTIONS_DOC | |||
| @@ -340,7 +340,9 @@ class Lark(Serialize): | |||
| if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS: | |||
| raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS)) | |||
| if self.options.postlex is not None: | |||
| if self.options.parser is None: | |||
| terminals_to_keep = '*' | |||
| elif self.options.postlex is not None: | |||
| terminals_to_keep = set(self.options.postlex.always_accept) | |||
| else: | |||
| terminals_to_keep = set() | |||
| @@ -527,6 +529,8 @@ class Lark(Serialize): | |||
| """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' | |||
| When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. | |||
| :raises UnexpectedCharacters: In case the lexer cannot find a suitable match. | |||
| """ | |||
| if not hasattr(self, 'lexer') or dont_ignore: | |||
| lexer = self._build_lexer(dont_ignore) | |||
| @@ -569,6 +573,10 @@ class Lark(Serialize): | |||
| If a transformer is supplied to ``__init__``, returns whatever is the | |||
| result of the transformation. Otherwise, returns a Tree instance. | |||
:raises UnexpectedInput: On a parse error, one of these sub-exceptions will be raised:
``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``.
For convenience, these sub-exceptions also inherit from ``ParseError`` and ``LexError``.
| """ | |||
| return self.parser.parse(text, start=start, on_error=on_error) | |||
| @@ -150,7 +150,7 @@ class Token(Str): | |||
| @property | |||
| def pos_in_stream(self): | |||
| warn("Attribute Token.pos_in_stream was renamed to Token.start_pos", DeprecationWarning) | |||
| warn("Attribute Token.pos_in_stream was renamed to Token.start_pos", DeprecationWarning, 2) | |||
| return self.start_pos | |||
| def update(self, type_=None, value=None): | |||
| @@ -9,7 +9,7 @@ import pkgutil | |||
| from ast import literal_eval | |||
| from numbers import Integral | |||
| from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique | |||
| from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors | |||
| from .lexer import Token, TerminalDef, PatternStr, PatternRE | |||
| from .parse_tree_builder import ParseTreeBuilder | |||
| @@ -91,6 +91,7 @@ TERMINALS = { | |||
| 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', | |||
| 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, | |||
| '_NL': r'(\r?\n)+\s*', | |||
| '_NL_OR': r'(\r?\n)+\s*\|', | |||
| 'WS': r'[ \t]+', | |||
| 'COMMENT': r'\s*//[^\n]*', | |||
| '_TO': '->', | |||
| @@ -113,9 +114,10 @@ RULES = { | |||
| ''], | |||
| '_template_params': ['RULE', | |||
| '_template_params _COMMA RULE'], | |||
| 'expansions': ['alias', | |||
| 'expansions _OR alias', | |||
| 'expansions _NL _OR alias'], | |||
| 'expansions': ['_expansions'], | |||
| '_expansions': ['alias', | |||
| '_expansions _OR alias', | |||
| '_expansions _NL_OR alias'], | |||
| '?alias': ['expansion _TO RULE', 'expansion'], | |||
| 'expansion': ['_expansion'], | |||
| @@ -175,27 +177,136 @@ RULES = { | |||
| } | |||
| # Value 5 keeps the number of states in the lalr parser somewhat minimal | |||
| # It isn't optimal, but close to it. See PR #949 | |||
| SMALL_FACTOR_THRESHOLD = 5 | |||
# The threshold above which repeats via ~ are split up into different rules
| # 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, | |||
# while not being too overaggressive and unnecessarily creating rules that might create shift/reduce conflicts.
| # (See PR #949) | |||
| REPEAT_BREAK_THRESHOLD = 50 | |||
| @inline_args | |||
| class EBNF_to_BNF(Transformer_InPlace): | |||
| def __init__(self): | |||
| self.new_rules = [] | |||
| self.rules_by_expr = {} | |||
| self.rules_cache = {} | |||
| self.prefix = 'anon' | |||
| self.i = 0 | |||
| self.rule_options = None | |||
| def _add_recurse_rule(self, type_, expr): | |||
| if expr in self.rules_by_expr: | |||
| return self.rules_by_expr[expr] | |||
| new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) | |||
| def _name_rule(self, inner): | |||
| new_name = '__%s_%s_%d' % (self.prefix, inner, self.i) | |||
| self.i += 1 | |||
| t = NonTerminal(new_name) | |||
| tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) | |||
| self.new_rules.append((new_name, tree, self.rule_options)) | |||
| self.rules_by_expr[expr] = t | |||
| return new_name | |||
| def _add_rule(self, key, name, expansions): | |||
| t = NonTerminal(name) | |||
| self.new_rules.append((name, expansions, self.rule_options)) | |||
| self.rules_cache[key] = t | |||
| return t | |||
| def _add_recurse_rule(self, type_, expr): | |||
| try: | |||
| return self.rules_cache[expr] | |||
| except KeyError: | |||
| new_name = self._name_rule(type_) | |||
| t = NonTerminal(new_name) | |||
| tree = ST('expansions', [ | |||
| ST('expansion', [expr]), | |||
| ST('expansion', [t, expr]) | |||
| ]) | |||
| return self._add_rule(expr, new_name, tree) | |||
| def _add_repeat_rule(self, a, b, target, atom): | |||
| """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times. | |||
When called recursively (into target), it repeats atom x(n) times, where:
| x(0) = 1 | |||
| x(n) = a(n) * x(n-1) + b | |||
| Example rule when a=3, b=4: | |||
| new_rule: target target target atom atom atom atom | |||
| """ | |||
| key = (a, b, target, atom) | |||
| try: | |||
| return self.rules_cache[key] | |||
| except KeyError: | |||
| new_name = self._name_rule('repeat_a%d_b%d' % (a, b)) | |||
| tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) | |||
| return self._add_rule(key, new_name, tree) | |||
| def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): | |||
"""Creates a rule that matches atom 0 to (a*n+b)-1 times,
where target matches atom exactly n times, and target_opt matches atom 0 to n-1 times.
| First we generate target * i followed by target_opt, for i from 0 to a-1 | |||
| These match 0 to n*a - 1 times atom | |||
| Then we generate target * a followed by atom * i, for i from 0 to b-1 | |||
| These match n*a to n*a + b-1 times atom | |||
| The created rule will not have any shift/reduce conflicts so that it can be used with lalr | |||
| Example rule when a=3, b=4: | |||
| new_rule: target_opt | |||
| | target target_opt | |||
| | target target target_opt | |||
| | target target target | |||
| | target target target atom | |||
| | target target target atom atom | |||
| | target target target atom atom atom | |||
| """ | |||
| key = (a, b, target, atom, "opt") | |||
| try: | |||
| return self.rules_cache[key] | |||
| except KeyError: | |||
| new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) | |||
| tree = ST('expansions', [ | |||
| ST('expansion', [target]*i + [target_opt]) for i in range(a) | |||
| ] + [ | |||
| ST('expansion', [target]*a + [atom]*i) for i in range(b) | |||
| ]) | |||
| return self._add_rule(key, new_name, tree) | |||
| def _generate_repeats(self, rule, mn, mx): | |||
"""Generates a rule tree that repeats ``rule`` between ``mn`` and ``mx`` times.
| """ | |||
| # For a small number of repeats, we can take the naive approach | |||
| if mx < REPEAT_BREAK_THRESHOLD: | |||
| return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) | |||
| # For large repeat values, we break the repetition into sub-rules. | |||
| # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``. | |||
# We then use small_factors to split mn and diff into values [(a, b), ...]
# These values are used with the help of _add_repeat_rule and _add_repeat_opt_rule
| # to generate a complete rule/expression that matches the corresponding number of repeats | |||
| mn_target = rule | |||
| for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD): | |||
| mn_target = self._add_repeat_rule(a, b, mn_target, rule) | |||
| if mx == mn: | |||
| return mn_target | |||
| diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less | |||
| diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD) | |||
| diff_target = rule # Match rule 1 times | |||
| diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times) | |||
| for a, b in diff_factors[:-1]: | |||
| diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) | |||
| diff_target = self._add_repeat_rule(a, b, diff_target, rule) | |||
| a, b = diff_factors[-1] | |||
| diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) | |||
| return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) | |||
| def expr(self, rule, op, *args): | |||
| if op.value == '?': | |||
| empty = ST('expansion', []) | |||
| @@ -220,7 +331,9 @@ class EBNF_to_BNF(Transformer_InPlace): | |||
| mn, mx = map(int, args) | |||
| if mx < mn or mn < 0: | |||
| raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) | |||
| return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)]) | |||
| return self._generate_repeats(rule, mn, mx) | |||
| assert False, op | |||
| def maybe(self, rule): | |||
| @@ -244,12 +357,8 @@ class SimplifyRule_Visitor(Visitor): | |||
| @staticmethod | |||
| def _flatten(tree): | |||
| while True: | |||
| to_expand = [i for i, child in enumerate(tree.children) | |||
| if isinstance(child, Tree) and child.data == tree.data] | |||
| if not to_expand: | |||
| break | |||
| tree.expand_kids_by_index(*to_expand) | |||
| while tree.expand_kids_by_data(tree.data): | |||
| pass | |||
| def expansion(self, tree): | |||
| # rules_list unpacking | |||
| @@ -487,8 +596,7 @@ def _make_joined_pattern(regexp, flags_set): | |||
| return PatternRE(regexp, flags) | |||
| class TerminalTreeToPattern(Transformer): | |||
| class TerminalTreeToPattern(Transformer_NonRecursive): | |||
| def pattern(self, ps): | |||
| p ,= ps | |||
| return p | |||
| @@ -505,6 +613,10 @@ class TerminalTreeToPattern(Transformer): | |||
| if len(exps) == 1: | |||
| return exps[0] | |||
| # Do a bit of sorting to make sure that the longest option is returned | |||
| # (Python's re module otherwise prefers just 'l' when given (l|ll) and both could match) | |||
| exps.sort(key=lambda x: (-x.max_width, -x.min_width, -len(x.value))) | |||
| pattern = '(?:%s)' % ('|'.join(i.to_regexp() for i in exps)) | |||
| return _make_joined_pattern(pattern, {i.flags for i in exps}) | |||
| @@ -558,8 +670,8 @@ class Grammar: | |||
| def compile(self, start, terminals_to_keep): | |||
| # We change the trees in-place (to support huge grammars) | |||
| # So deepcopy allows calling compile more than once. | |||
| term_defs = deepcopy(list(self.term_defs)) | |||
| rule_defs = [(n,p,nr_deepcopy_tree(t),o) for n,p,t,o in self.rule_defs] | |||
| term_defs = [(n, (nr_deepcopy_tree(t), p)) for n, (t, p) in self.term_defs] | |||
| rule_defs = [(n, p, nr_deepcopy_tree(t), o) for n, p, t, o in self.rule_defs] | |||
| # =================== | |||
| # Compile Terminals | |||
| @@ -632,7 +744,10 @@ class Grammar: | |||
| else: | |||
| exp_options = options | |||
| assert all(isinstance(x, Symbol) for x in expansion), expansion | |||
| for sym in expansion: | |||
| assert isinstance(sym, Symbol) | |||
| if sym.is_term and exp_options and exp_options.keep_all_tokens: | |||
| sym.filter_out = False | |||
| rule = Rule(NonTerminal(name), expansion, i, alias, exp_options) | |||
| compiled_rules.append(rule) | |||
| @@ -666,12 +781,13 @@ class Grammar: | |||
| break | |||
| # Filter out unused terminals | |||
| used_terms = {t.name for r in compiled_rules | |||
| for t in r.expansion | |||
| if isinstance(t, Terminal)} | |||
| terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep) | |||
| if unused: | |||
| logger.debug("Unused terminals: %s", [t.name for t in unused]) | |||
| if terminals_to_keep != '*': | |||
| used_terms = {t.name for r in compiled_rules | |||
| for t in r.expansion | |||
| if isinstance(t, Terminal)} | |||
| terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep) | |||
| if unused: | |||
| logger.debug("Unused terminals: %s", [t.name for t in unused]) | |||
| return terminals, compiled_rules, self.ignore | |||
| @@ -804,7 +920,7 @@ def _get_parser(): | |||
| parser_conf = ParserConf(rules, callback, ['start']) | |||
| lexer_conf.lexer_type = 'standard' | |||
| parser_conf.parser_type = 'lalr' | |||
| _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, {}) | |||
| _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None) | |||
| return _get_parser.cache | |||
| GRAMMAR_ERRORS = [ | |||
| @@ -981,9 +1097,7 @@ class GrammarBuilder: | |||
| # TODO: think about what to do with 'options' | |||
| base = self._definitions[name][1] | |||
| while len(base.children) == 2: | |||
| assert isinstance(base.children[0], Tree) and base.children[0].data == 'expansions', base | |||
| base = base.children[0] | |||
| assert isinstance(base, Tree) and base.data == 'expansions' | |||
| base.children.insert(0, exp) | |||
| def _ignore(self, exp_or_name): | |||
| @@ -1226,6 +1340,12 @@ def verify_used_files(file_hashes): | |||
| return False | |||
| return True | |||
| def list_grammar_imports(grammar, import_paths=[]): | |||
| "Returns a list of paths to the lark grammars imported by the given grammar (recursively)" | |||
| builder = GrammarBuilder(False, import_paths) | |||
| builder.load_grammar(grammar, '<string>') | |||
| return list(builder.used_files.keys()) | |||
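# Hypothetical usage sketch (not part of this diff): listing every grammar pulled in
# by a grammar string, e.g. to know which files a cached parser depends on.
#
#     deps = list_grammar_imports('%import common.WS', [])
#     print(deps)   # a single entry, pointing into the installed `lark` package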
| def load_grammar(grammar, source, import_paths, global_keep_all_tokens): | |||
| builder = GrammarBuilder(global_keep_all_tokens, import_paths) | |||
| builder.load_grammar(grammar, source) | |||
| @@ -204,8 +204,7 @@ class AmbiguousExpander: | |||
| if i in self.to_expand: | |||
| ambiguous.append(i) | |||
| to_expand = [j for j, grandchild in enumerate(child.children) if _is_ambig_tree(grandchild)] | |||
| child.expand_kids_by_index(*to_expand) | |||
| child.expand_kids_by_data('_ambig') | |||
| if not ambiguous: | |||
| return self.node_builder(children) | |||
| @@ -65,7 +65,7 @@ class InteractiveParser(object): | |||
| """Print the output of ``choices()`` in a way that's easier to read.""" | |||
| out = ["Parser choices:"] | |||
| for k, v in self.choices().items(): | |||
| out.append('\t- %s -> %s' % (k, v)) | |||
| out.append('\t- %s -> %r' % (k, v)) | |||
| out.append('stack size: %s' % len(self.parser_state.state_stack)) | |||
| return '\n'.join(out) | |||
| @@ -107,6 +107,17 @@ class Tree(object): | |||
| kid = self.children[i] | |||
| self.children[i:i+1] = kid.children | |||
| def expand_kids_by_data(self, *data_values): | |||
| """Expand (inline) children with any of the given data values. Returns True if anything changed""" | |||
| changed = False | |||
| for i in range(len(self.children)-1, -1, -1): | |||
| child = self.children[i] | |||
| if isinstance(child, Tree) and child.data in data_values: | |||
| self.children[i:i+1] = child.children | |||
| changed = True | |||
| return changed | |||
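# Illustrative usage (an assumption, not from this diff): inlining nested '_ambig'
# nodes so their children hang directly off the parent tree.
#
#     t = Tree('start', [Tree('_ambig', [Token('A', 'a')]), Token('B', 'b')])
#     t.expand_kids_by_data('_ambig')
#     assert t.children == [Token('A', 'a'), Token('B', 'b')]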
| def scan_values(self, pred): | |||
| """Return all values in the tree that evaluate pred(value) as true. | |||
| @@ -187,7 +187,7 @@ def get_regexp_width(expr): | |||
| return 1, sre_constants.MAXREPEAT | |||
| else: | |||
| return 0, sre_constants.MAXREPEAT | |||
| ###} | |||
| @@ -287,8 +287,8 @@ except ImportError: | |||
| atomicwrites = None | |||
| class FS: | |||
| exists = os.path.exists | |||
| exists = staticmethod(os.path.exists) | |||
| @staticmethod | |||
| def open(name, mode="r", **kwargs): | |||
| if atomicwrites and "w" in mode: | |||
| @@ -359,3 +359,29 @@ def _serialize(value, memo): | |||
| return {key:_serialize(elem, memo) for key, elem in value.items()} | |||
| # assert value is None or isinstance(value, (int, float, str, tuple)), value | |||
| return value | |||
| def small_factors(n, max_factor): | |||
| """ | |||
| Splits n up into smaller factors and summands <= max_factor. | |||
| Returns a list of [(a, b), ...] | |||
| so that the following code returns n: | |||
| n = 1 | |||
| for a, b in values: | |||
| n = n * a + b | |||
| Currently, we also keep a + b <= max_factor, but that might change | |||
| """ | |||
| assert n >= 0 | |||
| assert max_factor > 2 | |||
| if n <= max_factor: | |||
| return [(n, 0)] | |||
| for a in range(max_factor, 1, -1): | |||
| r, b = divmod(n, a) | |||
| if a + b <= max_factor: | |||
| return small_factors(r, max_factor) + [(a, b)] | |||
| assert False, "Failed to factorize %s" % n | |||
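# Quick sanity check of the documented invariant (illustrative only, not part of this diff):
#
#     def recompose(pairs):
#         n = 1
#         for a, b in pairs:
#             n = n * a + b
#         return n
#
#     assert recompose(small_factors(60, 5)) == 60
#     assert recompose(small_factors(8191, 5)) == 8191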
| @@ -149,6 +149,59 @@ class Transformer(_Decoratable): | |||
| return token | |||
| def merge_transformers(base_transformer=None, **transformers_to_merge): | |||
| """Merge a collection of transformers into the base_transformer, each into its own 'namespace'. | |||
| When called, it will collect the methods from each transformer, and assign them to base_transformer, | |||
| with their name prefixed with the given keyword, as ``prefix__methodname``. | |||
| This function is especially useful for processing grammars that import other grammars, | |||
thereby creating some of their rules in a 'namespace'. (i.e. with a consistent name prefix).
| In this case, the key for the transformer should match the name of the imported grammar. | |||
| Parameters: | |||
| base_transformer (Transformer, optional): The transformer that all other transformers will be added to. | |||
| **transformers_to_merge: Keyword arguments, in the form of ``name_prefix = transformer``. | |||
| Raises: | |||
| AttributeError: In case of a name collision in the merged methods | |||
| Example: | |||
| :: | |||
| class TBase(Transformer): | |||
| def start(self, children): | |||
| return children[0] + 'bar' | |||
| class TImportedGrammar(Transformer): | |||
| def foo(self, children): | |||
| return "foo" | |||
| composed_transformer = merge_transformers(TBase(), imported=TImportedGrammar()) | |||
| t = Tree('start', [ Tree('imported__foo', []) ]) | |||
| assert composed_transformer.transform(t) == 'foobar' | |||
| """ | |||
| if base_transformer is None: | |||
| base_transformer = Transformer() | |||
| for prefix, transformer in transformers_to_merge.items(): | |||
| for method_name in dir(transformer): | |||
| method = getattr(transformer, method_name) | |||
| if not callable(method): | |||
| continue | |||
| if method_name.startswith("_") or method_name == "transform": | |||
| continue | |||
| prefixed_method = prefix + "__" + method_name | |||
| if hasattr(base_transformer, prefixed_method): | |||
| raise AttributeError("Cannot merge: method '%s' appears more than once" % prefixed_method) | |||
| setattr(base_transformer, prefixed_method, method) | |||
| return base_transformer | |||
| class InlineTransformer(Transformer): # XXX Deprecated | |||
| def _call_userfunc(self, tree, new_children=None): | |||
| # Assumes tree is already transformed | |||
| @@ -5,6 +5,4 @@ zip_safe= | |||
| universal = 1 | |||
| [metadata] | |||
| description-file = README.md | |||
| license_file = LICENSE | |||
| @@ -1,10 +1,10 @@ | |||
| from __future__ import absolute_import | |||
| import sys | |||
| import os | |||
| from unittest import TestCase, main | |||
| from lark import Lark, Token, Tree | |||
| from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors | |||
| from lark import Lark, Token, Tree, ParseError, UnexpectedInput | |||
| from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors, list_grammar_imports | |||
| from lark.load_grammar import FromPackageLoader | |||
| @@ -198,6 +198,77 @@ class TestGrammar(TestCase): | |||
| x = find_grammar_errors(text) | |||
| assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6] | |||
| def test_ranged_repeat_terms(self): | |||
| g = u"""!start: AAA | |||
| AAA: "A"~3 | |||
| """ | |||
| l = Lark(g, parser='lalr') | |||
| self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') | |||
| g = u"""!start: AABB CC | |||
| AABB: "A"~0..2 "B"~2 | |||
| CC: "C"~1..2 | |||
| """ | |||
| l = Lark(g, parser='lalr') | |||
| self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) | |||
| self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) | |||
| self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') | |||
| def test_ranged_repeat_large(self): | |||
| g = u"""!start: "A"~60 | |||
| """ | |||
| l = Lark(g, parser='lalr') | |||
| self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") | |||
| self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) | |||
| self.assertRaises(ParseError, l.parse, u'A' * 59) | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) | |||
| g = u"""!start: "A"~15..100 | |||
| """ | |||
| l = Lark(g, parser='lalr') | |||
| for i in range(0, 110): | |||
| if 15 <= i <= 100: | |||
| self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) | |||
| else: | |||
| self.assertRaises(UnexpectedInput, l.parse, u'A' * i) | |||
| # 8191 is a Mersenne prime | |||
| g = u"""start: "A"~8191 | |||
| """ | |||
| l = Lark(g, parser='lalr') | |||
| self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) | |||
| self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) | |||
| self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) | |||
| def test_large_terminal(self): | |||
| g = "start: NUMBERS\n" | |||
| g += "NUMBERS: " + '|'.join('"%s"' % i for i in range(0, 1000)) | |||
| l = Lark(g, parser='lalr') | |||
| for i in (0, 9, 99, 999): | |||
| self.assertEqual(l.parse(str(i)), Tree('start', [str(i)])) | |||
| for i in (-1, 1000): | |||
| self.assertRaises(UnexpectedInput, l.parse, str(i)) | |||
| def test_list_grammar_imports(self): | |||
| grammar = """ | |||
| %import .test_templates_import (start, sep) | |||
| %override sep{item, delim}: item (delim item)* delim? | |||
| %ignore " " | |||
| """ | |||
| imports = list_grammar_imports(grammar, [os.path.dirname(__file__)]) | |||
| self.assertEqual({os.path.split(i)[-1] for i in imports}, {'test_templates_import.lark', 'templates.lark'}) | |||
| imports = list_grammar_imports('%import common.WS', []) | |||
| assert len(imports) == 1 and imports[0].pkg_name == 'lark' | |||
| if __name__ == '__main__': | |||
| @@ -15,8 +15,8 @@ TEST_PATH = os.path.abspath(os.path.dirname(__file__)) | |||
| NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley') | |||
| BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin') | |||
| if not os.path.exists(NEARLEY_PATH): | |||
| logger.warn("Nearley not installed. Skipping Nearley tests!") | |||
| if not os.path.exists(BUILTIN_PATH): | |||
| logger.warn("Nearley not included. Skipping Nearley tests! (use git submodule to add)") | |||
| raise ImportError("Skipping Nearley tests!") | |||
| import js2py # Ensures that js2py exists, to avoid failing tests | |||
| @@ -2204,27 +2204,7 @@ def _make_parser_test(LEXER, PARSER): | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') | |||
| def test_ranged_repeat_terms(self): | |||
| g = u"""!start: AAA | |||
| AAA: "A"~3 | |||
| """ | |||
| l = _Lark(g) | |||
| self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') | |||
| g = u"""!start: AABB CC | |||
| AABB: "A"~0..2 "B"~2 | |||
| CC: "C"~1..2 | |||
| """ | |||
| l = _Lark(g) | |||
| self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) | |||
| self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) | |||
| self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') | |||
| self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') | |||
| @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX | |||
| def test_priority_vs_embedded(self): | |||
| @@ -3,6 +3,7 @@ | |||
| import json | |||
| import sys | |||
| import unittest | |||
| from itertools import product | |||
| from unittest import TestCase | |||
| from lark import Lark | |||
| @@ -20,8 +21,8 @@ def _remove_ws(s): | |||
| class TestReconstructor(TestCase): | |||
| def assert_reconstruct(self, grammar, code): | |||
| parser = Lark(grammar, parser='lalr', maybe_placeholders=False) | |||
| def assert_reconstruct(self, grammar, code, **options): | |||
| parser = Lark(grammar, parser='lalr', maybe_placeholders=False, **options) | |||
| tree = parser.parse(code) | |||
| new = Reconstructor(parser).reconstruct(tree) | |||
| self.assertEqual(_remove_ws(code), _remove_ws(new)) | |||
| @@ -142,6 +143,17 @@ class TestReconstructor(TestCase): | |||
| new_json = Reconstructor(json_parser).reconstruct(tree) | |||
| self.assertEqual(json.loads(new_json), json.loads(test_json)) | |||
| def test_keep_all_tokens(self): | |||
| g = """ | |||
| start: "a"? _B? c? _d? | |||
| _B: "b" | |||
| c: "c" | |||
| _d: "d" | |||
| """ | |||
| examples = list(map(''.join, product(('', 'a'), ('', 'b'), ('', 'c'), ('', 'd'), ))) | |||
| for code in examples: | |||
| self.assert_reconstruct(g, code, keep_all_tokens=True) | |||
| @unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.") | |||
| def test_switch_grammar_unicode_terminal(self): | |||
| """ | |||
| @@ -9,7 +9,7 @@ import functools | |||
| from lark.tree import Tree | |||
| from lark.lexer import Token | |||
| from lark.visitors import Visitor, Visitor_Recursive, Transformer, Interpreter, visit_children_decor, v_args, Discard, Transformer_InPlace, \ | |||
| Transformer_InPlaceRecursive, Transformer_NonRecursive | |||
| Transformer_InPlaceRecursive, Transformer_NonRecursive, merge_transformers | |||
| class TestTrees(TestCase): | |||
| @@ -233,21 +233,62 @@ class TestTrees(TestCase): | |||
| x = MyTransformer().transform( t ) | |||
| self.assertEqual(x, t2) | |||
| def test_transformer_variants(self): | |||
| tree = Tree('start', [Tree('add', [Token('N', '1'), Token('N', '2')]), Tree('add', [Token('N', '3'), Token('N', '4')])]) | |||
| for base in (Transformer, Transformer_InPlace, Transformer_NonRecursive, Transformer_InPlaceRecursive): | |||
| class T(base): | |||
| def add(self, children): | |||
| return sum(children) | |||
| def N(self, token): | |||
| return int(token) | |||
| copied = copy.deepcopy(tree) | |||
| result = T().transform(copied) | |||
| self.assertEqual(result, Tree('start', [3, 7])) | |||
| def test_merge_transformers(self): | |||
| tree = Tree('start', [ | |||
| Tree('main', [ | |||
| Token("A", '1'), Token("B", '2') | |||
| ]), | |||
| Tree("module__main", [ | |||
| Token("A", "2"), Token("B", "3") | |||
| ]) | |||
| ]) | |||
| class T1(Transformer): | |||
| A = int | |||
| B = int | |||
| main = sum | |||
| start = list | |||
| def module__main(self, children): | |||
| return sum(children) | |||
| class T2(Transformer): | |||
| A = int | |||
| B = int | |||
| main = sum | |||
| start = list | |||
| class T3(Transformer): | |||
| def main(self, children): | |||
| return sum(children) | |||
| class T4(Transformer): | |||
| main = sum | |||
| t1_res = T1().transform(tree) | |||
| composed_res = merge_transformers(T2(), module=T3()).transform(tree) | |||
| self.assertEqual(t1_res, composed_res) | |||
| composed_res2 = merge_transformers(T2(), module=T4()).transform(tree) | |||
| self.assertEqual(t1_res, composed_res2) | |||
| with self.assertRaises(AttributeError): | |||
| merge_transformers(T1(), module=T3()) | |||
| if __name__ == '__main__': | |||
| unittest.main() | |||
| @@ -1,18 +1,7 @@ | |||
| [tox] | |||
| envlist = py27, py34, py35, py36, py37, py38, py39, pypy, pypy3 | |||
| envlist = py27, py34, py35, py36, py37, py38, py39, py310, pypy, pypy3 | |||
| skip_missing_interpreters=true | |||
| [travis] | |||
| 2.7 = py27 | |||
| 3.4 = py34 | |||
| 3.5 = py35 | |||
| 3.6 = py36 | |||
| 3.7 = py37 | |||
| 3.8 = py38 | |||
| 3.9 = py39 | |||
| pypy = pypy | |||
| pypy3 = pypy3 | |||
| [testenv] | |||
| whitelist_externals = git | |||
| deps = | |||