@@ -26,7 +26,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h | |||||
- [Documentation @readthedocs](https://lark-parser.readthedocs.io/) | - [Documentation @readthedocs](https://lark-parser.readthedocs.io/) | ||||
- [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf) | - [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf) | ||||
- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide/app.html) | |||||
- [Online IDE](https://lark-parser.github.io/ide) | |||||
- [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. | - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. | ||||
- Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) | - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) | ||||
- [Gitter chat](https://gitter.im/lark-parser/Lobby) | - [Gitter chat](https://gitter.im/lark-parser/Lobby) | ||||
@@ -66,6 +66,8 @@ UnexpectedInput | |||||
.. autoclass:: lark.exceptions.UnexpectedCharacters | .. autoclass:: lark.exceptions.UnexpectedCharacters | ||||
.. autoclass:: lark.exceptions.UnexpectedEOF | |||||
InteractiveParser | InteractiveParser | ||||
----------------- | ----------------- | ||||
@@ -113,7 +113,7 @@ Resources | |||||
.. _Examples: https://github.com/lark-parser/lark/tree/master/examples | .. _Examples: https://github.com/lark-parser/lark/tree/master/examples | ||||
.. _Third-party examples: https://github.com/ligurio/lark-grammars | .. _Third-party examples: https://github.com/ligurio/lark-grammars | ||||
.. _Online IDE: https://lark-parser.github.io/lark/ide/app.html | |||||
.. _Online IDE: https://lark-parser.github.io/ide | |||||
.. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/ | .. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/ | ||||
.. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html | .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html | ||||
.. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf | .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf | ||||
@@ -107,3 +107,8 @@ Discard | |||||
------- | ------- | ||||
.. autoclass:: lark.visitors.Discard | .. autoclass:: lark.visitors.Discard | ||||
VisitError | |||||
------- | |||||
.. autoclass:: lark.exceptions.VisitError |
@@ -21,7 +21,7 @@ decorators: decorator+ | |||||
decorated: decorators (classdef | funcdef | async_funcdef) | decorated: decorators (classdef | funcdef | async_funcdef) | ||||
async_funcdef: "async" funcdef | async_funcdef: "async" funcdef | ||||
funcdef: "def" NAME "(" parameters? ")" ["->" test] ":" suite | |||||
funcdef: "def" NAME "(" [parameters] ")" ["->" test] ":" suite | |||||
parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams]] | parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams]] | ||||
| starparams | | starparams | ||||
@@ -29,25 +29,36 @@ parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams | |||||
SLASH: "/" // Otherwise the it will completely disappear and it will be undisguisable in the result | SLASH: "/" // Otherwise the it will completely disappear and it will be undisguisable in the result | ||||
starparams: "*" typedparam? ("," paramvalue)* ["," kwparams] | starparams: "*" typedparam? ("," paramvalue)* ["," kwparams] | ||||
kwparams: "**" typedparam | |||||
kwparams: "**" typedparam ","? | |||||
?paramvalue: typedparam ["=" test] | |||||
?typedparam: NAME [":" test] | |||||
?paramvalue: typedparam ("=" test)? | |||||
?typedparam: NAME (":" test)? | |||||
varargslist: (vfpdef ["=" test] ("," vfpdef ["=" test])* ["," [ "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | "**" vfpdef [","]]] | |||||
| "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | |||||
| "**" vfpdef [","]) | |||||
vfpdef: NAME | |||||
lambdef: "lambda" [lambda_params] ":" test | |||||
lambdef_nocond: "lambda" [lambda_params] ":" test_nocond | |||||
lambda_params: lambda_paramvalue ("," lambda_paramvalue)* ["," [lambda_starparams | lambda_kwparams]] | |||||
| lambda_starparams | |||||
| lambda_kwparams | |||||
?lambda_paramvalue: NAME ("=" test)? | |||||
lambda_starparams: "*" [NAME] ("," lambda_paramvalue)* ["," [lambda_kwparams]] | |||||
lambda_kwparams: "**" NAME ","? | |||||
?stmt: simple_stmt | compound_stmt | ?stmt: simple_stmt | compound_stmt | ||||
?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE | ?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE | ||||
?small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) | |||||
?expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) | |||||
| ("=" (yield_expr|testlist_star_expr))*) | |||||
annassign: ":" test ["=" test] | |||||
?testlist_star_expr: (test|star_expr) ("," (test|star_expr))* [","] | |||||
!augassign: ("+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=") | |||||
?small_stmt: (expr_stmt | assign_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) | |||||
expr_stmt: testlist_star_expr | |||||
assign_stmt: annassign | augassign | assign | |||||
annassign: testlist_star_expr ":" test ["=" test] | |||||
assign: testlist_star_expr ("=" (yield_expr|testlist_star_expr))+ | |||||
augassign: testlist_star_expr augassign_op (yield_expr|testlist) | |||||
!augassign_op: "+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=" | |||||
?testlist_star_expr: test_or_star_expr | |||||
| test_or_star_expr ("," test_or_star_expr)+ ","? -> tuple | |||||
| test_or_star_expr "," -> tuple | |||||
// For normal and annotated assignments, additional restrictions enforced by the interpreter | // For normal and annotated assignments, additional restrictions enforced by the interpreter | ||||
del_stmt: "del" exprlist | del_stmt: "del" exprlist | ||||
pass_stmt: "pass" | pass_stmt: "pass" | ||||
@@ -71,43 +82,52 @@ global_stmt: "global" NAME ("," NAME)* | |||||
nonlocal_stmt: "nonlocal" NAME ("," NAME)* | nonlocal_stmt: "nonlocal" NAME ("," NAME)* | ||||
assert_stmt: "assert" test ["," test] | assert_stmt: "assert" test ["," test] | ||||
compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt | |||||
?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt | |||||
async_stmt: "async" (funcdef | with_stmt | for_stmt) | async_stmt: "async" (funcdef | with_stmt | for_stmt) | ||||
if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite] | |||||
if_stmt: "if" test ":" suite elifs ["else" ":" suite] | |||||
elifs: elif_* | |||||
elif_: "elif" test ":" suite | |||||
while_stmt: "while" test ":" suite ["else" ":" suite] | while_stmt: "while" test ":" suite ["else" ":" suite] | ||||
for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] | for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] | ||||
try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite)) | |||||
with_stmt: "with" with_item ("," with_item)* ":" suite | |||||
try_stmt: "try" ":" suite except_clauses ["else" ":" suite] [finally] | |||||
| "try" ":" suite finally -> try_finally | |||||
finally: "finally" ":" suite | |||||
except_clauses: except_clause+ | |||||
except_clause: "except" [test ["as" NAME]] ":" suite | |||||
with_stmt: "with" with_items ":" suite | |||||
with_items: with_item ("," with_item)* | |||||
with_item: test ["as" expr] | with_item: test ["as" expr] | ||||
// NB compile.c makes sure that the default except clause is last | // NB compile.c makes sure that the default except clause is last | ||||
except_clause: "except" [test ["as" NAME]] | |||||
suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT | suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT | ||||
?test: or_test ("if" or_test "else" test)? | lambdef | |||||
?test: or_test ("if" or_test "else" test)? | |||||
| lambdef | |||||
?test_nocond: or_test | lambdef_nocond | ?test_nocond: or_test | lambdef_nocond | ||||
lambdef: "lambda" [varargslist] ":" test | |||||
lambdef_nocond: "lambda" [varargslist] ":" test_nocond | |||||
?or_test: and_test ("or" and_test)* | ?or_test: and_test ("or" and_test)* | ||||
?and_test: not_test ("and" not_test)* | ?and_test: not_test ("and" not_test)* | ||||
?not_test: "not" not_test -> not | |||||
?not_test: "not" not_test -> not_test | |||||
| comparison | | comparison | ||||
?comparison: expr (_comp_op expr)* | |||||
?comparison: expr (comp_op expr)* | |||||
star_expr: "*" expr | star_expr: "*" expr | ||||
?expr: xor_expr ("|" xor_expr)* | |||||
?expr: or_expr | |||||
?or_expr: xor_expr ("|" xor_expr)* | |||||
?xor_expr: and_expr ("^" and_expr)* | ?xor_expr: and_expr ("^" and_expr)* | ||||
?and_expr: shift_expr ("&" shift_expr)* | ?and_expr: shift_expr ("&" shift_expr)* | ||||
?shift_expr: arith_expr (_shift_op arith_expr)* | ?shift_expr: arith_expr (_shift_op arith_expr)* | ||||
?arith_expr: term (_add_op term)* | ?arith_expr: term (_add_op term)* | ||||
?term: factor (_mul_op factor)* | ?term: factor (_mul_op factor)* | ||||
?factor: _factor_op factor | power | |||||
?factor: _unary_op factor | power | |||||
!_factor_op: "+"|"-"|"~" | |||||
!_unary_op: "+"|"-"|"~" | |||||
!_add_op: "+"|"-" | !_add_op: "+"|"-" | ||||
!_shift_op: "<<"|">>" | !_shift_op: "<<"|">>" | ||||
!_mul_op: "*"|"@"|"/"|"%"|"//" | !_mul_op: "*"|"@"|"/"|"%"|"//" | ||||
// <> isn't actually a valid comparison operator in Python. It's here for the | // <> isn't actually a valid comparison operator in Python. It's here for the | ||||
// sake of a __future__ import described in PEP 401 (which really works :-) | // sake of a __future__ import described in PEP 401 (which really works :-) | ||||
!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" | |||||
!comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" | |||||
?power: await_expr ("**" factor)? | ?power: await_expr ("**" factor)? | ||||
?await_expr: AWAIT? atom_expr | ?await_expr: AWAIT? atom_expr | ||||
@@ -118,61 +138,75 @@ AWAIT: "await" | |||||
| atom_expr "." NAME -> getattr | | atom_expr "." NAME -> getattr | ||||
| atom | | atom | ||||
?atom: "(" [yield_expr|tuplelist_comp] ")" -> tuple | |||||
| "[" [testlist_comp] "]" -> list | |||||
| "{" [dict_comp] "}" -> dict | |||||
| "{" set_comp "}" -> set | |||||
?atom: "(" yield_expr ")" | |||||
| "(" _tuple_inner? ")" -> tuple | |||||
| "(" comprehension{test_or_star_expr} ")" -> tuple_comprehension | |||||
| "[" _testlist_comp? "]" -> list | |||||
| "[" comprehension{test_or_star_expr} "]" -> list_comprehension | |||||
| "{" _dict_exprlist? "}" -> dict | |||||
| "{" comprehension{key_value} "}" -> dict_comprehension | |||||
| "{" _set_exprlist "}" -> set | |||||
| "{" comprehension{test} "}" -> set_comprehension | |||||
| NAME -> var | | NAME -> var | ||||
| number | string+ | |||||
| number | |||||
| string_concat | |||||
| "(" test ")" | | "(" test ")" | ||||
| "..." -> ellipsis | | "..." -> ellipsis | ||||
| "None" -> const_none | | "None" -> const_none | ||||
| "True" -> const_true | | "True" -> const_true | ||||
| "False" -> const_false | | "False" -> const_false | ||||
?testlist_comp: test | tuplelist_comp | |||||
tuplelist_comp: (test|star_expr) (comp_for | ("," (test|star_expr))+ [","] | ",") | |||||
?string_concat: string+ | |||||
_testlist_comp: test | _tuple_inner | |||||
_tuple_inner: test_or_star_expr (("," test_or_star_expr)+ [","] | ",") | |||||
?test_or_star_expr: test | |||||
| star_expr | |||||
?subscriptlist: subscript | ?subscriptlist: subscript | ||||
| subscript (("," subscript)+ [","] | ",") -> subscript_tuple | | subscript (("," subscript)+ [","] | ",") -> subscript_tuple | ||||
subscript: test | ([test] ":" [test] [sliceop]) -> slice | |||||
?subscript: test | ([test] ":" [test] [sliceop]) -> slice | |||||
sliceop: ":" [test] | sliceop: ":" [test] | ||||
exprlist: (expr|star_expr) | |||||
| (expr|star_expr) (("," (expr|star_expr))+ [","]|",") -> exprlist_tuple | |||||
testlist: test | testlist_tuple | |||||
?exprlist: (expr|star_expr) | |||||
| (expr|star_expr) (("," (expr|star_expr))+ [","]|",") | |||||
?testlist: test | testlist_tuple | |||||
testlist_tuple: test (("," test)+ [","] | ",") | testlist_tuple: test (("," test)+ [","] | ",") | ||||
dict_comp: key_value comp_for | |||||
| (key_value | "**" expr) ("," (key_value | "**" expr))* [","] | |||||
_dict_exprlist: (key_value | "**" expr) ("," (key_value | "**" expr))* [","] | |||||
key_value: test ":" test | key_value: test ":" test | ||||
set_comp: test comp_for | |||||
| (test|star_expr) ("," (test | star_expr))* [","] | |||||
_set_exprlist: test_or_star_expr ("," test_or_star_expr)* [","] | |||||
classdef: "class" NAME ["(" [arguments] ")"] ":" suite | classdef: "class" NAME ["(" [arguments] ")"] ":" suite | ||||
arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])? | arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])? | ||||
| starargs | | starargs | ||||
| kwargs | | kwargs | ||||
| test comp_for | |||||
| comprehension{test} | |||||
starargs: "*" test ("," "*" test)* ("," argvalue)* ["," kwargs] | |||||
starargs: stararg ("," stararg)* ("," argvalue)* ["," kwargs] | |||||
stararg: "*" test | |||||
kwargs: "**" test | kwargs: "**" test | ||||
?argvalue: test ("=" test)? | ?argvalue: test ("=" test)? | ||||
comp_iter: comp_for | comp_if | async_for | |||||
async_for: "async" "for" exprlist "in" or_test [comp_iter] | |||||
comp_for: "for" exprlist "in" or_test [comp_iter] | |||||
comp_if: "if" test_nocond [comp_iter] | |||||
comprehension{comp_result}: comp_result comp_fors [comp_if] | |||||
comp_fors: comp_for+ | |||||
comp_for: [ASYNC] "for" exprlist "in" or_test | |||||
ASYNC: "async" | |||||
?comp_if: "if" test_nocond | |||||
// not used in grammar, but may appear in "node" passed from Parser to Compiler | // not used in grammar, but may appear in "node" passed from Parser to Compiler | ||||
encoding_decl: NAME | encoding_decl: NAME | ||||
yield_expr: "yield" [yield_arg] | |||||
yield_arg: "from" test | testlist | |||||
yield_expr: "yield" [testlist] | |||||
| "yield" "from" test -> yield_from | |||||
number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER | number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER | ||||
string: STRING | LONG_STRING | string: STRING | LONG_STRING | ||||
@@ -181,6 +215,7 @@ string: STRING | LONG_STRING | |||||
%import python (NAME, COMMENT, STRING, LONG_STRING) | %import python (NAME, COMMENT, STRING, LONG_STRING) | ||||
%import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER) | %import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER) | ||||
// Other terminals | // Other terminals | ||||
_NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+ | _NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+ | ||||
@@ -10,7 +10,9 @@ Standalone Parser | |||||
import sys | import sys | ||||
from json_parser import Lark_StandAlone, Transformer, inline_args | |||||
from json_parser import Lark_StandAlone, Transformer, v_args | |||||
inline_args = v_args(inline=True) | |||||
class TreeToJson(Transformer): | class TreeToJson(Transformer): | ||||
@inline_args | @inline_args | ||||
@@ -38,8 +38,8 @@ def create_transformer(ast_module: types.ModuleType, transformer: Optional[Trans | |||||
Classes starting with an underscore (`_`) will be skipped. | Classes starting with an underscore (`_`) will be skipped. | ||||
Parameters: | Parameters: | ||||
ast_module - A Python module containing all the subclasses of `ast_utils.Ast` | |||||
transformer (Optional[Transformer]) - An initial transformer. Its attributes may be overwritten. | |||||
ast_module: A Python module containing all the subclasses of ``ast_utils.Ast`` | |||||
transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten. | |||||
""" | """ | ||||
t = transformer or Transformer() | t = transformer or Transformer() | ||||
@@ -40,8 +40,9 @@ class UnexpectedInput(LarkError): | |||||
Used as a base class for the following exceptions: | Used as a base class for the following exceptions: | ||||
- ``UnexpectedToken``: The parser received an unexpected token | |||||
- ``UnexpectedCharacters``: The lexer encountered an unexpected string | - ``UnexpectedCharacters``: The lexer encountered an unexpected string | ||||
- ``UnexpectedToken``: The parser received an unexpected token | |||||
- ``UnexpectedEOF``: The parser expected a token, but the input ended | |||||
After catching one of these exceptions, you may call the following helper methods to create a nicer error message. | After catching one of these exceptions, you may call the following helper methods to create a nicer error message. | ||||
""" | """ | ||||
@@ -135,7 +136,8 @@ class UnexpectedInput(LarkError): | |||||
class UnexpectedEOF(ParseError, UnexpectedInput): | class UnexpectedEOF(ParseError, UnexpectedInput): | ||||
"""An exception that is raised by the parser, when the input ends while it still expects a token. | |||||
""" | |||||
expected: 'List[Token]' | expected: 'List[Token]' | ||||
def __init__(self, expected, state=None, terminals_by_name=None): | def __init__(self, expected, state=None, terminals_by_name=None): | ||||
@@ -158,6 +160,9 @@ class UnexpectedEOF(ParseError, UnexpectedInput): | |||||
class UnexpectedCharacters(LexError, UnexpectedInput): | class UnexpectedCharacters(LexError, UnexpectedInput): | ||||
"""An exception that is raised by the lexer, when it cannot match the next | |||||
string of characters to any of its terminals. | |||||
""" | |||||
allowed: Set[str] | allowed: Set[str] | ||||
considered_tokens: Set[Any] | considered_tokens: Set[Any] | ||||
@@ -199,10 +204,15 @@ class UnexpectedToken(ParseError, UnexpectedInput): | |||||
"""An exception that is raised by the parser, when the token it received | """An exception that is raised by the parser, when the token it received | ||||
doesn't match any valid step forward. | doesn't match any valid step forward. | ||||
The parser provides an interactive instance through `interactive_parser`, | |||||
which is initialized to the point of failture, and can be used for debugging and error handling. | |||||
Parameters: | |||||
token: The mismatched token | |||||
expected: The set of expected tokens | |||||
considered_rules: Which rules were considered, to deduce the expected tokens | |||||
state: A value representing the parser state. Do not rely on its value or type. | |||||
interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failture, | |||||
and can be used for debugging and error handling. | |||||
see: ``InteractiveParser``. | |||||
Note: These parameters are available as attributes of the instance. | |||||
""" | """ | ||||
expected: Set[str] | expected: Set[str] | ||||
@@ -247,8 +257,13 @@ class VisitError(LarkError): | |||||
"""VisitError is raised when visitors are interrupted by an exception | """VisitError is raised when visitors are interrupted by an exception | ||||
It provides the following attributes for inspection: | It provides the following attributes for inspection: | ||||
- obj: the tree node or token it was processing when the exception was raised | |||||
- orig_exc: the exception that cause it to fail | |||||
Parameters: | |||||
rule: the name of the visit rule that failed | |||||
obj: the tree-node or token that was being processed | |||||
orig_exc: the exception that cause it to fail | |||||
Note: These parameters are available as attributes | |||||
""" | """ | ||||
obj: 'Union[Tree, Token]' | obj: 'Union[Tree, Token]' | ||||
@@ -258,6 +273,7 @@ class VisitError(LarkError): | |||||
message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) | message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) | ||||
super(VisitError, self).__init__(message) | super(VisitError, self).__init__(message) | ||||
self.rule = rule | |||||
self.obj = obj | self.obj = obj | ||||
self.orig_exc = orig_exc | self.orig_exc = orig_exc | ||||
@@ -137,7 +137,7 @@ class LarkOptions(Serialize): | |||||
A List of either paths or loader functions to specify from where grammars are imported | A List of either paths or loader functions to specify from where grammars are imported | ||||
source_path | source_path | ||||
Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading | Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading | ||||
**=== End Options ===** | |||||
**=== End of Options ===** | |||||
""" | """ | ||||
if __doc__: | if __doc__: | ||||
__doc__ += OPTIONS_DOC | __doc__ += OPTIONS_DOC | ||||
@@ -560,6 +560,8 @@ class Lark(Serialize): | |||||
"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' | """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' | ||||
When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. | When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. | ||||
:raises UnexpectedCharacters: In case the lexer cannot find a suitable match. | |||||
""" | """ | ||||
if not hasattr(self, 'lexer') or dont_ignore: | if not hasattr(self, 'lexer') or dont_ignore: | ||||
lexer = self._build_lexer(dont_ignore) | lexer = self._build_lexer(dont_ignore) | ||||
@@ -602,6 +604,10 @@ class Lark(Serialize): | |||||
If a transformer is supplied to ``__init__``, returns whatever is the | If a transformer is supplied to ``__init__``, returns whatever is the | ||||
result of the transformation. Otherwise, returns a Tree instance. | result of the transformation. Otherwise, returns a Tree instance. | ||||
:raises UnexpectedInput: On a parse error, one of these sub-exceptions will rise: | |||||
``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``. | |||||
For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``. | |||||
""" | """ | ||||
return self.parser.parse(text, start=start, on_error=on_error) | return self.parser.parse(text, start=start, on_error=on_error) | ||||
@@ -281,7 +281,6 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): | |||||
return new_terminals, callback | return new_terminals, callback | ||||
class Scanner: | class Scanner: | ||||
def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): | def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): | ||||
self.terminals = terminals | self.terminals = terminals | ||||
@@ -10,7 +10,7 @@ from numbers import Integral | |||||
from contextlib import suppress | from contextlib import suppress | ||||
from typing import List, Tuple, Union, Callable, Dict, Optional | from typing import List, Tuple, Union, Callable, Dict, Optional | ||||
from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique | |||||
from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors | |||||
from .lexer import Token, TerminalDef, PatternStr, PatternRE | from .lexer import Token, TerminalDef, PatternStr, PatternRE | ||||
from .parse_tree_builder import ParseTreeBuilder | from .parse_tree_builder import ParseTreeBuilder | ||||
@@ -176,27 +176,136 @@ RULES = { | |||||
} | } | ||||
# Value 5 keeps the number of states in the lalr parser somewhat minimal | |||||
# It isn't optimal, but close to it. See PR #949 | |||||
SMALL_FACTOR_THRESHOLD = 5 | |||||
# The Threshold whether repeat via ~ are split up into different rules | |||||
# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, | |||||
# while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. | |||||
# (See PR #949) | |||||
REPEAT_BREAK_THRESHOLD = 50 | |||||
@inline_args | @inline_args | ||||
class EBNF_to_BNF(Transformer_InPlace): | class EBNF_to_BNF(Transformer_InPlace): | ||||
def __init__(self): | def __init__(self): | ||||
self.new_rules = [] | self.new_rules = [] | ||||
self.rules_by_expr = {} | |||||
self.rules_cache = {} | |||||
self.prefix = 'anon' | self.prefix = 'anon' | ||||
self.i = 0 | self.i = 0 | ||||
self.rule_options = None | self.rule_options = None | ||||
def _add_recurse_rule(self, type_, expr): | |||||
if expr in self.rules_by_expr: | |||||
return self.rules_by_expr[expr] | |||||
new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) | |||||
def _name_rule(self, inner): | |||||
new_name = '__%s_%s_%d' % (self.prefix, inner, self.i) | |||||
self.i += 1 | self.i += 1 | ||||
t = NonTerminal(new_name) | |||||
tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) | |||||
self.new_rules.append((new_name, tree, self.rule_options)) | |||||
self.rules_by_expr[expr] = t | |||||
return new_name | |||||
def _add_rule(self, key, name, expansions): | |||||
t = NonTerminal(name) | |||||
self.new_rules.append((name, expansions, self.rule_options)) | |||||
self.rules_cache[key] = t | |||||
return t | return t | ||||
def _add_recurse_rule(self, type_, expr): | |||||
try: | |||||
return self.rules_cache[expr] | |||||
except KeyError: | |||||
new_name = self._name_rule(type_) | |||||
t = NonTerminal(new_name) | |||||
tree = ST('expansions', [ | |||||
ST('expansion', [expr]), | |||||
ST('expansion', [t, expr]) | |||||
]) | |||||
return self._add_rule(expr, new_name, tree) | |||||
def _add_repeat_rule(self, a, b, target, atom): | |||||
"""Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times. | |||||
When called recursively (into target), it repeats atom for x(n) times, where: | |||||
x(0) = 1 | |||||
x(n) = a(n) * x(n-1) + b | |||||
Example rule when a=3, b=4: | |||||
new_rule: target target target atom atom atom atom | |||||
""" | |||||
key = (a, b, target, atom) | |||||
try: | |||||
return self.rules_cache[key] | |||||
except KeyError: | |||||
new_name = self._name_rule('repeat_a%d_b%d' % (a, b)) | |||||
tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) | |||||
return self._add_rule(key, new_name, tree) | |||||
def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): | |||||
"""Creates a rule that matches atom 0 to (a*n+b)-1 times. | |||||
When target matches n times atom, and target_opt 0 to n-1 times target_opt, | |||||
First we generate target * i followed by target_opt, for i from 0 to a-1 | |||||
These match 0 to n*a - 1 times atom | |||||
Then we generate target * a followed by atom * i, for i from 0 to b-1 | |||||
These match n*a to n*a + b-1 times atom | |||||
The created rule will not have any shift/reduce conflicts so that it can be used with lalr | |||||
Example rule when a=3, b=4: | |||||
new_rule: target_opt | |||||
| target target_opt | |||||
| target target target_opt | |||||
| target target target | |||||
| target target target atom | |||||
| target target target atom atom | |||||
| target target target atom atom atom | |||||
""" | |||||
key = (a, b, target, atom, "opt") | |||||
try: | |||||
return self.rules_cache[key] | |||||
except KeyError: | |||||
new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) | |||||
tree = ST('expansions', [ | |||||
ST('expansion', [target]*i + [target_opt]) for i in range(a) | |||||
] + [ | |||||
ST('expansion', [target]*a + [atom]*i) for i in range(b) | |||||
]) | |||||
return self._add_rule(key, new_name, tree) | |||||
def _generate_repeats(self, rule, mn, mx): | |||||
"""Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times. | |||||
""" | |||||
# For a small number of repeats, we can take the naive approach | |||||
if mx < REPEAT_BREAK_THRESHOLD: | |||||
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) | |||||
# For large repeat values, we break the repetition into sub-rules. | |||||
# We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``. | |||||
# We then use small_factors to split up mn and diff up into values [(a, b), ...] | |||||
# This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt | |||||
# to generate a complete rule/expression that matches the corresponding number of repeats | |||||
mn_target = rule | |||||
for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD): | |||||
mn_target = self._add_repeat_rule(a, b, mn_target, rule) | |||||
if mx == mn: | |||||
return mn_target | |||||
diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less | |||||
diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD) | |||||
diff_target = rule # Match rule 1 times | |||||
diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times) | |||||
for a, b in diff_factors[:-1]: | |||||
diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) | |||||
diff_target = self._add_repeat_rule(a, b, diff_target, rule) | |||||
a, b = diff_factors[-1] | |||||
diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) | |||||
return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) | |||||
def expr(self, rule, op, *args): | def expr(self, rule, op, *args): | ||||
if op.value == '?': | if op.value == '?': | ||||
empty = ST('expansion', []) | empty = ST('expansion', []) | ||||
@@ -221,7 +330,9 @@ class EBNF_to_BNF(Transformer_InPlace): | |||||
mn, mx = map(int, args) | mn, mx = map(int, args) | ||||
if mx < mn or mn < 0: | if mx < mn or mn < 0: | ||||
raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) | raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) | ||||
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)]) | |||||
return self._generate_repeats(rule, mn, mx) | |||||
assert False, op | assert False, op | ||||
def maybe(self, rule): | def maybe(self, rule): | ||||
@@ -238,4 +238,4 @@ class MakeParsingFrontend(MakeParsingFrontend): | |||||
assert isinstance(parser_conf, ParserConf) | assert isinstance(parser_conf, ParserConf) | ||||
parser_conf.parser_type = self.parser_type | parser_conf.parser_type = self.parser_type | ||||
lexer_conf.lexer_type = self.lexer_type | lexer_conf.lexer_type = self.lexer_type | ||||
return ParsingFrontend(lexer_conf, parser_conf, options) | |||||
return ParsingFrontend(lexer_conf, parser_conf, options) |
@@ -65,7 +65,7 @@ class InteractiveParser(object): | |||||
"""Print the output of ``choices()`` in a way that's easier to read.""" | """Print the output of ``choices()`` in a way that's easier to read.""" | ||||
out = ["Parser choices:"] | out = ["Parser choices:"] | ||||
for k, v in self.choices().items(): | for k, v in self.choices().items(): | ||||
out.append('\t- %s -> %s' % (k, v)) | |||||
out.append('\t- %s -> %r' % (k, v)) | |||||
out.append('stack size: %s' % len(self.parser_state.state_stack)) | out.append('stack size: %s' % len(self.parser_state.state_stack)) | ||||
return '\n'.join(out) | return '\n'.join(out) | ||||
@@ -163,7 +163,7 @@ def get_regexp_width(expr): | |||||
return 1, sre_constants.MAXREPEAT | return 1, sre_constants.MAXREPEAT | ||||
else: | else: | ||||
return 0, sre_constants.MAXREPEAT | return 0, sre_constants.MAXREPEAT | ||||
###} | ###} | ||||
@@ -245,7 +245,7 @@ except ImportError: | |||||
class FS: | class FS: | ||||
exists = os.path.exists | exists = os.path.exists | ||||
@staticmethod | @staticmethod | ||||
def open(name, mode="r", **kwargs): | def open(name, mode="r", **kwargs): | ||||
if atomicwrites and "w" in mode: | if atomicwrites and "w" in mode: | ||||
@@ -316,3 +316,29 @@ def _serialize(value, memo): | |||||
return {key:_serialize(elem, memo) for key, elem in value.items()} | return {key:_serialize(elem, memo) for key, elem in value.items()} | ||||
# assert value is None or isinstance(value, (int, float, str, tuple)), value | # assert value is None or isinstance(value, (int, float, str, tuple)), value | ||||
return value | return value | ||||
def small_factors(n, max_factor):
    """
    Split *n* into a sequence of factor/summand pairs, each bounded by *max_factor*.

    Returns a list [(a, b), ...] such that folding it reconstructs n:

        n = 1
        for a, b in pairs:
            n = n * a + b

    Each pair additionally satisfies a + b <= max_factor (this extra
    constraint might change in the future).
    """
    assert n >= 0
    assert max_factor > 2
    pairs = []
    # Peel off one (factor, summand) pair per iteration, preferring the
    # largest usable factor, until the remainder fits in a single pair.
    while n > max_factor:
        for a in range(max_factor, 1, -1):
            quotient, b = divmod(n, a)
            if a + b <= max_factor:
                pairs.append((a, b))
                n = quotient
                break
        else:
            assert False, "Failed to factorize %s" % n
    pairs.append((n, 0))
    # Reverse so the innermost (deepest) pair comes first, matching the
    # order a recursive formulation would produce.
    pairs.reverse()
    return pairs
@@ -3,7 +3,7 @@ from __future__ import absolute_import | |||||
import sys | import sys | ||||
from unittest import TestCase, main | from unittest import TestCase, main | ||||
from lark import Lark, Token, Tree | |||||
from lark import Lark, Token, Tree, ParseError, UnexpectedInput | |||||
from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors | from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors | ||||
from lark.load_grammar import FromPackageLoader | from lark.load_grammar import FromPackageLoader | ||||
@@ -198,6 +198,53 @@ class TestGrammar(TestCase): | |||||
x = find_grammar_errors(text) | x = find_grammar_errors(text) | ||||
assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6] | assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6] | ||||
def test_ranged_repeat_terms(self):
    """Terminals using ~n and ~a..b repeat modifiers accept exactly the allowed counts."""
    grammar = u"""!start: AAA
                AAA: "A"~3
                """
    parser = Lark(grammar, parser='lalr')
    self.assertEqual(parser.parse(u'AAA'), Tree('start', ["AAA"]))
    for bad in (u'AA', u'AAAA'):
        self.assertRaises((ParseError, UnexpectedInput), parser.parse, bad)

    grammar = u"""!start: AABB CC
                AABB: "A"~0..2 "B"~2
                CC: "C"~1..2
                """
    parser = Lark(grammar, parser='lalr')
    for text, children in ((u'AABBCC', ['AABB', 'CC']),
                           (u'BBC', ['BB', 'C']),
                           (u'ABBCC', ['ABB', 'CC'])):
        self.assertEqual(parser.parse(text), Tree('start', children))
    for bad in (u'AAAB', u'AAABBB', u'ABB', u'AAAABB'):
        self.assertRaises((ParseError, UnexpectedInput), parser.parse, bad)
def test_ranged_repeat_large(self):
    """Large repeat counts are decomposed into several rules yet still match exactly."""
    grammar = u"""!start: "A"~60
                """
    parser = Lark(grammar, parser='lalr')
    # The repeat expansion should have produced helper rules, not one giant rule.
    self.assertGreater(len(parser.rules), 1, "Expected that more than one rule will be generated")
    self.assertEqual(parser.parse(u'A' * 60), Tree('start', ["A"] * 60))
    self.assertRaises(ParseError, parser.parse, u'A' * 59)
    self.assertRaises((ParseError, UnexpectedInput), parser.parse, u'A' * 61)

    grammar = u"""!start: "A"~15..100
                """
    parser = Lark(grammar, parser='lalr')
    for count in range(0, 110):
        text = u'A' * count
        if 15 <= count <= 100:
            self.assertEqual(parser.parse(text), Tree('start', ['A'] * count))
        else:
            self.assertRaises(UnexpectedInput, parser.parse, text)

    # 8191 is a Mersenne prime
    grammar = u"""start: "A"~8191
                """
    parser = Lark(grammar, parser='lalr')
    self.assertEqual(parser.parse(u'A' * 8191), Tree('start', []))
    for count in (8190, 8192):
        self.assertRaises(UnexpectedInput, parser.parse, u'A' * count)
if __name__ == '__main__': | if __name__ == '__main__': | ||||
@@ -2203,27 +2203,7 @@ def _make_parser_test(LEXER, PARSER): | |||||
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') | self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') | ||||
def test_ranged_repeat_terms(self):
    """Terminals using ~n and ~a..b repeat modifiers accept exactly the allowed counts."""
    grammar = u"""!start: AAA
                AAA: "A"~3
                """
    parser = _Lark(grammar)
    self.assertEqual(parser.parse(u'AAA'), Tree('start', ["AAA"]))
    for bad in (u'AA', u'AAAA'):
        self.assertRaises((ParseError, UnexpectedInput), parser.parse, bad)

    grammar = u"""!start: AABB CC
                AABB: "A"~0..2 "B"~2
                CC: "C"~1..2
                """
    parser = _Lark(grammar)
    for text, children in ((u'AABBCC', ['AABB', 'CC']),
                           (u'BBC', ['BB', 'C']),
                           (u'ABBCC', ['ABB', 'CC'])):
        self.assertEqual(parser.parse(text), Tree('start', children))
    for bad in (u'AAAB', u'AAABBB', u'ABB', u'AAAABB'):
        self.assertRaises((ParseError, UnexpectedInput), parser.parse, bad)
@unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX | @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX | ||||
def test_priority_vs_embedded(self): | def test_priority_vs_embedded(self): | ||||