diff --git a/README.md b/README.md
index 8ec22ed..82f6148 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h
 - [Documentation @readthedocs](https://lark-parser.readthedocs.io/)
 - [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf)
-- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide/app.html)
+- [Online IDE](https://lark-parser.github.io/ide)
 - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser.
 - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/)
 - [Gitter chat](https://gitter.im/lark-parser/Lobby)
diff --git a/docs/classes.rst b/docs/classes.rst
index 7b18460..1287896 100644
--- a/docs/classes.rst
+++ b/docs/classes.rst
@@ -66,6 +66,8 @@ UnexpectedInput
 
 .. autoclass:: lark.exceptions.UnexpectedCharacters
 
+.. autoclass:: lark.exceptions.UnexpectedEOF
+
 InteractiveParser
 -----------------
 
diff --git a/docs/index.rst b/docs/index.rst
index 39ecd5a..e8bd6b2 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -113,7 +113,7 @@ Resources
 .. _Examples: https://github.com/lark-parser/lark/tree/master/examples
 .. _Third-party examples: https://github.com/ligurio/lark-grammars
-.. _Online IDE: https://lark-parser.github.io/lark/ide/app.html
+.. _Online IDE: https://lark-parser.github.io/ide
 .. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/
 .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html
 .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf
diff --git a/docs/visitors.rst b/docs/visitors.rst
index a0e1711..f263712 100644
--- a/docs/visitors.rst
+++ b/docs/visitors.rst
@@ -107,3 +107,8 @@ Discard
 -------
 
 .. autoclass:: lark.visitors.Discard
+
+VisitError
+----------
+
+.. autoclass:: lark.exceptions.VisitError
\ No newline at end of file
diff --git a/examples/advanced/python3.lark b/examples/advanced/python3.lark
index 0fc5949..7fb5ae5 100644
--- a/examples/advanced/python3.lark
+++ b/examples/advanced/python3.lark
@@ -21,7 +21,7 @@ decorators: decorator+
 decorated: decorators (classdef | funcdef | async_funcdef)
 
 async_funcdef: "async" funcdef
-funcdef: "def" NAME "(" parameters? ")" ["->" test] ":" suite
+funcdef: "def" NAME "(" [parameters] ")" ["->" test] ":" suite
 
 parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams]]
     | starparams
@@ -29,25 +29,36 @@ parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams
 
 SLASH: "/" // Otherwise it will completely disappear and be indistinguishable in the result
 starparams: "*" typedparam? ("," paramvalue)* ["," kwparams]
-kwparams: "**" typedparam
+kwparams: "**" typedparam ","?
 
-?paramvalue: typedparam ["=" test]
-?typedparam: NAME [":" test]
+?paramvalue: typedparam ("=" test)?
+?typedparam: NAME (":" test)?
 
-varargslist: (vfpdef ["=" test] ("," vfpdef ["=" test])* ["," [ "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | "**" vfpdef [","]]]
-  | "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]]
-  | "**" vfpdef [","])
-vfpdef: NAME
+lambdef: "lambda" [lambda_params] ":" test
+lambdef_nocond: "lambda" [lambda_params] ":" test_nocond
+lambda_params: lambda_paramvalue ("," lambda_paramvalue)* ["," [lambda_starparams | lambda_kwparams]]
+    | lambda_starparams
+    | lambda_kwparams
+?lambda_paramvalue: NAME ("=" test)?
+lambda_starparams: "*" [NAME] ("," lambda_paramvalue)* ["," [lambda_kwparams]]
+lambda_kwparams: "**" NAME ","?
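The reworked ``parameters`` and new ``lambda_params`` rules above are easiest to sanity-check by actually loading the grammar. A minimal sketch, modeled on Lark's `examples/advanced/python_parser.py` — the `file_input` start rule, the indenter subclass, and the grammar path are assumptions carried over from that example, not part of this diff:

```python
from lark import Lark
from lark.indenter import Indenter

class PythonIndenter(Indenter):
    # Terminal names assumed to match examples/advanced/python3.lark
    NL_type = '_NEWLINE'
    OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
    CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    tab_len = 8

parser = Lark.open('examples/advanced/python3.lark', parser='lalr',
                   start='file_input', postlex=PythonIndenter())

# Exercises lambda_params, lambda_starparams and lambda_kwparams:
tree = parser.parse('f = lambda x, *rest, **kw: x\n')
print(tree.pretty())
```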
+ ?stmt: simple_stmt | compound_stmt ?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE -?small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) -?expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) - | ("=" (yield_expr|testlist_star_expr))*) -annassign: ":" test ["=" test] -?testlist_star_expr: (test|star_expr) ("," (test|star_expr))* [","] -!augassign: ("+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=") +?small_stmt: (expr_stmt | assign_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) +expr_stmt: testlist_star_expr +assign_stmt: annassign | augassign | assign + +annassign: testlist_star_expr ":" test ["=" test] +assign: testlist_star_expr ("=" (yield_expr|testlist_star_expr))+ +augassign: testlist_star_expr augassign_op (yield_expr|testlist) +!augassign_op: "+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=" +?testlist_star_expr: test_or_star_expr + | test_or_star_expr ("," test_or_star_expr)+ ","? -> tuple + | test_or_star_expr "," -> tuple + // For normal and annotated assignments, additional restrictions enforced by the interpreter del_stmt: "del" exprlist pass_stmt: "pass" @@ -71,43 +82,52 @@ global_stmt: "global" NAME ("," NAME)* nonlocal_stmt: "nonlocal" NAME ("," NAME)* assert_stmt: "assert" test ["," test] -compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt +?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt async_stmt: "async" (funcdef | with_stmt | for_stmt) -if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite] +if_stmt: "if" test ":" suite elifs ["else" ":" suite] +elifs: elif_* +elif_: "elif" test ":" suite while_stmt: "while" test ":" suite ["else" ":" suite] for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] -try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite)) -with_stmt: "with" with_item ("," with_item)* ":" suite +try_stmt: "try" ":" suite except_clauses ["else" ":" suite] [finally] + | "try" ":" suite finally -> try_finally +finally: "finally" ":" suite +except_clauses: except_clause+ +except_clause: "except" [test ["as" NAME]] ":" suite + +with_stmt: "with" with_items ":" suite +with_items: with_item ("," with_item)* with_item: test ["as" expr] // NB compile.c makes sure that the default except clause is last -except_clause: "except" [test ["as" NAME]] suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT -?test: or_test ("if" or_test "else" test)? | lambdef +?test: or_test ("if" or_test "else" test)? 
+ | lambdef ?test_nocond: or_test | lambdef_nocond -lambdef: "lambda" [varargslist] ":" test -lambdef_nocond: "lambda" [varargslist] ":" test_nocond + ?or_test: and_test ("or" and_test)* ?and_test: not_test ("and" not_test)* -?not_test: "not" not_test -> not +?not_test: "not" not_test -> not_test | comparison -?comparison: expr (_comp_op expr)* +?comparison: expr (comp_op expr)* star_expr: "*" expr -?expr: xor_expr ("|" xor_expr)* + +?expr: or_expr +?or_expr: xor_expr ("|" xor_expr)* ?xor_expr: and_expr ("^" and_expr)* ?and_expr: shift_expr ("&" shift_expr)* ?shift_expr: arith_expr (_shift_op arith_expr)* ?arith_expr: term (_add_op term)* ?term: factor (_mul_op factor)* -?factor: _factor_op factor | power +?factor: _unary_op factor | power -!_factor_op: "+"|"-"|"~" +!_unary_op: "+"|"-"|"~" !_add_op: "+"|"-" !_shift_op: "<<"|">>" !_mul_op: "*"|"@"|"/"|"%"|"//" // <> isn't actually a valid comparison operator in Python. It's here for the // sake of a __future__ import described in PEP 401 (which really works :-) -!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" +!comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" ?power: await_expr ("**" factor)? ?await_expr: AWAIT? atom_expr @@ -118,61 +138,75 @@ AWAIT: "await" | atom_expr "." NAME -> getattr | atom -?atom: "(" [yield_expr|tuplelist_comp] ")" -> tuple - | "[" [testlist_comp] "]" -> list - | "{" [dict_comp] "}" -> dict - | "{" set_comp "}" -> set +?atom: "(" yield_expr ")" + | "(" _tuple_inner? ")" -> tuple + | "(" comprehension{test_or_star_expr} ")" -> tuple_comprehension + | "[" _testlist_comp? "]" -> list + | "[" comprehension{test_or_star_expr} "]" -> list_comprehension + | "{" _dict_exprlist? "}" -> dict + | "{" comprehension{key_value} "}" -> dict_comprehension + | "{" _set_exprlist "}" -> set + | "{" comprehension{test} "}" -> set_comprehension | NAME -> var - | number | string+ + | number + | string_concat | "(" test ")" | "..." -> ellipsis | "None" -> const_none | "True" -> const_true | "False" -> const_false -?testlist_comp: test | tuplelist_comp -tuplelist_comp: (test|star_expr) (comp_for | ("," (test|star_expr))+ [","] | ",") + +?string_concat: string+ + +_testlist_comp: test | _tuple_inner +_tuple_inner: test_or_star_expr (("," test_or_star_expr)+ [","] | ",") + + +?test_or_star_expr: test + | star_expr + ?subscriptlist: subscript | subscript (("," subscript)+ [","] | ",") -> subscript_tuple -subscript: test | ([test] ":" [test] [sliceop]) -> slice +?subscript: test | ([test] ":" [test] [sliceop]) -> slice sliceop: ":" [test] -exprlist: (expr|star_expr) - | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") -> exprlist_tuple -testlist: test | testlist_tuple +?exprlist: (expr|star_expr) + | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") +?testlist: test | testlist_tuple testlist_tuple: test (("," test)+ [","] | ",") -dict_comp: key_value comp_for - | (key_value | "**" expr) ("," (key_value | "**" expr))* [","] +_dict_exprlist: (key_value | "**" expr) ("," (key_value | "**" expr))* [","] key_value: test ":" test -set_comp: test comp_for - | (test|star_expr) ("," (test | star_expr))* [","] +_set_exprlist: test_or_star_expr ("," test_or_star_expr)* [","] classdef: "class" NAME ["(" [arguments] ")"] ":" suite + + arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])? 
| starargs | kwargs - | test comp_for + | comprehension{test} -starargs: "*" test ("," "*" test)* ("," argvalue)* ["," kwargs] +starargs: stararg ("," stararg)* ("," argvalue)* ["," kwargs] +stararg: "*" test kwargs: "**" test ?argvalue: test ("=" test)? - -comp_iter: comp_for | comp_if | async_for -async_for: "async" "for" exprlist "in" or_test [comp_iter] -comp_for: "for" exprlist "in" or_test [comp_iter] -comp_if: "if" test_nocond [comp_iter] +comprehension{comp_result}: comp_result comp_fors [comp_if] +comp_fors: comp_for+ +comp_for: [ASYNC] "for" exprlist "in" or_test +ASYNC: "async" +?comp_if: "if" test_nocond // not used in grammar, but may appear in "node" passed from Parser to Compiler encoding_decl: NAME -yield_expr: "yield" [yield_arg] -yield_arg: "from" test | testlist - +yield_expr: "yield" [testlist] + | "yield" "from" test -> yield_from number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER string: STRING | LONG_STRING @@ -181,6 +215,7 @@ string: STRING | LONG_STRING %import python (NAME, COMMENT, STRING, LONG_STRING) %import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER) + // Other terminals _NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+ diff --git a/examples/standalone/json_parser_main.py b/examples/standalone/json_parser_main.py index 503b249..3d9b5a6 100644 --- a/examples/standalone/json_parser_main.py +++ b/examples/standalone/json_parser_main.py @@ -10,7 +10,9 @@ Standalone Parser import sys -from json_parser import Lark_StandAlone, Transformer, inline_args +from json_parser import Lark_StandAlone, Transformer, v_args + +inline_args = v_args(inline=True) class TreeToJson(Transformer): @inline_args diff --git a/lark/ast_utils.py b/lark/ast_utils.py index c535f11..abd7384 100644 --- a/lark/ast_utils.py +++ b/lark/ast_utils.py @@ -38,8 +38,8 @@ def create_transformer(ast_module: types.ModuleType, transformer: Optional[Trans Classes starting with an underscore (`_`) will be skipped. Parameters: - ast_module - A Python module containing all the subclasses of `ast_utils.Ast` - transformer (Optional[Transformer]) - An initial transformer. Its attributes may be overwritten. + ast_module: A Python module containing all the subclasses of ``ast_utils.Ast`` + transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten. """ t = transformer or Transformer() diff --git a/lark/exceptions.py b/lark/exceptions.py index 55e9a3a..7a331ad 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -40,8 +40,9 @@ class UnexpectedInput(LarkError): Used as a base class for the following exceptions: - - ``UnexpectedToken``: The parser received an unexpected token - ``UnexpectedCharacters``: The lexer encountered an unexpected string + - ``UnexpectedToken``: The parser received an unexpected token + - ``UnexpectedEOF``: The parser expected a token, but the input ended After catching one of these exceptions, you may call the following helper methods to create a nicer error message. """ @@ -135,7 +136,8 @@ class UnexpectedInput(LarkError): class UnexpectedEOF(ParseError, UnexpectedInput): - + """An exception that is raised by the parser, when the input ends while it still expects a token. 
+ """ expected: 'List[Token]' def __init__(self, expected, state=None, terminals_by_name=None): @@ -158,6 +160,9 @@ class UnexpectedEOF(ParseError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput): + """An exception that is raised by the lexer, when it cannot match the next + string of characters to any of its terminals. + """ allowed: Set[str] considered_tokens: Set[Any] @@ -199,10 +204,15 @@ class UnexpectedToken(ParseError, UnexpectedInput): """An exception that is raised by the parser, when the token it received doesn't match any valid step forward. - The parser provides an interactive instance through `interactive_parser`, - which is initialized to the point of failture, and can be used for debugging and error handling. + Parameters: + token: The mismatched token + expected: The set of expected tokens + considered_rules: Which rules were considered, to deduce the expected tokens + state: A value representing the parser state. Do not rely on its value or type. + interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failture, + and can be used for debugging and error handling. - see: ``InteractiveParser``. + Note: These parameters are available as attributes of the instance. """ expected: Set[str] @@ -247,8 +257,13 @@ class VisitError(LarkError): """VisitError is raised when visitors are interrupted by an exception It provides the following attributes for inspection: - - obj: the tree node or token it was processing when the exception was raised - - orig_exc: the exception that cause it to fail + + Parameters: + rule: the name of the visit rule that failed + obj: the tree-node or token that was being processed + orig_exc: the exception that cause it to fail + + Note: These parameters are available as attributes """ obj: 'Union[Tree, Token]' @@ -258,6 +273,7 @@ class VisitError(LarkError): message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) + self.rule = rule self.obj = obj self.orig_exc = orig_exc diff --git a/lark/lark.py b/lark/lark.py index aed3346..206541a 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -137,7 +137,7 @@ class LarkOptions(Serialize): A List of either paths or loader functions to specify from where grammars are imported source_path Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading - **=== End Options ===** + **=== End of Options ===** """ if __doc__: __doc__ += OPTIONS_DOC @@ -560,6 +560,8 @@ class Lark(Serialize): """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. + + :raises UnexpectedCharacters: In case the lexer cannot find a suitable match. """ if not hasattr(self, 'lexer') or dont_ignore: lexer = self._build_lexer(dont_ignore) @@ -602,6 +604,10 @@ class Lark(Serialize): If a transformer is supplied to ``__init__``, returns whatever is the result of the transformation. Otherwise, returns a Tree instance. + :raises UnexpectedInput: On a parse error, one of these sub-exceptions will rise: + ``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``. + For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``. 
+ """ return self.parser.parse(text, start=start, on_error=on_error) diff --git a/lark/lexer.py b/lark/lexer.py index 512e8ff..0b7bef8 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -281,7 +281,6 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): return new_terminals, callback - class Scanner: def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): self.terminals = terminals diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 31b2c35..3b46426 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -10,7 +10,7 @@ from numbers import Integral from contextlib import suppress from typing import List, Tuple, Union, Callable, Dict, Optional -from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique +from .utils import bfs, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -176,27 +176,136 @@ RULES = { } +# Value 5 keeps the number of states in the lalr parser somewhat minimal +# It isn't optimal, but close to it. See PR #949 +SMALL_FACTOR_THRESHOLD = 5 +# The Threshold whether repeat via ~ are split up into different rules +# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, +# while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. +# (See PR #949) +REPEAT_BREAK_THRESHOLD = 50 + + @inline_args class EBNF_to_BNF(Transformer_InPlace): def __init__(self): self.new_rules = [] - self.rules_by_expr = {} + self.rules_cache = {} self.prefix = 'anon' self.i = 0 self.rule_options = None - def _add_recurse_rule(self, type_, expr): - if expr in self.rules_by_expr: - return self.rules_by_expr[expr] - - new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) + def _name_rule(self, inner): + new_name = '__%s_%s_%d' % (self.prefix, inner, self.i) self.i += 1 - t = NonTerminal(new_name) - tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) - self.new_rules.append((new_name, tree, self.rule_options)) - self.rules_by_expr[expr] = t + return new_name + + def _add_rule(self, key, name, expansions): + t = NonTerminal(name) + self.new_rules.append((name, expansions, self.rule_options)) + self.rules_cache[key] = t return t + def _add_recurse_rule(self, type_, expr): + try: + return self.rules_cache[expr] + except KeyError: + new_name = self._name_rule(type_) + t = NonTerminal(new_name) + tree = ST('expansions', [ + ST('expansion', [expr]), + ST('expansion', [t, expr]) + ]) + return self._add_rule(expr, new_name, tree) + + def _add_repeat_rule(self, a, b, target, atom): + """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times. + + When called recursively (into target), it repeats atom for x(n) times, where: + x(0) = 1 + x(n) = a(n) * x(n-1) + b + + Example rule when a=3, b=4: + + new_rule: target target target atom atom atom atom + + """ + key = (a, b, target, atom) + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('repeat_a%d_b%d' % (a, b)) + tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) + return self._add_rule(key, new_name, tree) + + def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): + """Creates a rule that matches atom 0 to (a*n+b)-1 times. 
+    def _add_repeat_opt_rule(self, a, b, target, target_opt, atom):
+        """Creates a rule that matches atom 0 to (a*n+b)-1 times,
+        where target matches atom n times, and target_opt matches atom 0 to n-1 times.
+
+        First we generate target * i followed by target_opt, for i from 0 to a-1.
+        These match atom 0 to n*a - 1 times.
+
+        Then we generate target * a followed by atom * i, for i from 0 to b-1.
+        These match atom n*a to n*a + b-1 times.
+
+        The created rule will not have any shift/reduce conflicts, so that it can be used with LALR.
+
+        Example rule when a=3, b=4:
+
+            new_rule: target_opt
+                    | target target_opt
+                    | target target target_opt
+
+                    | target target target
+                    | target target target atom
+                    | target target target atom atom
+                    | target target target atom atom atom
+
+        """
+        key = (a, b, target, atom, "opt")
+        try:
+            return self.rules_cache[key]
+        except KeyError:
+            new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b))
+            tree = ST('expansions', [
+                ST('expansion', [target]*i + [target_opt]) for i in range(a)
+            ] + [
+                ST('expansion', [target]*a + [atom]*i) for i in range(b)
+            ])
+            return self._add_rule(key, new_name, tree)
+
+    def _generate_repeats(self, rule, mn, mx):
+        """Generates a rule tree that repeats ``rule`` between ``mn`` and ``mx`` times.
+        """
+        # For a small number of repeats, we can take the naive approach
+        if mx < REPEAT_BREAK_THRESHOLD:
+            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)])
+
+        # For large repeat values, we break the repetition into sub-rules.
+        # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``.
+        # We then use small_factors to split mn and diff into values [(a, b), ...]
+        # These values are used with the help of _add_repeat_rule and _add_repeat_opt_rule
+        # to generate a complete rule/expression that matches the corresponding number of repeats
+        mn_target = rule
+        for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD):
+            mn_target = self._add_repeat_rule(a, b, mn_target, rule)
+        if mx == mn:
+            return mn_target
+
+        diff = mx - mn + 1  # We add one because _add_repeat_opt_rule generates rules that match one less
+        diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD)
+        diff_target = rule  # Match rule 1 times
+        diff_opt_target = ST('expansion', [])  # Match rule 0 times (i.e. up to 1-1 = 0 times)
+        for a, b in diff_factors[:-1]:
+            diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
+            diff_target = self._add_repeat_rule(a, b, diff_target, rule)
+
+        a, b = diff_factors[-1]
+        diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
+
+        return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])])
+
     def expr(self, rule, op, *args):
         if op.value == '?':
             empty = ST('expansion', [])
@@ -221,7 +330,9 @@ class EBNF_to_BNF(Transformer_InPlace):
             mn, mx = map(int, args)
             if mx < mn or mn < 0:
                 raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
-            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
+
+            return self._generate_repeats(rule, mn, mx)
+
         assert False, op
 
     def maybe(self, rule):
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index dab869e..37d59ae 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -238,4 +238,4 @@ class MakeParsingFrontend(MakeParsingFrontend):
         assert isinstance(parser_conf, ParserConf)
         parser_conf.parser_type = self.parser_type
         lexer_conf.lexer_type = self.lexer_type
-        return ParsingFrontend(lexer_conf, parser_conf, options)
\ No newline at end of file
+        return ParsingFrontend(lexer_conf, parser_conf, options)
diff --git a/lark/parsers/lalr_interactive_parser.py b/lark/parsers/lalr_interactive_parser.py
index eeadef8..99dfc92 100644
--- a/lark/parsers/lalr_interactive_parser.py
+++ b/lark/parsers/lalr_interactive_parser.py
@@ -65,7 +65,7 @@ class InteractiveParser(object):
         """Print the output of ``choices()`` in a way that's easier to read."""
         out = ["Parser choices:"]
         for k, v in self.choices().items():
-            out.append('\t- %s -> %s' % (k, v))
+            out.append('\t- %s -> %r' % (k, v))
         out.append('stack size: %s' % len(self.parser_state.state_stack))
         return '\n'.join(out)
 
diff --git a/lark/utils.py b/lark/utils.py
index 924300a..2b6e3b6 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -163,7 +163,7 @@ def get_regexp_width(expr):
             return 1, sre_constants.MAXREPEAT
         else:
             return 0, sre_constants.MAXREPEAT
-    
+
 ###}
 
@@ -245,7 +245,7 @@ except ImportError:
 
 class FS:
     exists = os.path.exists
-    
+
     @staticmethod
     def open(name, mode="r", **kwargs):
         if atomicwrites and "w" in mode:
@@ -316,3 +316,29 @@ def _serialize(value, memo):
         return {key:_serialize(elem, memo) for key, elem in value.items()}
     # assert value is None or isinstance(value, (int, float, str, tuple)), value
     return value
+
+
+
+
+def small_factors(n, max_factor):
+    """
+    Splits n up into smaller factors and summands <= max_factor.
+    Returns a list of [(a, b), ...]
+    so that the following code returns n:
+
+        n = 1
+        for a, b in values:
+            n = n * a + b
+
+    Currently, we also keep a + b <= max_factor, but that might change.
+    """
+    assert n >= 0
+    assert max_factor > 2
+    if n <= max_factor:
+        return [(n, 0)]
+
+    for a in range(max_factor, 1, -1):
+        r, b = divmod(n, a)
+        if a + b <= max_factor:
+            return small_factors(r, max_factor) + [(a, b)]
+    assert False, "Failed to factorize %s" % n
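``small_factors`` carries the invariant the repeat machinery above relies on. A quick sketch of checking it — the exact pairs shown come from hand-running the greedy loop, and are illustrative only:

```python
from lark.utils import small_factors

factors = small_factors(100, 5)   # e.g. [(4, 0), (5, 0), (5, 0)]
n = 1
for a, b in factors:
    n = n * a + b                 # the docstring's reconstruction loop
assert n == 100
```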
diff --git a/tests/test_grammar.py b/tests/test_grammar.py
index a643117..3ae65f2 100644
--- a/tests/test_grammar.py
+++ b/tests/test_grammar.py
@@ -3,7 +3,7 @@ from __future__ import absolute_import
 import sys
 from unittest import TestCase, main
 
-from lark import Lark, Token, Tree
+from lark import Lark, Token, Tree, ParseError, UnexpectedInput
 from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors
 from lark.load_grammar import FromPackageLoader
 
@@ -198,6 +198,53 @@ class TestGrammar(TestCase):
         x = find_grammar_errors(text)
         assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6]
 
+    def test_ranged_repeat_terms(self):
+        g = u"""!start: AAA
+                AAA: "A"~3
+            """
+        l = Lark(g, parser='lalr')
+        self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')
+
+        g = u"""!start: AABB CC
+                AABB: "A"~0..2 "B"~2
+                CC: "C"~1..2
+            """
+        l = Lark(g, parser='lalr')
+        self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
+        self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
+        self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
+
+    def test_ranged_repeat_large(self):
+        g = u"""!start: "A"~60
+            """
+        l = Lark(g, parser='lalr')
+        self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated")
+        self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60))
+        self.assertRaises(ParseError, l.parse, u'A' * 59)
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61)
+
+        g = u"""!start: "A"~15..100
+            """
+        l = Lark(g, parser='lalr')
+        for i in range(0, 110):
+            if 15 <= i <= 100:
+                self.assertEqual(l.parse(u'A' * i), Tree('start', ['A'] * i))
+            else:
+                self.assertRaises(UnexpectedInput, l.parse, u'A' * i)
+
+        # 8191 is a Mersenne prime
+        g = u"""start: "A"~8191
+            """
+        l = Lark(g, parser='lalr')
+        self.assertEqual(l.parse(u'A' * 8191), Tree('start', []))
+        self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190)
+        self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192)
 
 if __name__ == '__main__':
diff --git a/tests/test_parser.py b/tests/test_parser.py
index ef1992a..ebc6152 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -2203,27 +2203,7 @@ def _make_parser_test(LEXER, PARSER):
         self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
 
 
-    def test_ranged_repeat_terms(self):
-        g = u"""!start: AAA
-                AAA: "A"~3
-            """
-        l = _Lark(g)
-        self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
-        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
-        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')
-
-        g = u"""!start: AABB CC
-                AABB: "A"~0..2 "B"~2
-                CC: "C"~1..2
-            """
-        l = _Lark(g)
-        self.assertEqual(l.parse(u'AABBCC'), Tree('start',
['AABB', 'CC'])) - self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) - self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX def test_priority_vs_embedded(self):