diff --git a/docs/classes.rst b/docs/classes.rst
index 7b18460..1287896 100644
--- a/docs/classes.rst
+++ b/docs/classes.rst
@@ -66,6 +66,8 @@ UnexpectedInput

 .. autoclass:: lark.exceptions.UnexpectedCharacters

+.. autoclass:: lark.exceptions.UnexpectedEOF
+
 InteractiveParser
 -----------------

diff --git a/docs/visitors.rst b/docs/visitors.rst
index a0e1711..f263712 100644
--- a/docs/visitors.rst
+++ b/docs/visitors.rst
@@ -107,3 +107,8 @@ Discard
 -------

 .. autoclass:: lark.visitors.Discard
+
+VisitError
+----------
+
+.. autoclass:: lark.exceptions.VisitError
\ No newline at end of file
diff --git a/lark/ast_utils.py b/lark/ast_utils.py
index 0f2e498..b5463a2 100644
--- a/lark/ast_utils.py
+++ b/lark/ast_utils.py
@@ -36,8 +36,8 @@ def create_transformer(ast_module, transformer=None):
     Classes starting with an underscore (`_`) will be skipped.

     Parameters:
-        ast_module - A Python module containing all the subclasses of `ast_utils.Ast`
-        transformer (Optional[Transformer]) - An initial transformer. Its attributes may be overwritten.
+        ast_module: A Python module containing all the subclasses of ``ast_utils.Ast``
+        transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten.
     """
     t = transformer or Transformer()
diff --git a/lark/exceptions.py b/lark/exceptions.py
index 9d326b8..fdcd52b 100644
--- a/lark/exceptions.py
+++ b/lark/exceptions.py
@@ -36,8 +36,9 @@ class UnexpectedInput(LarkError):

     Used as a base class for the following exceptions:

-    - ``UnexpectedToken``: The parser received an unexpected token
     - ``UnexpectedCharacters``: The lexer encountered an unexpected string
+    - ``UnexpectedToken``: The parser received an unexpected token
+    - ``UnexpectedEOF``: The parser expected a token, but the input ended

     After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
     """
@@ -128,6 +129,9 @@ class UnexpectedInput(LarkError):


 class UnexpectedEOF(ParseError, UnexpectedInput):
+    """An exception that is raised by the parser when the input ends while it still expects a token.
+    """
+
     def __init__(self, expected, state=None, terminals_by_name=None):
         super(UnexpectedEOF, self).__init__()

@@ -148,6 +152,10 @@ class UnexpectedEOF(ParseError, UnexpectedInput):


 class UnexpectedCharacters(LexError, UnexpectedInput):
+    """An exception that is raised by the lexer when it cannot match the next
+    string of characters to any of its terminals.
+    """
+
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, terminals_by_name=None, considered_rules=None):
         super(UnexpectedCharacters, self).__init__()
@@ -185,10 +193,15 @@ class UnexpectedToken(ParseError, UnexpectedInput):
     """An exception that is raised by the parser, when the token it received
     doesn't match any valid step forward.

-    The parser provides an interactive instance through `interactive_parser`,
-    which is initialized to the point of failture, and can be used for debugging and error handling.
+    Parameters:
+        token: The mismatched token
+        expected: The set of expected tokens
+        considered_rules: Which rules were considered, to deduce the expected tokens
+        state: A value representing the parser state. Do not rely on its value or type.
+        interactive_parser: An instance of ``InteractiveParser``, which is initialized to the point of failure,
+            and can be used for debugging and error handling.

-    see: ``InteractiveParser``.
+    Note: These parameters are available as attributes of the instance.
     """

     def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None):
@@ -234,14 +247,20 @@ class VisitError(LarkError):
     """VisitError is raised when visitors are interrupted by an exception

     It provides the following attributes for inspection:
-    - obj: the tree node or token it was processing when the exception was raised
-    - orig_exc: the exception that cause it to fail
+
+    Parameters:
+        rule: the name of the visit rule that failed
+        obj: the tree node or token that was being processed
+        orig_exc: the exception that caused it to fail
+
+    Note: These parameters are available as attributes of the instance.
     """

     def __init__(self, rule, obj, orig_exc):
         message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
         super(VisitError, self).__init__(message)

+        self.rule = rule
         self.obj = obj
         self.orig_exc = orig_exc
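
To illustrate the new ``rule`` attribute stored on ``VisitError``, here is a minimal sketch (not part of the patch; the grammar and the ``ToInt`` transformer are hypothetical):

    from lark import Lark, Transformer
    from lark.exceptions import VisitError

    parser = Lark("""
        start: NUMBER
        %import common.NUMBER
    """, parser='lalr')

    class ToInt(Transformer):
        def start(self, children):
            raise ValueError("boom")   # any error raised in a callback gets wrapped

    try:
        ToInt().transform(parser.parse("42"))
    except VisitError as e:
        print(e.rule)       # 'start' -- the attribute this patch adds
        print(e.obj)        # the tree node that was being processed
        print(e.orig_exc)   # the original ValueError
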
""" def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): @@ -234,14 +247,20 @@ class VisitError(LarkError): """VisitError is raised when visitors are interrupted by an exception It provides the following attributes for inspection: - - obj: the tree node or token it was processing when the exception was raised - - orig_exc: the exception that cause it to fail + + Parameters: + rule: the name of the visit rule that failed + obj: the tree-node or token that was being processed + orig_exc: the exception that cause it to fail + + Note: These parameters are available as attributes """ def __init__(self, rule, obj, orig_exc): message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) + self.rule = rule self.obj = obj self.orig_exc = orig_exc diff --git a/lark/lark.py b/lark/lark.py index 9a4b2d5..45dec4d 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -102,7 +102,7 @@ class LarkOptions(Serialize): A List of either paths or loader functions to specify from where grammars are imported source_path Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading - **=== End Options ===** + **=== End of Options ===** """ if __doc__: __doc__ += OPTIONS_DOC @@ -527,6 +527,8 @@ class Lark(Serialize): """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. + + :raises UnexpectedCharacters: In case the lexer cannot find a suitable match. """ if not hasattr(self, 'lexer') or dont_ignore: lexer = self._build_lexer(dont_ignore) @@ -569,6 +571,10 @@ class Lark(Serialize): If a transformer is supplied to ``__init__``, returns whatever is the result of the transformation. Otherwise, returns a Tree instance. + :raises UnexpectedInput: On a parse error, one of these sub-exceptions will rise: + ``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``. + For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``. + """ return self.parser.parse(text, start=start, on_error=on_error) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index dbf4a1f..36f6e2c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -9,7 +9,7 @@ import pkgutil from ast import literal_eval from numbers import Integral -from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique +from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -174,28 +174,141 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } +REPEAT_BREAK_THRESHOLD = 50 +# The Threshold whether repeat via ~ are split up into different rules +# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, +# while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. 
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index dbf4a1f..36f6e2c 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -9,7 +9,7 @@ import pkgutil
 from ast import literal_eval
 from numbers import Integral

-from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique
+from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors
 from .lexer import Token, TerminalDef, PatternStr, PatternRE

 from .parse_tree_builder import ParseTreeBuilder
@@ -174,28 +174,141 @@ RULES = {
     'literal': ['REGEXP', 'STRING'],
 }

+REPEAT_BREAK_THRESHOLD = 50
+# The threshold above which repeats via ~ are split up into dedicated rules.
+# 50 is chosen since it keeps the number of states low and therefore LALR analysis time low,
+# while not being too aggressive and unnecessarily creating rules that might cause shift/reduce conflicts.
+# For a grammar of the form start: "A"~0..N, these are the timing stats:
+# N    t
+# 10   0.000
+# 20   0.004
+# 30   0.016
+# 40   0.049
+# 50   0.109
+# 60   0.215
+# 70   0.383
+# 80   0.631
+# (See PR #949)
+
 @inline_args
 class EBNF_to_BNF(Transformer_InPlace):
     def __init__(self):
         self.new_rules = []
-        self.rules_by_expr = {}
+        self.rules_cache = {}
         self.prefix = 'anon'
         self.i = 0
         self.rule_options = None

-    def _add_recurse_rule(self, type_, expr):
-        if expr in self.rules_by_expr:
-            return self.rules_by_expr[expr]
-
-        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
+    def _name_rule(self, inner):
+        new_name = '__%s_%s_%d' % (self.prefix, inner, self.i)
         self.i += 1
-        t = NonTerminal(new_name)
-        tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
-        self.new_rules.append((new_name, tree, self.rule_options))
-        self.rules_by_expr[expr] = t
+        return new_name
+
+    def _add_rule(self, key, name, expansions):
+        t = NonTerminal(name)
+        self.new_rules.append((name, expansions, self.rule_options))
+        self.rules_cache[key] = t
         return t

+    def _add_recurse_rule(self, type_, expr):
+        try:
+            return self.rules_cache[expr]
+        except KeyError:
+            new_name = self._name_rule(type_)
+            t = NonTerminal(new_name)
+            tree = ST('expansions', [
+                ST('expansion', [expr]),
+                ST('expansion', [t, expr])
+            ])
+            return self._add_rule(expr, new_name, tree)
+
+    def _add_repeat_rule(self, a, b, target, atom):
+        """
+        Given a target rule that matches atom n times,
+        this builds a new rule that matches atom (a*n + b) times.
+
+        The rule is of the form (example, with a=3, b=4):
+
+            new_rule: target target target atom atom atom atom
+
+        i.e. target repeated a times, followed by atom repeated b times.
+        """
+        key = (a, b, target, atom)
+        try:
+            return self.rules_cache[key]
+        except KeyError:
+            new_name = self._name_rule('repeat_a%d_b%d' % (a, b))
+            tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)])
+            return self._add_rule(key, new_name, tree)
+
+    def _add_repeat_opt_rule(self, a, b, target, target_opt, atom):
+        """
+        Given a target rule that matches atom exactly n times, and a target_opt rule
+        that matches atom 0 to n-1 times, this builds a new rule that matches atom
+        0 to (a*n + b)-1 times.
+        The created rule will not have any shift/reduce conflicts, so that it can be used with LALR.
+
+        The rule is of the form (example, with a=3, b=4):
+
+            new_rule: target_opt
+                    | target target_opt
+                    | target target target_opt
+
+                    | target target target
+                    | target target target atom
+                    | target target target atom atom
+                    | target target target atom atom atom
+
+        First we generate target * i followed by target_opt, for i from 0 to a-1.
+        These match atom 0 to n*a - 1 times.
+
+        Then we generate target * a followed by atom * i, for i from 0 to b-1.
+        These match atom n*a to n*a + b - 1 times.
+        """
+        key = (a, b, target, atom, "opt")
+        try:
+            return self.rules_cache[key]
+        except KeyError:
+            new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b))
+            tree = ST('expansions', [
+                ST('expansion', [target] * i + [target_opt])
+                for i in range(a)
+            ] + [
+                ST('expansion', [target] * a + [atom] * i)
+                for i in range(b)
+            ])
+            return self._add_rule(key, new_name, tree)
+
+    def _generate_repeats(self, rule, mn, mx):
+        """
+        We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn).
+        We then use small_factors to split mn and diff into values [(a, b), ...].
+        These values are used with the help of _add_repeat_rule and _add_repeat_opt_rule
+        to generate a complete rule/expression that matches the corresponding number of repeats.
+        """
+        mn_factors = small_factors(mn)
+        mn_target = rule
+        for a, b in mn_factors:
+            mn_target = self._add_repeat_rule(a, b, mn_target, rule)
+        if mx == mn:
+            return mn_target
+
+        diff = mx - mn + 1  # We add one because _add_repeat_opt_rule generates rules that match one less than the bound
+        diff_factors = small_factors(diff)
+        diff_target = rule  # Match rule exactly once
+        diff_opt_target = ST('expansion', [])  # Match rule zero times (i.e. up to 1 - 1 = 0 times)
+        for a, b in diff_factors[:-1]:
+            new_diff_target = self._add_repeat_rule(a, b, diff_target, rule)
+            diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
+            diff_target = new_diff_target
+
+        a, b = diff_factors[-1]  # We handle the last pair separately, since we don't need to call self._add_repeat_rule
+        diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
+
+        return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])])
+
     def expr(self, rule, op, *args):
         if op.value == '?':
             empty = ST('expansion', [])
@@ -220,7 +333,11 @@ class EBNF_to_BNF(Transformer_InPlace):
             mn, mx = map(int, args)
             if mx < mn or mn < 0:
                 raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
-            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
+            # For a small number of repeats, we don't need to build new rules.
+            if mx > REPEAT_BREAK_THRESHOLD:
+                return self._generate_repeats(rule, mn, mx)
+            else:
+                return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)])
         assert False, op

     def maybe(self, rule):
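
To see the new expansion strategy at work, compare rule counts below and above the threshold; a sketch mirroring the assertions in the new test (the exact helper-rule names are an internal detail):

    from lark import Lark

    small = Lark('start: "A"~10', parser='lalr')   # below REPEAT_BREAK_THRESHOLD
    large = Lark('start: "A"~60', parser='lalr')   # above it

    print(len(small.rules))   # 1: a single rule with the repeats inlined
    print(len(large.rules))   # >1: helper repeat rules were generated
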
diff --git a/lark/utils.py b/lark/utils.py
index ea78801..610d160 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -187,7 +187,7 @@ def get_regexp_width(expr):
             return 1, sre_constants.MAXREPEAT
         else:
             return 0, sre_constants.MAXREPEAT
-
+
 ###}
@@ -288,7 +288,7 @@ except ImportError:

 class FS:
     exists = os.path.exists
-
+
     @staticmethod
     def open(name, mode="r", **kwargs):
         if atomicwrites and "w" in mode:
@@ -359,3 +359,32 @@ def _serialize(value, memo):
         return {key:_serialize(elem, memo) for key, elem in value.items()}
     # assert value is None or isinstance(value, (int, float, str, tuple)), value
     return value
+
+
+# Value 5 keeps the number of states in the LALR parser somewhat minimal.
+# It isn't optimal, but close to it. See PR #949
+SMALL_FACTOR_THRESHOLD = 5
+
+
+def small_factors(n):
+    """
+    Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD.
+    Returns a list of [(a, b), ...]
+    so that the following code returns n:
+
+        n = 1
+        for a, b in values:
+            n = n * a + b
+
+    Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change.
+    """
+    assert n >= 0
+    if n <= SMALL_FACTOR_THRESHOLD:
+        return [(n, 0)]
+    # While this does not provide an optimal solution, it produces a pretty good one.
+    # See above comment and PR #949
+    for a in range(SMALL_FACTOR_THRESHOLD, 1, -1):
+        r, b = divmod(n, a)
+        if a + b <= SMALL_FACTOR_THRESHOLD:
+            return small_factors(r) + [(a, b)]
+    assert False, "Failed to factorize %s" % n
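
A quick sketch that checks the documented invariant, using the Mersenne prime from the new test below:

    from lark.utils import small_factors

    n = 8191
    factors = small_factors(n)

    # Rebuild n from the factor chain, exactly as the docstring describes
    acc = 1
    for a, b in factors:
        acc = acc * a + b
    assert acc == n

    # Every pair stays small, which keeps the generated helper rules small too
    assert all(a + b <= 5 for a, b in factors)
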
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 8fec82d..b55f848 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -2226,6 +2226,35 @@ def _make_parser_test(LEXER, PARSER):
         self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
         self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

+    @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated")
+    def test_ranged_repeat_large(self):
+        # "Large" is currently arbitrarily chosen to mean larger than 20
+        g = u"""!start: "A"~60
+            """
+        l = _Lark(g)
+        self.assertGreater(len(l.rules), 1, "Expected more than one rule to be generated")
+        self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60))
+        self.assertRaises(ParseError, l.parse, u'A' * 59)
+        self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61)
+
+        g = u"""!start: "A"~15..100
+            """
+        l = _Lark(g)
+        for i in range(0, 110):
+            if 15 <= i <= 100:
+                self.assertEqual(l.parse(u'A' * i), Tree('start', ['A'] * i))
+            else:
+                self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i)
+
+        # 8191 is a Mersenne prime
+        g = u"""start: "A"~8191
+            """
+        l = _Lark(g)
+        self.assertEqual(l.parse(u'A' * 8191), Tree('start', []))
+        self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190)
+        self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192)
+
+
     @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now")  # TODO XXX
     def test_priority_vs_embedded(self):
         g = """
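
Finally, a user-level sketch of the behavior these tests pin down (illustrative grammar):

    from lark import Lark
    from lark.exceptions import UnexpectedInput

    parser = Lark('start: "A"~15..100', parser='lalr')

    print(parser.parse("A" * 50))   # within range: returns a Tree

    try:
        parser.parse("A" * 7)       # too short: rejected at the end of input
    except UnexpectedInput as e:
        print("rejected:", type(e).__name__)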