Quellcode durchsuchen

Merge branch 'better_repeats' of https://github.com/MegaIng/lark into MegaIng-better_repeats

Erez Sh vor 3 Jahren
8 geänderte Dateien mit 230 neuen und 23 gelöschten Zeilen
  1. +2
  2. +5
  3. +2
  4. +25
  5. +7
  6. +129
  7. +31
  8. +29

+ 2
- 0
docs/classes.rst Datei anzeigen

@@ -66,6 +66,8 @@ UnexpectedInput

.. autoclass:: lark.exceptions.UnexpectedCharacters

.. autoclass:: lark.exceptions.UnexpectedEOF


+ 5
- 0
docs/visitors.rst Datei anzeigen

@@ -107,3 +107,8 @@ Discard

.. autoclass:: lark.visitors.Discard


.. autoclass:: lark.exceptions.VisitError

+ 2
- 2
lark/ast_utils.py Datei anzeigen

@@ -36,8 +36,8 @@ def create_transformer(ast_module, transformer=None):
Classes starting with an underscore (`_`) will be skipped.

ast_module - A Python module containing all the subclasses of `ast_utils.Ast`
transformer (Optional[Transformer]) - An initial transformer. Its attributes may be overwritten.
ast_module: A Python module containing all the subclasses of ``ast_utils.Ast``
transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten.
t = transformer or Transformer()

+ 25
- 6
lark/exceptions.py Datei anzeigen

@@ -36,8 +36,9 @@ class UnexpectedInput(LarkError):

Used as a base class for the following exceptions:

- ``UnexpectedToken``: The parser received an unexpected token
- ``UnexpectedCharacters``: The lexer encountered an unexpected string
- ``UnexpectedToken``: The parser received an unexpected token
- ``UnexpectedEOF``: The parser expected a token, but the input ended

After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
@@ -128,6 +129,9 @@ class UnexpectedInput(LarkError):

class UnexpectedEOF(ParseError, UnexpectedInput):
"""An exception that is raised by the parser, when the input ends while it still expects a token.

def __init__(self, expected, state=None, terminals_by_name=None):
super(UnexpectedEOF, self).__init__()

@@ -148,6 +152,10 @@ class UnexpectedEOF(ParseError, UnexpectedInput):

class UnexpectedCharacters(LexError, UnexpectedInput):
"""An exception that is raised by the lexer, when it cannot match the next
string of characters to any of its terminals.

def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
terminals_by_name=None, considered_rules=None):
super(UnexpectedCharacters, self).__init__()
@@ -185,10 +193,15 @@ class UnexpectedToken(ParseError, UnexpectedInput):
"""An exception that is raised by the parser, when the token it received
doesn't match any valid step forward.

The parser provides an interactive instance through `interactive_parser`,
which is initialized to the point of failure, and can be used for debugging and error handling.
token: The mismatched token
expected: The set of expected tokens
considered_rules: Which rules were considered, to deduce the expected tokens
state: A value representing the parser state. Do not rely on its value or type.
interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failure,
and can be used for debugging and error handling.

see: ``InteractiveParser``.
Note: These parameters are available as attributes of the instance.

def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None):
@@ -234,14 +247,20 @@ class VisitError(LarkError):
"""VisitError is raised when visitors are interrupted by an exception

It provides the following attributes for inspection:
- obj: the tree node or token it was processing when the exception was raised
- orig_exc: the exception that caused it to fail

rule: the name of the visit rule that failed
obj: the tree-node or token that was being processed
orig_exc: the exception that caused it to fail

Note: These parameters are available as attributes

def __init__(self, rule, obj, orig_exc):
message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
super(VisitError, self).__init__(message)

self.rule = rule
self.obj = obj
self.orig_exc = orig_exc

+ 7
- 1
lark/lark.py Datei anzeigen

@@ -102,7 +102,7 @@ class LarkOptions(Serialize):
A List of either paths or loader functions to specify from where grammars are imported
Override the source from which the grammar was loaded. Useful for relative imports and unconventional grammar loading
**=== End Options ===**
**=== End of Options ===**
if __doc__:
__doc__ += OPTIONS_DOC
@@ -527,6 +527,8 @@ class Lark(Serialize):
"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'

When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.

:raises UnexpectedCharacters: In case the lexer cannot find a suitable match.
if not hasattr(self, 'lexer') or dont_ignore:
lexer = self._build_lexer(dont_ignore)
@@ -569,6 +571,10 @@ class Lark(Serialize):
If a transformer is supplied to ``__init__``, returns whatever is the
result of the transformation. Otherwise, returns a Tree instance.

:raises UnexpectedInput: On a parse error, one of these sub-exceptions will be raised:
``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``.
For convenience, these sub-exceptions also inherit from ``ParseError`` and ``LexError``.

return self.parser.parse(text, start=start, on_error=on_error)

+ 129
- 12
lark/load_grammar.py Datei anzeigen

@@ -9,7 +9,7 @@ import pkgutil
from ast import literal_eval
from numbers import Integral

from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique
from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
@@ -174,28 +174,141 @@ RULES = {
'literal': ['REGEXP', 'STRING'],

# The threshold that decides whether repeats via ~ are split up into different rules.
# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low,
# while not being too aggressive and unnecessarily creating rules that might create shift/reduce conflicts.
# For a grammar of the form start: "A"~0..N, these are the timing stats:
# N t
# 10 0.000
# 20 0.004
# 30 0.016
# 40 0.049
# 50 0.109
# 60 0.215
# 70 0.383
# 80 0.631
# (See PR #949)

class EBNF_to_BNF(Transformer_InPlace):
def __init__(self):
self.new_rules = []
self.rules_by_expr = {}
self.rules_cache = {}
self.prefix = 'anon'
self.i = 0
self.rule_options = None

def _add_recurse_rule(self, type_, expr):
if expr in self.rules_by_expr:
return self.rules_by_expr[expr]

new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
def _name_rule(self, inner):
new_name = '__%s_%s_%d' % (self.prefix, inner, self.i)
self.i += 1
t = NonTerminal(new_name)
tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
self.new_rules.append((new_name, tree, self.rule_options))
self.rules_by_expr[expr] = t
return new_name

def _add_rule(self, key, name, expansions):
t = NonTerminal(name)
self.new_rules.append((name, expansions, self.rule_options))
self.rules_cache[key] = t
return t

def _add_recurse_rule(self, type_, expr):
return self.rules_cache[expr]
except KeyError:
new_name = self._name_rule(type_)
t = NonTerminal(new_name)
tree = ST('expansions', [
ST('expansion', [expr]),
ST('expansion', [t, expr])
return self._add_rule(expr, new_name, tree)

def _add_repeat_rule(self, a, b, target, atom):
When target matches n times atom
This builds a rule that matches atom (a*n + b) times

The rule is of the form:

The rules are of the form: (Example a = 3, b = 4)

new_rule: target target target atom atom atom atom

i.e. we use target * a and atom * b
key = (a, b, target, atom)
return self.rules_cache[key]
except KeyError:
new_name = self._name_rule('repeat_a%d_b%d' % (a, b))
tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)])
return self._add_rule(key, new_name, tree)

def _add_repeat_opt_rule(self, a, b, target, target_opt, atom):
When target matches n times atom, and target_opt matches 0 to n-1 times atom,
This builds a rule that matches atom 0 to (a*n+b)-1 times.
The created rule will not have any shift/reduce conflicts so that it can be used with lalr

The rules are of the form: (Example a = 3, b = 4)

new_rule: target_opt
| target target_opt
| target target target_opt

| target target target
| target target target atom
| target target target atom atom
| target target target atom atom atom

First we generate target * i followed by target_opt for i from 0 to a-1
These match 0 to n*a - 1 times atom

Then we generate target * a followed by atom * i for i from 0 to b-1
These match n*a to n*a + b-1 times atom
key = (a, b, target, atom, "opt")
return self.rules_cache[key]
except KeyError:
new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b))
tree = ST('expansions', [
ST('expansion', [target] * i + [target_opt])
for i in range(a)
] + [
ST('expansion', [target] * a + [atom] * i)
for i in range(b)
return self._add_rule(key, new_name, tree)

def _generate_repeats(self, rule, mn, mx):
We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn).
We then use small_factors to split mn and diff up into values [(a, b), ...]
These values are used with the help of _add_repeat_rule and _add_repeat_opt_rule
to generate a complete rule/expression that matches the corresponding number of repeats
mn_factors = small_factors(mn)
mn_target = rule
for a, b in mn_factors:
mn_target = self._add_repeat_rule(a, b, mn_target, rule)
if mx == mn:
return mn_target

diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less
diff_factors = small_factors(diff)
diff_target = rule # Match rule 1 times
diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times)
for a, b in diff_factors[:-1]:
new_diff_target = self._add_repeat_rule(a, b, diff_target, rule)
diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
diff_target = new_diff_target

a, b = diff_factors[-1] # We do the last on separately since we don't need to call self._add_repeat_rule
diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)

return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])])

def expr(self, rule, op, *args):
if op.value == '?':
empty = ST('expansion', [])
@@ -220,7 +333,11 @@ class EBNF_to_BNF(Transformer_InPlace):
mn, mx = map(int, args)
if mx < mn or mn < 0:
raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
# For small number of repeats, we don't need to build new rules.
return self._generate_repeats(rule, mn, mx)
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)])
assert False, op

def maybe(self, rule):

+ 31
- 2
lark/utils.py Datei anzeigen

@@ -187,7 +187,7 @@ def get_regexp_width(expr):
return 1, sre_constants.MAXREPEAT
return 0, sre_constants.MAXREPEAT

@@ -288,7 +288,7 @@ except ImportError:

class FS:
exists = os.path.exists
def open(name, mode="r", **kwargs):
if atomicwrites and "w" in mode:
@@ -359,3 +359,32 @@ def _serialize(value, memo):
return {key:_serialize(elem, memo) for key, elem in value.items()}
# assert value is None or isinstance(value, (int, float, str, tuple)), value
return value

# Value 5 keeps the number of states in the lalr parser somewhat minimal
# It isn't optimal, but close to it. See PR #949

def small_factors(n):
Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD.
Returns a list of [(a, b), ...]
so that the following code returns n:

n = 1
for a, b in values:
n = n * a + b

Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change
assert n >= 0
return [(n, 0)]
# While this does not provide an optimal solution, it produces a pretty good one.
# See above comment and PR #949
for a in range(SMALL_FACTOR_THRESHOLD, 1, -1):
r, b = divmod(n, a)
return small_factors(r) + [(a, b)]
assert False, "Failed to factorize %s" % n

+ 29
- 0
tests/test_parser.py Datei anzeigen

@@ -2226,6 +2226,35 @@ def _make_parser_test(LEXER, PARSER):
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

@unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated")
def test_ranged_repeat_large(self):
# Large is currently arbitrarily chosen to be larger than 20
g = u"""!start: "A"~60
l = _Lark(g)
self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated")
self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60))
self.assertRaises(ParseError, l.parse, u'A' * 59)
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61)

g = u"""!start: "A"~15..100
l = _Lark(g)
for i in range(0, 110):
if 15 <= i <= 100:
self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i))
self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i)

# 8191 is a Mersenne prime
g = u"""start: "A"~8191
l = _Lark(g)
self.assertEqual(l.parse(u'A' * 8191), Tree('start', []))
self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190)
self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192)

@unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
def test_priority_vs_embedded(self):
g = """
