
Merge branch 'better_repeats' of https://github.com/MegaIng/lark into MegaIng-better_repeats

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.12.0
Erez Sh, 3 years ago
commit 953f2534fe
8 changed files with 230 additions and 23 deletions
  1. docs/classes.rst  +2 -0
  2. docs/visitors.rst  +5 -0
  3. lark/ast_utils.py  +2 -2
  4. lark/exceptions.py  +25 -6
  5. lark/lark.py  +7 -1
  6. lark/load_grammar.py  +129 -12
  7. lark/utils.py  +31 -2
  8. tests/test_parser.py  +29 -0

docs/classes.rst  +2 -0

@@ -66,6 +66,8 @@ UnexpectedInput

.. autoclass:: lark.exceptions.UnexpectedCharacters

.. autoclass:: lark.exceptions.UnexpectedEOF

InteractiveParser
-----------------



docs/visitors.rst  +5 -0

@@ -107,3 +107,8 @@ Discard
-------

.. autoclass:: lark.visitors.Discard

VisitError
----------

.. autoclass:: lark.exceptions.VisitError

lark/ast_utils.py  +2 -2

@@ -36,8 +36,8 @@ def create_transformer(ast_module, transformer=None):
Classes starting with an underscore (`_`) will be skipped.

Parameters:
ast_module - A Python module containing all the subclasses of `ast_utils.Ast`
transformer (Optional[Transformer]) - An initial transformer. Its attributes may be overwritten.
ast_module: A Python module containing all the subclasses of ``ast_utils.Ast``
transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten.
"""
t = transformer or Transformer()
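
As a quick illustration of the API documented above, here is a minimal sketch of how create_transformer is typically wired up (the grammar rule name and the CodeBlock class are hypothetical, not part of this diff):

    import sys
    from lark import ast_utils, Transformer

    class _Ast(ast_utils.Ast):
        # Starts with an underscore, so create_transformer skips it.
        pass

    class CodeBlock(_Ast):
        # Matched to a (hypothetical) grammar rule named `code_block`.
        def __init__(self, *statements):
            self.statements = statements

    # Collects every Ast subclass in this module into a Transformer,
    # overwriting attributes of the one passed in.
    to_ast = ast_utils.create_transformer(sys.modules[__name__], Transformer())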



lark/exceptions.py  +25 -6

@@ -36,8 +36,9 @@ class UnexpectedInput(LarkError):

Used as a base class for the following exceptions:

- ``UnexpectedToken``: The parser received an unexpected token
- ``UnexpectedCharacters``: The lexer encountered an unexpected string
- ``UnexpectedToken``: The parser received an unexpected token
- ``UnexpectedEOF``: The parser expected a token, but the input ended

After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
"""
@@ -128,6 +129,9 @@ class UnexpectedInput(LarkError):


class UnexpectedEOF(ParseError, UnexpectedInput):
"""An exception that is raised by the parser, when the input ends while it still expects a token.
"""

def __init__(self, expected, state=None, terminals_by_name=None):
super(UnexpectedEOF, self).__init__()

@@ -148,6 +152,10 @@ class UnexpectedEOF(ParseError, UnexpectedInput):


class UnexpectedCharacters(LexError, UnexpectedInput):
"""An exception that is raised by the lexer, when it cannot match the next
string of characters to any of its terminals.
"""

def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
terminals_by_name=None, considered_rules=None):
super(UnexpectedCharacters, self).__init__()
@@ -185,10 +193,15 @@ class UnexpectedToken(ParseError, UnexpectedInput):
"""An exception that is raised by the parser, when the token it received
doesn't match any valid step forward.

The parser provides an interactive instance through `interactive_parser`,
which is initialized to the point of failure, and can be used for debugging and error handling.
Parameters:
token: The mismatched token
expected: The set of expected tokens
considered_rules: Which rules were considered, to deduce the expected tokens
state: A value representing the parser state. Do not rely on its value or type.
interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failure,
and can be used for debugging and error handling.

see: ``InteractiveParser``.
Note: These parameters are available as attributes of the instance.
"""

def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None):
@@ -234,14 +247,20 @@ class VisitError(LarkError):
"""VisitError is raised when visitors are interrupted by an exception

It provides the following attributes for inspection:
- obj: the tree node or token it was processing when the exception was raised
- orig_exc: the exception that caused it to fail

Parameters:
rule: the name of the visit rule that failed
obj: the tree-node or token that was being processed
orig_exc: the exception that caused it to fail

Note: These parameters are available as attributes
"""

def __init__(self, rule, obj, orig_exc):
message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
super(VisitError, self).__init__(message)

self.rule = rule
self.obj = obj
self.orig_exc = orig_exc
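
Taken together, these docstrings describe the error-handling surface. A minimal sketch of catching the documented exceptions (the grammar and input are illustrative):

    from lark import Lark
    from lark.exceptions import UnexpectedToken, UnexpectedCharacters, VisitError

    parser = Lark('start: "a" "b"', parser='lalr')

    try:
        parser.parse('aa')
    except UnexpectedToken as e:
        # The attributes documented above:
        print(e.token, e.expected)   # mismatched token, set of expected tokens
    except UnexpectedCharacters as e:
        print(e.line, e.column)      # where the lexer got stuck

    # VisitError wraps exceptions raised inside visitors/transformers;
    # e.rule, e.obj and e.orig_exc carry the documented context.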



lark/lark.py  +7 -1

@@ -102,7 +102,7 @@ class LarkOptions(Serialize):
A List of either paths or loader functions to specify from where grammars are imported
source_path
Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading
**=== End Options ===**
**=== End of Options ===**
"""
if __doc__:
__doc__ += OPTIONS_DOC
@@ -527,6 +527,8 @@ class Lark(Serialize):
"""Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'

When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.

:raises UnexpectedCharacters: In case the lexer cannot find a suitable match.
"""
if not hasattr(self, 'lexer') or dont_ignore:
lexer = self._build_lexer(dont_ignore)
@@ -569,6 +571,10 @@ class Lark(Serialize):
If a transformer is supplied to ``__init__``, returns whatever is the
result of the transformation. Otherwise, returns a Tree instance.

:raises UnexpectedInput: On a parse error, one of these sub-exceptions will be raised:
``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``.
For convenience, these sub-exceptions also inherit from ``ParseError`` and ``LexError``.

"""
return self.parser.parse(text, start=start, on_error=on_error)
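
A short, hedged sketch of the documented contract (the grammar and inputs are illustrative): parse returns a Tree on success, and the escaping exception can be caught either concretely or via the convenience base classes:

    from lark import Lark
    from lark.exceptions import UnexpectedInput, ParseError, LexError

    parser = Lark('start: "a" "b"', parser='lalr')

    for text in ('ab', 'aa', 'ax'):
        try:
            print(parser.parse(text))
        except UnexpectedInput as e:
            # Also catchable as ParseError (UnexpectedToken/UnexpectedEOF)
            # or LexError (UnexpectedCharacters), as the docstring notes.
            print(type(e).__name__)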



lark/load_grammar.py  +129 -12

@@ -9,7 +9,7 @@ import pkgutil
from ast import literal_eval
from numbers import Integral

from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique
from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
@@ -174,28 +174,141 @@ RULES = {
'literal': ['REGEXP', 'STRING'],
}

REPEAT_BREAK_THRESHOLD = 50
# The threshold above which repeats via ~ are split up into separate rules.
# 50 is chosen since it keeps the number of states low, and therefore lalr analysis time low,
# while not being too overaggressive and unnecessarily creating rules that might create shift/reduce conflicts.
# For a grammar of the form start: "A"~0..N, these are the timing stats:
# N t
# 10 0.000
# 20 0.004
# 30 0.016
# 40 0.049
# 50 0.109
# 60 0.215
# 70 0.383
# 80 0.631
# (See PR #949)
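# Illustrative, not part of this diff: the table above was presumably measured
# with the naive expansion (what the code always did before this PR), and could
# be reproduced with a sketch along these lines:
#
#     import time
#     from lark import Lark
#     for n in range(10, 90, 10):
#         t0 = time.time()
#         Lark('start: "A"~0..%d' % n, parser='lalr')
#         print(n, '%.3f' % (time.time() - t0))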


@inline_args
class EBNF_to_BNF(Transformer_InPlace):
def __init__(self):
self.new_rules = []
self.rules_by_expr = {}
self.rules_cache = {}
self.prefix = 'anon'
self.i = 0
self.rule_options = None

def _add_recurse_rule(self, type_, expr):
if expr in self.rules_by_expr:
return self.rules_by_expr[expr]

new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
def _name_rule(self, inner):
new_name = '__%s_%s_%d' % (self.prefix, inner, self.i)
self.i += 1
t = NonTerminal(new_name)
tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
self.new_rules.append((new_name, tree, self.rule_options))
self.rules_by_expr[expr] = t
return new_name

def _add_rule(self, key, name, expansions):
t = NonTerminal(name)
self.new_rules.append((name, expansions, self.rule_options))
self.rules_cache[key] = t
return t

def _add_recurse_rule(self, type_, expr):
try:
return self.rules_cache[expr]
except KeyError:
new_name = self._name_rule(type_)
t = NonTerminal(new_name)
tree = ST('expansions', [
ST('expansion', [expr]),
ST('expansion', [t, expr])
])
return self._add_rule(expr, new_name, tree)

def _add_repeat_rule(self, a, b, target, atom):
"""
When target matches n times atom
This builds a rule that matches atom (a*n + b) times

The rule is of the form:

The rules are of the form: (Example a = 3, b = 4)

new_rule: target target target atom atom atom atom

e.g. we use target * a and atom * b
"""
key = (a, b, target, atom)
try:
return self.rules_cache[key]
except KeyError:
new_name = self._name_rule('repeat_a%d_b%d' % (a, b))
tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)])
return self._add_rule(key, new_name, tree)

def _add_repeat_opt_rule(self, a, b, target, target_opt, atom):
"""
When target matches n times atom, and target_opt matches 0 to n-1 times atom,
this builds a rule that matches atom 0 to (a*n+b)-1 times.
The created rule will not have any shift/reduce conflicts so that it can be used with lalr

The rules are of the form: (Example a = 3, b = 4)

new_rule: target_opt
| target target_opt
| target target target_opt

| target target target
| target target target atom
| target target target atom atom
| target target target atom atom atom

First we generate target * i followed by target_opt for i from 0 to a-1
These match 0 to n*a - 1 times atom

Then we generate target * a followed by atom * i for i from 0 to b-1
These match n*a to n*a + b-1 times atom
"""
key = (a, b, target, atom, "opt")
try:
return self.rules_cache[key]
except KeyError:
new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b))
tree = ST('expansions', [
ST('expansion', [target] * i + [target_opt])
for i in range(a)
] + [
ST('expansion', [target] * a + [atom] * i)
for i in range(b)
])
return self._add_rule(key, new_name, tree)

def _generate_repeats(self, rule, mn, mx):
"""
We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn).
We then use small_factors to split mn and diff up into values [(a, b), ...]
These values are used with the help of _add_repeat_rule and _add_repeat_opt_rule
to generate a complete rule/expression that matches the corresponding number of repeats.
"""
mn_factors = small_factors(mn)
mn_target = rule
for a, b in mn_factors:
mn_target = self._add_repeat_rule(a, b, mn_target, rule)
if mx == mn:
return mn_target

diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less
diff_factors = small_factors(diff)
diff_target = rule # Match rule 1 times
diff_opt_target = ST('expansion', []) # match rule 0 times (i.e. up to 1-1 times)
for a, b in diff_factors[:-1]:
new_diff_target = self._add_repeat_rule(a, b, diff_target, rule)
diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
diff_target = new_diff_target

a, b = diff_factors[-1] # We do the last one separately, since we don't need to call self._add_repeat_rule
diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)

return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])])

def expr(self, rule, op, *args):
if op.value == '?':
empty = ST('expansion', [])
@@ -220,7 +333,11 @@ class EBNF_to_BNF(Transformer_InPlace):
mn, mx = map(int, args)
if mx < mn or mn < 0:
raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
# For a small number of repeats, we don't need to build new rules.
if mx > REPEAT_BREAK_THRESHOLD:
return self._generate_repeats(rule, mn, mx)
else:
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)])
assert False, op

def maybe(self, rule):
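
To make the splitting concrete: for the test grammar "A"~60 further below, small_factors(60) (added in lark/utils.py in this diff) yields [(3, 0), (4, 0), (5, 0)], and since mn == mx, _generate_repeats returns just the rule~mn chain. A hedged sketch of the generated rules (the names here are illustrative; the real ones come from _name_rule, e.g. __start_repeat_a3_b0_0):

    r3: "A" "A" "A"             // _add_repeat_rule(3, 0, "A", "A") -> 3 "A"s
    r12: r3 r3 r3 r3            // _add_repeat_rule(4, 0, r3, "A")  -> 12 "A"s
    r60: r12 r12 r12 r12 r12    // _add_repeat_rule(5, 0, r12, "A") -> 60 "A"s
    start: r60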


lark/utils.py  +31 -2

@@ -187,7 +187,7 @@ def get_regexp_width(expr):
return 1, sre_constants.MAXREPEAT
else:
return 0, sre_constants.MAXREPEAT
###}


@@ -288,7 +288,7 @@ except ImportError:

class FS:
exists = os.path.exists
@staticmethod
def open(name, mode="r", **kwargs):
if atomicwrites and "w" in mode:
@@ -359,3 +359,32 @@ def _serialize(value, memo):
return {key:_serialize(elem, memo) for key, elem in value.items()}
# assert value is None or isinstance(value, (int, float, str, tuple)), value
return value


# Value 5 keeps the number of states in the lalr parser somewhat minimal
# It isn't optimal, but close to it. See PR #949
SMALL_FACTOR_THRESHOLD = 5


def small_factors(n):
"""
Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD.
Returns a list of [(a, b), ...]
so that the following code returns n:

n = 1
for a, b in values:
n = n * a + b

Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change
"""
assert n >= 0
if n <= SMALL_FACTOR_THRESHOLD:
return [(n, 0)]
# While this does not provide an optimal solution, it produces a pretty good one.
# See above comment and PR #949
for a in range(SMALL_FACTOR_THRESHOLD, 1, -1):
r, b = divmod(n, a)
if a + b <= SMALL_FACTOR_THRESHOLD:
return small_factors(r) + [(a, b)]
assert False, "Failed to factorize %s" % n
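
A minimal, hedged check of the invariant described in the docstring (the reconstruct helper is illustrative, not part of lark):

    from lark.utils import small_factors

    def reconstruct(factors):
        # Replays the loop from the docstring: n = 1; then n = n * a + b per pair.
        n = 1
        for a, b in factors:
            n = n * a + b
        return n

    for n in (0, 1, 5, 6, 50, 100, 8191):
        assert reconstruct(small_factors(n)) == n
    print(small_factors(100))  # e.g. [(4, 0), (5, 0), (5, 0)]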

tests/test_parser.py  +29 -0

@@ -2226,6 +2226,35 @@ def _make_parser_test(LEXER, PARSER):
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

@unittest.skipIf(PARSER == 'cyk', "For a large number of repeats, empty rules might be generated")
def test_ranged_repeat_large(self):
# "Large" is currently arbitrarily chosen to be larger than 20
g = u"""!start: "A"~60
"""
l = _Lark(g)
self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated")
self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60))
self.assertRaises(ParseError, l.parse, u'A' * 59)
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61)

g = u"""!start: "A"~15..100
"""
l = _Lark(g)
for i in range(0, 110):
if 15 <= i <= 100:
self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i))
else:
self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i)

# 8191 is a Mersenne prime
g = u"""start: "A"~8191
"""
l = _Lark(g)
self.assertEqual(l.parse(u'A' * 8191), Tree('start', []))
self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190)
self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192)


@unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
def test_priority_vs_embedded(self):
g = """

