Reworked grammar to simplify later processing + expand_kids_by_data + tests

3 jaren geleden · 3a4568df24
--- a/lark-stubs/tree.pyi
+++ b/lark-stubs/tree.pyi
@@ -40,6 +40,9 @@ class Tree:
    def expand_kids_by_index(self, *indices: int) -> None:
        # Typing stub: takes any number of integer child indices and returns nothing.
        ...

    def expand_kids_by_data(self, *data_values: str) -> bool:
        # Typing stub: takes any number of string data values and returns a bool.
        ...

    def scan_values(self, pred: Callable[[Union[str, Tree]], bool]) -> Iterator[str]:
        # Typing stub: takes a predicate over str-or-Tree values and returns an iterator of str.
        ...

--- a/lark/exceptions.py
+++ b/lark/exceptions.py
@@ -210,7 +210,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
        # TODO considered_rules and expected can be figured out using state
        self.line = getattr(token, 'line', '?')
        self.column = getattr(token, 'column', '?')
        self.pos_in_stream = getattr(token, 'pos_in_stream', None)
        self.pos_in_stream = getattr(token, 'start_pos', None)
        self.state = state

        self.token = token
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -150,7 +150,7 @@ class Token(Str):

    @property
    def pos_in_stream(self):
        warn("Attribute Token.pos_in_stream was renamed to Token.start_pos", DeprecationWarning)
        warn("Attribute Token.pos_in_stream was renamed to Token.start_pos", DeprecationWarning, 2)
        return self.start_pos

    def update(self, type_=None, value=None):
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -91,6 +91,7 @@ TERMINALS = {
    'STRING': r'"(\\"|\\\\|[^"\n])*?"i?',
    'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS,
    '_NL': r'(\r?\n)+\s*',
    '_NL_OR': r'(\r?\n)+\s*\|',
    'WS': r'[ \t]+',
    'COMMENT': r'\s*//[^\n]*',
    '_TO': '->',
@@ -113,9 +114,10 @@ RULES = {
                        ''],
    '_template_params': ['RULE',
                         '_template_params _COMMA RULE'],
    'expansions': ['alias',
                   'expansions _OR alias',
                   'expansions _NL _OR alias'],
    'expansions': ['_expansions'],
    '_expansions': ['alias',
                    '_expansions _OR alias',
                    '_expansions _NL_OR alias'],

    '?alias':     ['expansion _TO RULE', 'expansion'],
    'expansion': ['_expansion'],
@@ -357,11 +359,8 @@ class SimplifyRule_Visitor(Visitor):
    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, Tree) and child.data == tree.data]
            if not to_expand:
            if not tree.expand_kids_by_data(tree.data):
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
@@ -599,21 +598,6 @@ def _make_joined_pattern(regexp, flags_set):

    return PatternRE(regexp, flags)


 class FlattenExpansions(Transformer_InPlace):
    """Splice nested 'expansions' subtrees into their parent 'expansions' node."""

    @v_args(tree=True)
    def expansions(self, tree):
        # Build the flattened child list in one pass. Only direct children are
        # inspected: the children of a spliced subtree are copied as-is and are
        # not examined again.
        flattened = []
        for child in tree.children:
            if isinstance(child, Tree) and child.data == 'expansions':
                flattened.extend(child.children)
            else:
                flattened.append(child)

        # Slice-assign so the existing children list object is mutated in place.
        tree.children[:] = flattened
        return tree


 class TerminalTreeToPattern(Transformer_NonRecursive):
    def pattern(self, ps):
        p ,= ps
@@ -684,8 +668,8 @@ class Grammar:
    def compile(self, start, terminals_to_keep):
        # We change the trees in-place (to support huge grammars)
        # So deepcopy allows calling compile more than once.
        term_defs = [(n,(nr_deepcopy_tree(t), p)) for n,(t,p) in self.term_defs]
        rule_defs = [(n,p,nr_deepcopy_tree(t),o) for n,p,t,o in self.rule_defs]
        term_defs = [(n, (nr_deepcopy_tree(t), p)) for n, (t, p) in self.term_defs]
        rule_defs = [(n, p, nr_deepcopy_tree(t), o) for n, p, t, o in self.rule_defs]

        # ===================
        #  Compile Terminals
@@ -700,7 +684,7 @@ class Grammar:
            if len(expansions) == 1 and not expansions[0].children:
                raise GrammarError("Terminals cannot be empty (%s)" % name)

        transformer = PrepareLiterals() * FlattenExpansions() * TerminalTreeToPattern()
        transformer = PrepareLiterals() * TerminalTreeToPattern()
        terminals = [TerminalDef(name, transformer.transform(term_tree), priority)
                     for name, (term_tree, priority) in term_defs if term_tree]

@@ -933,7 +917,7 @@ def _get_parser():
        parser_conf = ParserConf(rules, callback, ['start'])
        lexer_conf.lexer_type = 'standard'
        parser_conf.parser_type = 'lalr'
        _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, {})
        _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None)
        return _get_parser.cache

 GRAMMAR_ERRORS = [
@@ -1110,9 +1094,7 @@ class GrammarBuilder:
        # TODO: think about what to do with 'options'
        base = self._definitions[name][1]

        while len(base.children) == 2:
            assert isinstance(base.children[0], Tree) and base.children[0].data == 'expansions', base
            base = base.children[0]
        assert isinstance(base, Tree) and base.data == 'expansions'
        base.children.insert(0, exp)

    def _ignore(self, exp_or_name):
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -204,8 +204,7 @@ class AmbiguousExpander:
                if i in self.to_expand:
                    ambiguous.append(i)

                to_expand = [j for j, grandchild in enumerate(child.children) if _is_ambig_tree(grandchild)]
                child.expand_kids_by_index(*to_expand)
                child.expand_kids_by_data('_ambig')

        if not ambiguous:
            return self.node_builder(children)
--- a/lark/tree.py
+++ b/lark/tree.py
@@ -107,6 +107,15 @@ class Tree(object):
            kid = self.children[i]
            self.children[i:i+1] = kid.children

    def expand_kids_by_data(self, *data_values):
        """Inline every direct child subtree whose ``data`` is one of ``data_values``.

        Returns True if at least one child was expanded, False otherwise.
        """
        matches = []
        for position, kid in enumerate(self.children):
            if isinstance(kid, Tree) and kid.data in data_values:
                matches.append(position)

        # Nothing to inline: report that the tree is unchanged.
        if not matches:
            return False

        self.expand_kids_by_index(*matches)
        return True

    def scan_values(self, pred):
        """Return all values in the tree that evaluate pred(value) as true.

--- a/tests/test_grammar.py
+++ b/tests/test_grammar.py
@@ -246,6 +246,18 @@ class TestGrammar(TestCase):
        self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190)
        self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192)

    def test_large_terminal(self):
        # Builds one terminal out of 1000 string alternatives ("999" | "998" | ... | "0")
        # and checks that sample in-range numbers parse while out-of-range ones fail.
        #
        # TODO: The alternatives are listed in `reversed` order on purpose; otherwise the
        #       regex engine is happy with just parsing 9 from the string 999 instead of
        #       consuming the longest alternative.
        literals = ['"%s"' % n for n in reversed(range(0, 1000))]
        g = "start: NUMBERS\n" + "NUMBERS: " + '|'.join(literals)

        parser = Lark(g, parser='lalr')

        for accepted in (0, 9, 99, 999):
            text = str(accepted)
            self.assertEqual(parser.parse(text), Tree('start', [text]))

        for rejected in (-1, 1000):
            self.assertRaises(UnexpectedInput, parser.parse, str(rejected))


 if __name__ == '__main__':
    main()