From 3d3858a30cad78136ff07bdedfe27b90b3400956 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 24 Aug 2021 14:20:10 +0100 Subject: [PATCH 1/6] README: Added Poetry to 'projects using Lark' --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 76dcdea..156a671 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail ### Projects using Lark + - [Poetry](https://github.com/python-poetry/poetry-core) - A utility for dependency management and packaging - [tartiflette](https://github.com/dailymotion/tartiflette) - a GraphQL server by Dailymotion - [Hypothesis](https://github.com/HypothesisWorks/hypothesis) - Library for property-based testing - [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration From c9c33423fca6b7d4b8cc3cbb794e991670d78c0a Mon Sep 17 00:00:00 2001 From: MegaIng Date: Wed, 25 Aug 2021 03:10:11 +0200 Subject: [PATCH 2/6] Fix recursion error for many options in Terminal --- lark/load_grammar.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 69bd788..e1f9223 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -600,7 +600,21 @@ def _make_joined_pattern(regexp, flags_set): return PatternRE(regexp, flags) -class TerminalTreeToPattern(Transformer): +class FlattenExpansions(Transformer_InPlace): + @v_args(tree=True) + def expansions(self, tree): + i = 0 + while i < len(tree.children): + c = tree.children[i] + if isinstance(c, Tree) and c.data == 'expansions': + tree.children[i:i+1] = c.children + i += len(c.children) + else: + i += 1 + return tree + + +class TerminalTreeToPattern(Transformer_NonRecursive): def pattern(self, ps): p ,= ps return p @@ -670,7 +684,7 @@ class Grammar: def compile(self, start, terminals_to_keep): # We change the trees in-place (to support huge grammars) # So deepcopy allows calling compile more than once. - term_defs = deepcopy(list(self.term_defs)) + term_defs = [(n,(nr_deepcopy_tree(t), p)) for n,(t,p) in self.term_defs] rule_defs = [(n,p,nr_deepcopy_tree(t),o) for n,p,t,o in self.rule_defs] # =================== @@ -686,7 +700,7 @@ class Grammar: if len(expansions) == 1 and not expansions[0].children: raise GrammarError("Terminals cannot be empty (%s)" % name) - transformer = PrepareLiterals() * TerminalTreeToPattern() + transformer = PrepareLiterals() * FlattenExpansions() * TerminalTreeToPattern() terminals = [TerminalDef(name, transformer.transform(term_tree), priority) for name, (term_tree, priority) in term_defs if term_tree] From 3a4568df246b7c413cbe2309b6411c3ac599136b Mon Sep 17 00:00:00 2001 From: MegaIng Date: Thu, 26 Aug 2021 00:09:22 +0200 Subject: [PATCH 3/6] Reworked grammar to simplify later processing + expand_kids_by_data + tests --- lark-stubs/tree.pyi | 3 +++ lark/exceptions.py | 2 +- lark/lexer.py | 2 +- lark/load_grammar.py | 40 +++++++++++--------------------------- lark/parse_tree_builder.py | 3 +-- lark/tree.py | 9 +++++++++ tests/test_grammar.py | 12 ++++++++++++ 7 files changed, 38 insertions(+), 33 deletions(-) diff --git a/lark-stubs/tree.pyi b/lark-stubs/tree.pyi index ea99ff6..0c12819 100644 --- a/lark-stubs/tree.pyi +++ b/lark-stubs/tree.pyi @@ -40,6 +40,9 @@ class Tree: def expand_kids_by_index(self, *indices: int) -> None: ... + def expand_kids_by_data(self, *data_values: str) -> bool: + ... + def scan_values(self, pred: Callable[[Union[str, Tree]], bool]) -> Iterator[str]: ... diff --git a/lark/exceptions.py b/lark/exceptions.py index fdcd52b..9f18753 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -210,7 +210,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') - self.pos_in_stream = getattr(token, 'pos_in_stream', None) + self.pos_in_stream = getattr(token, 'start_pos', None) self.state = state self.token = token diff --git a/lark/lexer.py b/lark/lexer.py index 7c2f979..a82cc18 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -150,7 +150,7 @@ class Token(Str): @property def pos_in_stream(self): - warn("Attribute Token.pos_in_stream was renamed to Token.start_pos", DeprecationWarning) + warn("Attribute Token.pos_in_stream was renamed to Token.start_pos", DeprecationWarning, 2) return self.start_pos def update(self, type_=None, value=None): diff --git a/lark/load_grammar.py b/lark/load_grammar.py index e1f9223..f2f5499 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -91,6 +91,7 @@ TERMINALS = { 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, '_NL': r'(\r?\n)+\s*', + '_NL_OR': r'(\r?\n)+\s*\|', 'WS': r'[ \t]+', 'COMMENT': r'\s*//[^\n]*', '_TO': '->', @@ -113,9 +114,10 @@ RULES = { ''], '_template_params': ['RULE', '_template_params _COMMA RULE'], - 'expansions': ['alias', - 'expansions _OR alias', - 'expansions _NL _OR alias'], + 'expansions': ['_expansions'], + '_expansions': ['alias', + '_expansions _OR alias', + '_expansions _NL_OR alias'], '?alias': ['expansion _TO RULE', 'expansion'], 'expansion': ['_expansion'], @@ -357,11 +359,8 @@ class SimplifyRule_Visitor(Visitor): @staticmethod def _flatten(tree): while True: - to_expand = [i for i, child in enumerate(tree.children) - if isinstance(child, Tree) and child.data == tree.data] - if not to_expand: + if not tree.expand_kids_by_data(tree.data): break - tree.expand_kids_by_index(*to_expand) def expansion(self, tree): # rules_list unpacking @@ -599,21 +598,6 @@ def _make_joined_pattern(regexp, flags_set): return PatternRE(regexp, flags) - -class FlattenExpansions(Transformer_InPlace): - @v_args(tree=True) - def expansions(self, tree): - i = 0 - while i < len(tree.children): - c = tree.children[i] - if isinstance(c, Tree) and c.data == 'expansions': - tree.children[i:i+1] = c.children - i += len(c.children) - else: - i += 1 - return tree - - class TerminalTreeToPattern(Transformer_NonRecursive): def pattern(self, ps): p ,= ps @@ -684,8 +668,8 @@ class Grammar: def compile(self, start, terminals_to_keep): # We change the trees in-place (to support huge grammars) # So deepcopy allows calling compile more than once. - term_defs = [(n,(nr_deepcopy_tree(t), p)) for n,(t,p) in self.term_defs] - rule_defs = [(n,p,nr_deepcopy_tree(t),o) for n,p,t,o in self.rule_defs] + term_defs = [(n, (nr_deepcopy_tree(t), p)) for n, (t, p) in self.term_defs] + rule_defs = [(n, p, nr_deepcopy_tree(t), o) for n, p, t, o in self.rule_defs] # =================== # Compile Terminals @@ -700,7 +684,7 @@ class Grammar: if len(expansions) == 1 and not expansions[0].children: raise GrammarError("Terminals cannot be empty (%s)" % name) - transformer = PrepareLiterals() * FlattenExpansions() * TerminalTreeToPattern() + transformer = PrepareLiterals() * TerminalTreeToPattern() terminals = [TerminalDef(name, transformer.transform(term_tree), priority) for name, (term_tree, priority) in term_defs if term_tree] @@ -933,7 +917,7 @@ def _get_parser(): parser_conf = ParserConf(rules, callback, ['start']) lexer_conf.lexer_type = 'standard' parser_conf.parser_type = 'lalr' - _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, {}) + _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None) return _get_parser.cache GRAMMAR_ERRORS = [ @@ -1110,9 +1094,7 @@ class GrammarBuilder: # TODO: think about what to do with 'options' base = self._definitions[name][1] - while len(base.children) == 2: - assert isinstance(base.children[0], Tree) and base.children[0].data == 'expansions', base - base = base.children[0] + assert isinstance(base, Tree) and base.data == 'expansions' base.children.insert(0, exp) def _ignore(self, exp_or_name): diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 286038e..fa526b0 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -204,8 +204,7 @@ class AmbiguousExpander: if i in self.to_expand: ambiguous.append(i) - to_expand = [j for j, grandchild in enumerate(child.children) if _is_ambig_tree(grandchild)] - child.expand_kids_by_index(*to_expand) + child.expand_kids_by_data('_ambig') if not ambiguous: return self.node_builder(children) diff --git a/lark/tree.py b/lark/tree.py index bee53cf..8a29bcb 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -107,6 +107,15 @@ class Tree(object): kid = self.children[i] self.children[i:i+1] = kid.children + def expand_kids_by_data(self, *data_values): + """Expand (inline) children with any of the given data values. Returns True if anything changed""" + indices = [i for i, c in enumerate(self.children) if isinstance(c, Tree) and c.data in data_values] + if indices: + self.expand_kids_by_index(*indices) + return True + else: + return False + def scan_values(self, pred): """Return all values in the tree that evaluate pred(value) as true. diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 3ae65f2..47a345c 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -246,6 +246,18 @@ class TestGrammar(TestCase): self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) + def test_large_terminal(self): + # TODO: The `reversed` below is required because otherwise the regex engine is happy + # with just parsing 9 from the string 999 instead of consuming the longest + g = "start: NUMBERS\n" + g += "NUMBERS: " + '|'.join('"%s"' % i for i in reversed(range(0, 1000))) + + l = Lark(g, parser='lalr') + for i in (0, 9, 99, 999): + self.assertEqual(l.parse(str(i)), Tree('start', [str(i)])) + for i in (-1, 1000): + self.assertRaises(UnexpectedInput, l.parse, str(i)) + if __name__ == '__main__': main() From f5c7af8ce9996f303e05aa4caec32abccde615fa Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 25 Aug 2021 23:57:20 +0100 Subject: [PATCH 4/6] Proposed corrections to PR #970 --- lark/load_grammar.py | 5 ++--- lark/tree.py | 10 ++++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index f2f5499..7c64196 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -358,9 +358,8 @@ class SimplifyRule_Visitor(Visitor): @staticmethod def _flatten(tree): - while True: - if not tree.expand_kids_by_data(tree.data): - break + while tree.expand_kids_by_data(tree.data): + pass def expansion(self, tree): # rules_list unpacking diff --git a/lark/tree.py b/lark/tree.py index 8a29bcb..1d14bf3 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -110,12 +110,14 @@ class Tree(object): def expand_kids_by_data(self, *data_values): """Expand (inline) children with any of the given data values. Returns True if anything changed""" indices = [i for i, c in enumerate(self.children) if isinstance(c, Tree) and c.data in data_values] - if indices: - self.expand_kids_by_index(*indices) - return True - else: + if not indices: return False + for i in reversed(indices): # reverse so that changing tail won't affect indices + child = self.children[i] + self.children[i:i+1] = child.children + return True + def scan_values(self, pred): """Return all values in the tree that evaluate pred(value) as true. From d2e8b15c2f846a15c57f1e6ade625c46560a43de Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 26 Aug 2021 08:22:02 +0100 Subject: [PATCH 5/6] Another update for the PR --- lark/tree.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lark/tree.py b/lark/tree.py index 1d14bf3..2cd8233 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -109,14 +109,14 @@ class Tree(object): def expand_kids_by_data(self, *data_values): """Expand (inline) children with any of the given data values. Returns True if anything changed""" - indices = [i for i, c in enumerate(self.children) if isinstance(c, Tree) and c.data in data_values] - if not indices: - return False + changed = False + for i, c in reversed(list(enumerate(self.children))): + if isinstance(c, Tree) and c.data in data_values: + child = self.children[i] + self.children[i:i+1] = child.children + changed = True + return changed - for i in reversed(indices): # reverse so that changing tail won't affect indices - child = self.children[i] - self.children[i:i+1] = child.children - return True def scan_values(self, pred): """Return all values in the tree that evaluate pred(value) as true. From 4fe49c9cdb34e3a1fdd7bb6f1ff1321e87f390b7 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 26 Aug 2021 13:36:05 +0100 Subject: [PATCH 6/6] Change expand_kids_by_data to use range --- lark/tree.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lark/tree.py b/lark/tree.py index 2cd8233..0937b85 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -110,9 +110,9 @@ class Tree(object): def expand_kids_by_data(self, *data_values): """Expand (inline) children with any of the given data values. Returns True if anything changed""" changed = False - for i, c in reversed(list(enumerate(self.children))): - if isinstance(c, Tree) and c.data in data_values: - child = self.children[i] + for i in range(len(self.children)-1, -1, -1): + child = self.children[i] + if isinstance(child, Tree) and child.data in data_values: self.children[i:i+1] = child.children changed = True return changed