From 467483553bc77cd785d963d908e3726b99b1b789 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 20 Dec 2018 01:24:23 +0200 Subject: [PATCH] Fixed placeholder code, should work as expected (Issue #285) --- lark/load_grammar.py | 8 ++-- lark/parse_tree_builder.py | 88 ++++++++++++++++++++++++++------------ tests/test_parser.py | 25 +++++++---- 3 files changed, 81 insertions(+), 40 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ce4e722..3c11830 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -179,6 +179,8 @@ class EBNF_to_BNF(Transformer_InPlace): if isinstance(rule, Terminal) and rule.filter_out and not ( self.rule_options and self.rule_options.keep_all_tokens): empty = ST('expansion', []) + elif isinstance(rule, NonTerminal) and rule.name.startswith('_'): + empty = ST('expansion', []) else: empty = _EMPTY return ST('expansions', [rule, empty]) @@ -506,11 +508,11 @@ class Grammar: if alias and name.startswith('_'): raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) - empty_indices = [i for i, x in enumerate(expansion) if x==_EMPTY] - if empty_indices: + empty_indices = [x==_EMPTY for i, x in enumerate(expansion)] + if any(empty_indices): assert options exp_options = copy(options) - exp_options.empty_indices = len(expansion), empty_indices + exp_options.empty_indices = empty_indices expansion = [x for x in expansion if x!=_EMPTY] else: exp_options = options diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index ed1f1dd..7992993 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -17,24 +17,6 @@ class ExpandSingleChild: else: return self.node_builder(children) -class AddMaybePlaceholder: - def __init__(self, empty_indices, node_builder): - self.node_builder = node_builder - self.empty_indices = empty_indices - - def __call__(self, children): - t = self.node_builder(children) - if self.empty_indices: - exp_len, empty_indices = self.empty_indices - # Calculate offset to handle repetition correctly - # e.g. ("a" "b"?)+ - # For non-repetitive rules, offset should be 0 - offset = len(t.children) - (exp_len - len(empty_indices)) - for i in empty_indices: - t.children.insert(i + offset, None) - return t - - class PropagatePositions: def __init__(self, node_builder): self.node_builder = node_builder @@ -77,23 +59,54 @@ class PropagatePositions: class ChildFilter: - def __init__(self, to_include, node_builder): + def __init__(self, to_include, append_none, node_builder): self.node_builder = node_builder self.to_include = to_include + self.append_none = append_none def __call__(self, children): filtered = [] - for i, to_expand in self.to_include: + + for i, to_expand, add_none in self.to_include: + if add_none: + filtered += [None] * add_none if to_expand: filtered += children[i].children else: filtered.append(children[i]) + if self.append_none: + filtered += [None] * self.append_none + return self.node_builder(filtered) class ChildFilterLALR(ChildFilter): "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" + def __call__(self, children): + filtered = [] + for i, to_expand, add_none in self.to_include: + if add_none: + filtered += [None] * add_none + if to_expand: + if filtered: + filtered += children[i].children + else: # Optimize for left-recursion + filtered = children[i].children + else: + filtered.append(children[i]) + + if self.append_none: + filtered += [None] * self.append_none + + return self.node_builder(filtered) + +class ChildFilterLALR_NoPlaceholders(ChildFilter): + "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" + def __init__(self, to_include, node_builder): + self.node_builder = node_builder + self.to_include = to_include + def __call__(self, children): filtered = [] for i, to_expand in self.to_include: @@ -110,12 +123,32 @@ class ChildFilterLALR(ChildFilter): def _should_expand(sym): return not sym.is_term and sym.name.startswith('_') -def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous): - to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) - if keep_all_tokens or not (sym.is_term and sym.filter_out)] - - if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): - return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include) +def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices): + # Prepare empty_indices as: How many Nones to insert at each index? + if _empty_indices: + assert _empty_indices.count(False) == len(expansion) + s = ''.join(str(int(b)) for b in _empty_indices) + empty_indices = [len(ones) for ones in s.split('0')] + assert len(empty_indices) == len(expansion)+1, (empty_indices, len(expansion)) + else: + empty_indices = [0] * (len(expansion)+1) + + to_include = [] + nones_to_add = 0 + for i, sym in enumerate(expansion): + nones_to_add += empty_indices[i] + if keep_all_tokens or not (sym.is_term and sym.filter_out): + to_include.append((i, _should_expand(sym), nones_to_add)) + nones_to_add = 0 + + nones_to_add += empty_indices[len(expansion)] + + if _empty_indices or len(to_include) < len(expansion) or any(to_expand for i, to_expand,_ in to_include): + if _empty_indices or ambiguous: + return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include, nones_to_add) + else: + # LALR without placeholders + return partial(ChildFilterLALR_NoPlaceholders, [(i, x) for i,x,_ in to_include]) class Callback(object): @@ -150,8 +183,7 @@ class ParseTreeBuilder: wrapper_chain = filter(None, [ (expand_single_child and not rule.alias) and ExpandSingleChild, - maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous), - self.maybe_placeholders and partial(AddMaybePlaceholder, options.empty_indices if options else None), + maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None), self.propagate_positions and PropagatePositions, ]) diff --git a/tests/test_parser.py b/tests/test_parser.py index b1bfd61..2cb97fd 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1035,7 +1035,7 @@ def _make_parser_test(LEXER, PARSER): bb_.1: "bb" """ - l = _Lark(grammar, ambiguity='resolve__antiscore_sum') + l = Lark(grammar, ambiguity='resolve__antiscore_sum') res = l.parse('abba') self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_') @@ -1048,7 +1048,7 @@ def _make_parser_test(LEXER, PARSER): bb_: "bb" """ - l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum') + l = Lark(grammar, ambiguity='resolve__antiscore_sum') res = l.parse('abba') self.assertEqual(''.join(child.data for child in res.children), 'indirection') @@ -1061,7 +1061,7 @@ def _make_parser_test(LEXER, PARSER): bb_.3: "bb" """ - l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum') + l = Lark(grammar, ambiguity='resolve__antiscore_sum') res = l.parse('abba') self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_') @@ -1074,7 +1074,7 @@ def _make_parser_test(LEXER, PARSER): bb_.3: "bb" """ - l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum') + l = Lark(grammar, ambiguity='resolve__antiscore_sum') res = l.parse('abba') self.assertEqual(''.join(child.data for child in res.children), 'indirection') @@ -1248,18 +1248,19 @@ def _make_parser_test(LEXER, PARSER): res = p.parse('B') self.assertEqual(len(res.children), 3) + @unittest.skipIf(PARSER=='cyk', "Empty rules") def test_maybe_placeholders(self): # Anonymous tokens shouldn't count - p = Lark("""start: "a"? "b"? "c"? """, maybe_placeholders=True) + p = _Lark("""start: "a"? "b"? "c"? """, maybe_placeholders=True) self.assertEqual(p.parse("").children, []) # Anonymous tokens shouldn't count, other constructs should - p = Lark("""start: A? "b"? _c? + p = _Lark("""start: A? "b"? _c? A: "a" _c: "c" """, maybe_placeholders=True) - self.assertEqual(p.parse("").children, [None, None]) + self.assertEqual(p.parse("").children, [None]) - p = Lark("""!start: "a"? "b"? "c"? """, maybe_placeholders=True) + p = _Lark("""!start: "a"? "b"? "c"? """, maybe_placeholders=True) self.assertEqual(p.parse("").children, [None, None, None]) self.assertEqual(p.parse("a").children, ['a', None, None]) self.assertEqual(p.parse("b").children, [None, 'b', None]) @@ -1269,7 +1270,7 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(p.parse("bc").children, [None, 'b', 'c']) self.assertEqual(p.parse("abc").children, ['a', 'b', 'c']) - p = Lark("""!start: ("a"? "b" "c"?)+ """, maybe_placeholders=True) + p = _Lark("""!start: ("a"? "b" "c"?)+ """, maybe_placeholders=True) self.assertEqual(p.parse("b").children, [None, 'b', None]) self.assertEqual(p.parse("bb").children, [None, 'b', None, None, 'b', None]) self.assertEqual(p.parse("abbc").children, ['a', 'b', None, None, 'b', 'c']) @@ -1280,6 +1281,12 @@ def _make_parser_test(LEXER, PARSER): 'a', 'b', 'c', None, 'b', None]) + p = _Lark("""!start: "a"? "c"? "b"+ "a"? "d"? """, maybe_placeholders=True) + self.assertEqual(p.parse("bb").children, [None, None, 'b', 'b', None, None]) + self.assertEqual(p.parse("bd").children, [None, None, 'b', None, 'd']) + self.assertEqual(p.parse("abba").children, ['a', None, 'b', 'b', 'a', None]) + self.assertEqual(p.parse("cbbbb").children, [None, 'c', 'b', 'b', 'b', 'b', None, None]) + _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()