From 467483553bc77cd785d963d908e3726b99b1b789 Mon Sep 17 00:00:00 2001
From: Erez Shinan <erezshin+git@gmail.com>
Date: Thu, 20 Dec 2018 01:24:23 +0200
Subject: [PATCH] Fix maybe_placeholders: insert None placeholders in ChildFilter (Issue #285)

---
 lark/load_grammar.py       |  8 ++--
 lark/parse_tree_builder.py | 88 ++++++++++++++++++++++++++------------
 tests/test_parser.py       | 25 +++++++----
 3 files changed, 81 insertions(+), 40 deletions(-)

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index ce4e722..3c11830 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -179,6 +179,8 @@ class EBNF_to_BNF(Transformer_InPlace):
             if isinstance(rule, Terminal) and rule.filter_out and not (
                     self.rule_options and self.rule_options.keep_all_tokens):
                 empty = ST('expansion', [])
+            elif isinstance(rule, NonTerminal) and rule.name.startswith('_'):
+                empty = ST('expansion', [])
             else:
                 empty = _EMPTY
             return ST('expansions', [rule, empty])
@@ -506,11 +508,11 @@ class Grammar:
                 if alias and name.startswith('_'):
                     raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
 
-                empty_indices = [i for i, x in enumerate(expansion) if x==_EMPTY]
-                if empty_indices:
+                empty_indices = [x==_EMPTY for i, x in enumerate(expansion)]
+                if any(empty_indices):
                     assert options
                     exp_options = copy(options)
-                    exp_options.empty_indices = len(expansion), empty_indices
+                    exp_options.empty_indices = empty_indices
                     expansion = [x for x in expansion if x!=_EMPTY]
                 else:
                     exp_options = options
diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py
index ed1f1dd..7992993 100644
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -17,24 +17,6 @@ class ExpandSingleChild:
         else:
             return self.node_builder(children)
 
-class AddMaybePlaceholder:
-    def __init__(self, empty_indices, node_builder):
-        self.node_builder = node_builder
-        self.empty_indices = empty_indices
-
-    def __call__(self, children):
-        t = self.node_builder(children)
-        if self.empty_indices:
-            exp_len, empty_indices = self.empty_indices
-            # Calculate offset to handle repetition correctly
-            # e.g. ("a" "b"?)+
-            # For non-repetitive rules, offset should be 0
-            offset = len(t.children) - (exp_len - len(empty_indices))
-            for i in empty_indices:
-                t.children.insert(i + offset, None)
-        return t
-
-
 class PropagatePositions:
     def __init__(self, node_builder):
         self.node_builder = node_builder
@@ -77,23 +59,54 @@ class PropagatePositions:
 
 
 class ChildFilter:
-    def __init__(self, to_include, node_builder):
+    def __init__(self, to_include, append_none, node_builder):
         self.node_builder = node_builder
         self.to_include = to_include
+        self.append_none = append_none
 
     def __call__(self, children):
         filtered = []
-        for i, to_expand in self.to_include:
+
+        for i, to_expand, add_none in self.to_include:
+            if add_none:
+                filtered += [None] * add_none
             if to_expand:
                 filtered += children[i].children
             else:
                 filtered.append(children[i])
 
+        if self.append_none:
+            filtered += [None] * self.append_none
+
         return self.node_builder(filtered)
 
 class ChildFilterLALR(ChildFilter):
     "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"
 
+    def __call__(self, children):
+        filtered = []
+        for i, to_expand, add_none in self.to_include:
+            if add_none:
+                filtered += [None] * add_none
+            if to_expand:
+                if filtered:
+                    filtered += children[i].children
+                else:   # Optimize for left-recursion
+                    filtered = children[i].children
+            else:
+                filtered.append(children[i])
+
+        if self.append_none:
+            filtered += [None] * self.append_none
+
+        return self.node_builder(filtered)
+
+class ChildFilterLALR_NoPlaceholders(ChildFilter):
+    "Optimized childfilter for LALR without placeholders: inserts no None values (assumes no duplication in parse tree, so it's safe to change it)"
+    def __init__(self, to_include, node_builder):
+        self.node_builder = node_builder
+        self.to_include = to_include
+
     def __call__(self, children):
         filtered = []
         for i, to_expand in self.to_include:
@@ -110,12 +123,32 @@ class ChildFilterLALR(ChildFilter):
 def _should_expand(sym):
     return not sym.is_term and sym.name.startswith('_')
 
-def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous):
-    to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion)
-                  if keep_all_tokens or not (sym.is_term and sym.filter_out)]
-
-    if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
-        return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include)
+def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous, _empty_indices):
+    # Turn the boolean mask into counts: how many Nones to insert before each symbol, plus a final count to append at the end
+    if _empty_indices:
+        assert _empty_indices.count(False) == len(expansion)
+        s = ''.join(str(int(b)) for b in _empty_indices)
+        empty_indices = [len(ones) for ones in s.split('0')]
+        assert len(empty_indices) == len(expansion)+1, (empty_indices, len(expansion))
+    else:
+        empty_indices = [0] * (len(expansion)+1)
+
+    to_include = []
+    nones_to_add = 0
+    for i, sym in enumerate(expansion):
+        nones_to_add += empty_indices[i]
+        if keep_all_tokens or not (sym.is_term and sym.filter_out):
+            to_include.append((i, _should_expand(sym), nones_to_add))
+            nones_to_add = 0
+
+    nones_to_add += empty_indices[len(expansion)]
+
+    if _empty_indices or len(to_include) < len(expansion) or any(to_expand for i, to_expand,_ in to_include):
+        if _empty_indices or ambiguous:
+            return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include, nones_to_add)
+        else:
+            # LALR without placeholders
+            return partial(ChildFilterLALR_NoPlaceholders, [(i, x) for i,x,_ in to_include])
 
 
 class Callback(object):
@@ -150,8 +183,7 @@ class ParseTreeBuilder:
 
             wrapper_chain = filter(None, [
                 (expand_single_child and not rule.alias) and ExpandSingleChild,
-                maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous),
-                self.maybe_placeholders and partial(AddMaybePlaceholder, options.empty_indices if options else None),
+                maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None),
                 self.propagate_positions and PropagatePositions,
             ])
 
diff --git a/tests/test_parser.py b/tests/test_parser.py
index b1bfd61..2cb97fd 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1035,7 +1035,7 @@ def _make_parser_test(LEXER, PARSER):
             bb_.1: "bb"
             """
 
-            l = _Lark(grammar, ambiguity='resolve__antiscore_sum')
+            l = Lark(grammar, ambiguity='resolve__antiscore_sum')
             res = l.parse('abba')
             self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')
 
@@ -1048,7 +1048,7 @@ def _make_parser_test(LEXER, PARSER):
             bb_: "bb"
             """
 
-            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
+            l = Lark(grammar, ambiguity='resolve__antiscore_sum')
             res = l.parse('abba')
             self.assertEqual(''.join(child.data for child in res.children), 'indirection')
 
@@ -1061,7 +1061,7 @@ def _make_parser_test(LEXER, PARSER):
             bb_.3: "bb"
             """
 
-            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
+            l = Lark(grammar, ambiguity='resolve__antiscore_sum')
             res = l.parse('abba')
             self.assertEqual(''.join(child.data for child in res.children), 'ab_b_a_')
 
@@ -1074,7 +1074,7 @@ def _make_parser_test(LEXER, PARSER):
             bb_.3: "bb"
             """
 
-            l = Lark(grammar, parser='earley', ambiguity='resolve__antiscore_sum')
+            l = Lark(grammar, ambiguity='resolve__antiscore_sum')
             res = l.parse('abba')
             self.assertEqual(''.join(child.data for child in res.children), 'indirection')
 
@@ -1248,18 +1248,19 @@ def _make_parser_test(LEXER, PARSER):
             res = p.parse('B')
             self.assertEqual(len(res.children), 3)
 
+        @unittest.skipIf(PARSER=='cyk', "Empty rules")
         def test_maybe_placeholders(self):
             # Anonymous tokens shouldn't count
-            p = Lark("""start: "a"? "b"? "c"? """, maybe_placeholders=True)
+            p = _Lark("""start: "a"? "b"? "c"? """, maybe_placeholders=True)
             self.assertEqual(p.parse("").children, [])
 
             # Anonymous tokens shouldn't count, other constructs should
-            p = Lark("""start: A? "b"? _c?
+            p = _Lark("""start: A? "b"? _c?
                         A: "a"
                         _c: "c" """, maybe_placeholders=True)
-            self.assertEqual(p.parse("").children, [None, None])
+            self.assertEqual(p.parse("").children, [None])
 
-            p = Lark("""!start: "a"? "b"? "c"? """, maybe_placeholders=True)
+            p = _Lark("""!start: "a"? "b"? "c"? """, maybe_placeholders=True)
             self.assertEqual(p.parse("").children, [None, None, None])
             self.assertEqual(p.parse("a").children, ['a', None, None])
             self.assertEqual(p.parse("b").children, [None, 'b', None])
@@ -1269,7 +1270,7 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(p.parse("bc").children, [None, 'b', 'c'])
             self.assertEqual(p.parse("abc").children, ['a', 'b', 'c'])
 
-            p = Lark("""!start: ("a"? "b" "c"?)+ """, maybe_placeholders=True)
+            p = _Lark("""!start: ("a"? "b" "c"?)+ """, maybe_placeholders=True)
             self.assertEqual(p.parse("b").children, [None, 'b', None])
             self.assertEqual(p.parse("bb").children, [None, 'b', None, None, 'b', None])
             self.assertEqual(p.parse("abbc").children, ['a', 'b', None, None, 'b', 'c'])
@@ -1280,6 +1281,12 @@ def _make_parser_test(LEXER, PARSER):
                  'a', 'b', 'c',
                  None, 'b', None])
 
+            p = _Lark("""!start: "a"? "c"? "b"+ "a"? "d"? """, maybe_placeholders=True)
+            self.assertEqual(p.parse("bb").children, [None, None, 'b', 'b', None, None])
+            self.assertEqual(p.parse("bd").children, [None, None, 'b', None, 'd'])
+            self.assertEqual(p.parse("abba").children, ['a', None, 'b', 'b', 'a', None])
+            self.assertEqual(p.parse("cbbbb").children, [None, 'c', 'b', 'b', 'b', 'b', None, None])
+
 
 
     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()