New Feature: Added maybe_placeholders option (Issue #285)

Erez Shinan, 6 years ago · commit 222df5bab4
5 changed files with 62 additions and 10 deletions
  1. lark/grammar.py             +1   -0
  2. lark/lark.py                +3   -1
  3. lark/load_grammar.py        +16  -6
  4. lark/parse_tree_builder.py  +20  -3
  5. tests/test_parser.py        +22  -0
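
As a quick orientation, here is a minimal sketch of what the option changes from the user's side, using the default parser. The grammar and the expected children come from the test added in this commit (tests/test_parser.py below); the "default" half follows from the docstring added in lark/lark.py, which says unmatched optionals are normally omitted.

    from lark import Lark

    # With maybe_placeholders=True, an optional item ("a"?, "c"?) that did not match
    # contributes a None child instead of being dropped from the tree.
    p = Lark('!start: "a"? "b"? "c"? ', maybe_placeholders=True)
    assert p.parse("ac").children == ['a', None, 'c']

    # Default behaviour: unmatched optionals are simply omitted.
    p = Lark('!start: "a"? "b"? "c"? ')
    assert p.parse("ac").children == ['a', 'c']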

lark/grammar.py  (+1, -0)

@@ -64,6 +64,7 @@ class RuleOptions:
         self.keep_all_tokens = keep_all_tokens
         self.expand1 = expand1
         self.priority = priority
+        self.empty_indices = ()

     def __repr__(self):
         return 'RuleOptions(%r, %r, %r)' % (


lark/lark.py  (+3, -1)

@@ -45,6 +45,7 @@ class LarkOptions(object):
         profile - Measure run-time usage in Lark. Read results from the profiler property (Default: False)
         propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches.
         lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
+        maybe_placeholders - Experimental feature. Instead of omitting optional rules (i.e. rule?), replace them with None
     """
     if __doc__:
         __doc__ += OPTIONS_DOC
@@ -66,6 +67,7 @@ class LarkOptions(object):
         self.propagate_positions = o.pop('propagate_positions', False)
         self.earley__predict_all = o.pop('earley__predict_all', False)
         self.lexer_callbacks = o.pop('lexer_callbacks', {})
+        self.maybe_placeholders = o.pop('maybe_placeholders', False)

         assert self.parser in ('earley', 'lalr', 'cyk', None)

@@ -179,7 +181,7 @@ class Lark:
     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)

-        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr')
+        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr', self.options.maybe_placeholders)
         callback = self._parse_tree_builder.create_callback(self.options.transformer)
         if self.profiler:
             for f in dir(callback):


lark/load_grammar.py  (+16, -6)

@@ -3,7 +3,7 @@
 import os.path
 import sys
 from ast import literal_eval
-from copy import deepcopy
+from copy import copy, deepcopy

 from .utils import bfs
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
@@ -26,6 +26,8 @@ EXT = '.lark'

 _RE_FLAGS = 'imslux'

+_EMPTY = Symbol('__empty__')
+
 _TERMINAL_NAMES = {
     '.' : 'DOT',
     ',' : 'COMMA',
@@ -151,7 +153,6 @@ RULES = {
     'literal': ['REGEXP', 'STRING'],
 }

-
 @inline_args
 class EBNF_to_BNF(Transformer_InPlace):
     def __init__(self):
@@ -175,7 +176,7 @@ class EBNF_to_BNF(Transformer_InPlace):

     def expr(self, rule, op, *args):
         if op.value == '?':
-            return ST('expansions', [rule, ST('expansion', [])])
+            return ST('expansions', [rule, _EMPTY])
         elif op.value == '+':
             # a : b c+ d
             # -->
@@ -481,7 +482,8 @@ class Grammar:
         for name, rule_tree, options in rule_defs:
             ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None
             tree = transformer.transform(rule_tree)
-            rules.append((name, ebnf_to_bnf.transform(tree), options))
+            res = ebnf_to_bnf.transform(tree)
+            rules.append((name, res, options))
         rules += ebnf_to_bnf.new_rules

         assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision"
@@ -499,9 +501,17 @@ class Grammar:
                 if alias and name.startswith('_'):
                     raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))

-                assert all(isinstance(x, Symbol) for x in expansion), expansion
+                empty_indices = [i for i, x in enumerate(expansion) if x==_EMPTY]
+                if empty_indices:
+                    assert options
+                    exp_options = copy(options)
+                    exp_options.empty_indices = len(expansion), empty_indices
+                    expansion = [x for x in expansion if x!=_EMPTY]
+                else:
+                    exp_options = options

-                rule = Rule(NonTerminal(name), expansion, alias, options)
+                assert all(isinstance(x, Symbol) for x in expansion), expansion
+                rule = Rule(NonTerminal(name), expansion, alias, exp_options)
                 compiled_rules.append(rule)

         return terminals, compiled_rules, self.ignore
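
To make the bookkeeping above concrete: for a rule such as !start: "a"? "b"? "c"?, EBNF_to_BNF turns each ? into an alternative containing _EMPTY, so one of the resulting BNF expansions looks like [A, __empty__, C] (the case where "b" is absent). The last hunk then records empty_indices = (3, [1]) on a copy of the rule options and strips the __empty__ symbols before building the Rule. A standalone sketch of that step, using plain strings in place of the Symbol objects used here (the helper name is illustrative, not part of the library):

    def split_empties(expansion, empty='__empty__'):
        # Record (original expansion length, positions of the empty marker),
        # then drop the markers, mirroring the empty_indices bookkeeping above.
        empty_indices = [i for i, x in enumerate(expansion) if x == empty]
        stripped = [x for x in expansion if x != empty]
        return stripped, ((len(expansion), empty_indices) if empty_indices else ())

    # The "b is absent" expansion of  !start: "a"? "b"? "c"?
    print(split_empties(['A', '__empty__', 'C']))   # -> (['A', 'C'], (3, [1]))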


lark/parse_tree_builder.py  (+20, -3)

@@ -1,7 +1,5 @@
 from .exceptions import GrammarError
-from .utils import suppress
 from .lexer import Token
-from .grammar import Rule
 from .tree import Tree
 from .visitors import InlineTransformer # XXX Deprecated

@@ -19,6 +17,23 @@ class ExpandSingleChild:
         else:
             return self.node_builder(children)

+class AddMaybePlaceholder:
+    def __init__(self, empty_indices, node_builder):
+        self.node_builder = node_builder
+        self.empty_indices = empty_indices
+
+    def __call__(self, children):
+        t = self.node_builder(children)
+        if self.empty_indices:
+            exp_len, empty_indices = self.empty_indices
+            # Calculate offset to handle repetition correctly
+            # e.g. ("a" "b"?)+
+            # For non-repetitive rules, offset should be 0
+            offset = len(t.children) - (exp_len - len(empty_indices))
+            for i in empty_indices:
+                t.children.insert(i + offset, None)
+        return t
+

 class PropagatePositions:
     def __init__(self, node_builder):
@@ -116,11 +131,12 @@ def ptb_inline_args(func):


 class ParseTreeBuilder:
-    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False):
+    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False):
         self.tree_class = tree_class
         self.propagate_positions = propagate_positions
         self.always_keep_all_tokens = keep_all_tokens
         self.ambiguous = ambiguous
+        self.maybe_placeholders = maybe_placeholders

         self.rule_builders = list(self._init_builders(rules))

@@ -135,6 +151,7 @@ class ParseTreeBuilder:
             wrapper_chain = filter(None, [
                 (expand_single_child and not rule.alias) and ExpandSingleChild,
                 maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous),
+                self.maybe_placeholders and partial(AddMaybePlaceholder, options.empty_indices),
                 self.propagate_positions and PropagatePositions,
             ])
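
The offset arithmetic in AddMaybePlaceholder.__call__ can be exercised on its own. The sketch below applies the same insertion step to a plain list of children; the function name is made up for illustration, and empty_indices is the (expansion length, positions) pair stored on RuleOptions above.

    def insert_placeholders(children, empty_indices):
        if not empty_indices:
            return children
        exp_len, positions = empty_indices
        # The offset absorbs extra children produced by repetition, e.g. ("a" "b"?)+;
        # for a non-repeating rule it is 0.
        offset = len(children) - (exp_len - len(positions))
        for i in positions:
            children.insert(i + offset, None)
        return children

    # Expansion of length 3 whose middle slot matched nothing:
    print(insert_placeholders(['a', 'c'], (3, [1])))   # -> ['a', None, 'c']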



tests/test_parser.py  (+22, -0)

@@ -1248,6 +1248,28 @@ def _make_parser_test(LEXER, PARSER):
             res = p.parse('B')
             self.assertEqual(len(res.children), 3)

+        def test_maybe_placeholders(self):
+            p = Lark("""!start: "a"? "b"? "c"? """, maybe_placeholders=True)
+            self.assertEqual(p.parse("").children, [None, None, None])
+            self.assertEqual(p.parse("a").children, ['a', None, None])
+            self.assertEqual(p.parse("b").children, [None, 'b', None])
+            self.assertEqual(p.parse("c").children, [None, None, 'c'])
+            self.assertEqual(p.parse("ab").children, ['a', 'b', None])
+            self.assertEqual(p.parse("ac").children, ['a', None, 'c'])
+            self.assertEqual(p.parse("bc").children, [None, 'b', 'c'])
+            self.assertEqual(p.parse("abc").children, ['a', 'b', 'c'])
+
+            p = Lark("""!start: ("a"? "b" "c"?)+ """, maybe_placeholders=True)
+            self.assertEqual(p.parse("b").children, [None, 'b', None])
+            self.assertEqual(p.parse("bb").children, [None, 'b', None, None, 'b', None])
+            self.assertEqual(p.parse("abbc").children, ['a', 'b', None, None, 'b', 'c'])
+            self.assertEqual(p.parse("babbcabcb").children,
+                             [None, 'b', None,
+                              'a', 'b', None,
+                              None, 'b', 'c',
+                              'a', 'b', 'c',
+                              None, 'b', None])
+

     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()

