@@ -3,4 +3,4 @@ from .common import ParseError, GrammarError
 from .lark import Lark
 from .utils import inline_args
 
-__version__ = "0.3.4"
+__version__ = "0.3.5"
@@ -28,10 +28,14 @@ class LarkOptions(object):
            "auto" (default): Choose for me based on grammar and parser
 
        ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley"
-           "resolve": The parser will automatically choose the simplest derivation
+           "resolve": The parser will automatically choose the simplest derivation
+                      (it chooses consistently: greedy for tokens, non-greedy for rules)
            "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
+       earley__all_derivations - If True, try every possible derivation of each rule. If False, pick the first
+                                 correct derivation. Both will find a solution to every correct grammar & input,
+                                 but when False, some ambiguities won't appear (Default: True)
        transformer - Applies the transformer to every parse tree
        debug - Affects verbosity (default: False)
        keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
@@ -57,6 +61,7 @@ class LarkOptions(object):
         self.profile = o.pop('profile', False)
         self.ambiguity = o.pop('ambiguity', 'auto')
         self.propagate_positions = o.pop('propagate_positions', False)
+        self.earley__all_derivations = o.pop('earley__all_derivations', True)
 
         assert self.parser in ('earley', 'lalr', None)
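
A usage sketch (illustrative, not part of the patch): since LarkOptions pops the new key like any other keyword, it can be passed directly to the Lark constructor.

    from lark import Lark

    # Hypothetical toy grammar, only to show the new keyword argument.
    parser = Lark('start: "a"+', parser='earley',
                  earley__all_derivations=False)  # new in this change; defaults to True
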
@@ -77,7 +77,8 @@ class Earley_NoLex:
         self.parser = earley.Parser(rules,
                                     parser_conf.start,
                                     parser_conf.callback,
-                                    resolve_ambiguity=get_ambiguity_resolver(options))
+                                    resolve_ambiguity=get_ambiguity_resolver(options),
+                                    all_derivations = options.earley__all_derivations if options else True)
 
     def _prepare_expansion(self, expansion):
         for sym in expansion:
@@ -100,10 +101,11 @@ class Earley(WithLexer):
         rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules]
 
-        self.parser = earley.Parser(rules,
+        self.parser = earley.Parser(rules,
                                     parser_conf.start,
                                     parser_conf.callback,
-                                    resolve_ambiguity=get_ambiguity_resolver(options))
+                                    resolve_ambiguity=get_ambiguity_resolver(options),
+                                    all_derivations = options.earley__all_derivations if options else True)
 
     def _prepare_expansion(self, expansion):
         return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]
@@ -51,7 +51,7 @@ class Item(object):
     def advance(self, tree):
         assert self.tree.data == 'drv'
         new_tree = Derivation(self.rule, self.tree.children + [tree])
-        return Item(self.rule, self.ptr+1, self.start, new_tree)
+        return self.__class__(self.rule, self.ptr+1, self.start, new_tree)
 
     def similar(self, other):
         return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
@@ -67,6 +67,9 @@ class Item(object):
         after = list(map(str, self.rule.expansion[self.ptr:]))
         return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
 
+class Item_JoinDerivations(Item):
+    __eq__ = Item.similar
+
 class NewsList(list):
     "Keeps track of newly added items (append-only)"
@@ -133,10 +136,16 @@ class Column:
         return bool(self.item_count)
 
 class Parser:
-    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None):
+    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, all_derivations=True):
+        """
+        all_derivations:
+            True = Try every rule combination, and every possible derivation of each rule. (default)
+            False = Try every rule combination, but not every derivation of the same rule.
+        """
         self.analysis = GrammarAnalyzer(rules, start_symbol)
         self.start_symbol = start_symbol
         self.resolve_ambiguity = resolve_ambiguity
+        self.all_derivations = all_derivations
 
         self.postprocess = {}
         self.predictions = {}
@@ -150,9 +159,11 @@ class Parser:
         # Define parser functions
         start_symbol = start_symbol or self.start_symbol
 
+        _Item = Item if self.all_derivations else Item_JoinDerivations
+
         def predict(nonterm, column):
             assert not isinstance(nonterm, Terminal), nonterm
-            return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
+            return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
 
         def complete(item):
             name = item.rule.origin
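
Taken together, the pieces above work like this: Item.advance now builds follow-up items with self.__class__, so once the parser selects Item_JoinDerivations (via _Item above), every derived item stays in that subclass; and because the subclass aliases __eq__ to Item.similar, items that differ only in their partial derivation tree compare equal, so the parser's uniqueness checks collapse extra derivations of the same rule. A minimal, self-contained sketch of that aliasing technique, with hypothetical names and a plain list standing in for the parser's bookkeeping:

    class ParseItem:
        def __init__(self, rule, ptr, start, tree):
            self.rule, self.ptr, self.start, self.tree = rule, ptr, start, tree

        def similar(self, other):
            # "Same progress through the same rule, from the same start column."
            return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule

        def advance(self, subtree):
            # Mirrors Item.advance above: type(self) keeps the chosen subclass alive.
            return type(self)(self.rule, self.ptr + 1, self.start, self.tree + [subtree])

    class ParseItem_JoinDerivations(ParseItem):
        # Merely "similar" items now compare equal. (In Python 3 this also makes
        # instances unhashable unless __hash__ is restored explicitly.)
        __eq__ = ParseItem.similar

    a = ParseItem_JoinDerivations('cd', 1, start=None, tree=['c', 'd'])
    b = ParseItem_JoinDerivations('cd', 1, start=None, tree=['cd'])  # same progress, different derivation

    column = [a]
    if b not in column:   # membership uses __eq__, so b is seen as a duplicate
        column.append(b)
    assert column == [a]  # only one derivation of the rule survives
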
@@ -140,6 +140,35 @@ def _make_full_earley_test(LEXER):
             self.assertEqual( res.data, '_ambig')
             self.assertEqual( len(res.children), 2)
 
+        def test_ambiguity1(self):
+            grammar = """
+            start: cd+ "e"
+            !cd: "c"
+               | "d"
+               | "cd"
+            """
+            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
+            x = l.parse('cde')
+            assert x.data == '_ambig', x
+            assert len(x.children) == 2
+
+        @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO
+        def test_not_all_derivations(self):
+            grammar = """
+            start: cd+ "e"
+            !cd: "c"
+               | "d"
+               | "cd"
+            """
+            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False)
+            x = l.parse('cde')
+            assert x.data != '_ambig', x
+            assert len(x.children) == 1
+
     _NAME = "TestFullEarley" + (LEXER or 'Scanless').capitalize()
     _TestFullEarley.__name__ = _NAME
     globals()[_NAME] = _TestFullEarley
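
The two new tests pin down the observable difference. As an illustration reconstructed from the grammar and the assertions (not copied from parser output): 'cde' can be segmented either as "c" "d" "e" or as "cd" "e", so with ambiguity='explicit' the root is an _ambig node holding both 'start' derivations, while earley__all_derivations=False returns a single plain tree.

    # ambiguity='explicit', all derivations (default):
    #   _ambig
    #     start           # segmentation "c" "d" "e"
    #       cd "c"
    #       cd "d"
    #     start           # segmentation "cd" "e"
    #       cd "cd"
    #
    # ambiguity='explicit', earley__all_derivations=False:
    #   start             # one surviving derivation
    #     cd ...          # a single child, per the len(x.children) == 1 assertion
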
@@ -400,6 +429,7 @@ def _make_parser_test(LEXER, PARSER):
             self.assertSequenceEqual(x.children, ['HelloWorld'])
 
+        @unittest.skipIf(LEXER is None, "Known bug with scanless parsing") # TODO
         def test_token_collision2(self):
             # NOTE: This test reveals a bug in token reconstruction in Scanless Earley
             # I probably need to re-write grammar transformation
@@ -625,32 +655,6 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(len(tree.children), 2)
 
-        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
-        def test_earley_prioritization(self):
-            "Tests effect of priority on result"
-
-            grammar = """
-            start: a | b
-            a.1: "a"
-            b.2: "a"
-            """
-
-            # l = Lark(grammar, parser='earley', lexer='standard')
-            l = _Lark(grammar)
-            res = l.parse("a")
-            self.assertEqual(res.children[0].data, 'b')
-
-            grammar = """
-            start: a | b
-            a.2: "a"
-            b.1: "a"
-            """
-
-            l = _Lark(grammar)
-            # l = Lark(grammar, parser='earley', lexer='standard')
-            res = l.parse("a")
-            self.assertEqual(res.children[0].data, 'a')
-
         @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
         def test_lexer_prioritization(self):
             "Tests effect of priority on result"
@@ -680,22 +684,6 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(res.children, ['ab'])
 
-        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports ambiguity")
-        def test_ambiguity1(self):
-            grammar = """
-            start: cd+ "e"
-            !cd: "c"
-               | "d"
-               | "cd"
-            """
-            # l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=None)
-            l = _Lark(grammar, ambiguity='explicit')
-            x = l.parse('cde')
-            assert x.data == '_ambig'
-            assert len(x.children) == 2
-
         def test_import(self):
             grammar = """
@@ -711,6 +699,33 @@ def _make_parser_test(LEXER, PARSER):
             x = l.parse('12 elephants')
             self.assertEqual(x.children, ['12', 'elephants'])
 
+        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
+        def test_earley_prioritization(self):
+            "Tests effect of priority on result"
+
+            grammar = """
+            start: a | b
+            a.1: "a"
+            b.2: "a"
+            """
+
+            # l = Lark(grammar, parser='earley', lexer='standard')
+            l = _Lark(grammar)
+            res = l.parse("a")
+            self.assertEqual(res.children[0].data, 'b')
+
+            grammar = """
+            start: a | b
+            a.2: "a"
+            b.1: "a"
+            """
+
+            l = _Lark(grammar)
+            # l = Lark(grammar, parser='earley', lexer='standard')
+            res = l.parse("a")
+            self.assertEqual(res.children[0].data, 'a')
+
+        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
+        def test_earley_prioritization_sum(self):
+            "Tests effect of priority on result"