From aede340449ebdb92214ae19147131bcab4847c5a Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 23 Aug 2017 10:52:46 +0300 Subject: [PATCH] Added earley__all_derivations due to performance concerns --- lark/__init__.py | 2 +- lark/lark.py | 7 ++- lark/parser_frontends.py | 8 ++-- lark/parsers/earley.py | 17 +++++-- tests/test_parser.py | 99 +++++++++++++++++++++++----------------- 5 files changed, 83 insertions(+), 50 deletions(-) diff --git a/lark/__init__.py b/lark/__init__.py index 65a318c..f3339d9 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -3,4 +3,4 @@ from .common import ParseError, GrammarError from .lark import Lark from .utils import inline_args -__version__ = "0.3.4" +__version__ = "0.3.5" diff --git a/lark/lark.py b/lark/lark.py index 4adabcc..18d9959 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -28,10 +28,14 @@ class LarkOptions(object): "auto" (default): Choose for me based on grammar and parser ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley" - "resolve": The parser will automatically choose the simplest derivation + "resolve": The parser will automatically choose the simplest derivation (it chooses consistently: greedy for tokens, non-greedy for rules) "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). + earley__all_derivations - If True, try every possible derivation of each rule. If False, pick the first + correct derivation. Both will find a solution to every correct grammar & input, + but when False, some ambiguities won't appear (Default: True) + transformer - Applies the transformer to every parse tree debug - Affects verbosity (default: False) keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) @@ -57,6 +61,7 @@ class LarkOptions(object): self.profile = o.pop('profile', False) self.ambiguity = o.pop('ambiguity', 'auto') self.propagate_positions = o.pop('propagate_positions', False) + self.earley__all_derivations = o.pop('earley__all_derivations', True) assert self.parser in ('earley', 'lalr', None) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index c4524ca..c46352f 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -77,7 +77,8 @@ class Earley_NoLex: self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback, - resolve_ambiguity=get_ambiguity_resolver(options)) + resolve_ambiguity=get_ambiguity_resolver(options), + all_derivations = options.earley__all_derivations if options else True) def _prepare_expansion(self, expansion): for sym in expansion: @@ -100,10 +101,11 @@ class Earley(WithLexer): rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules] - self.parser = earley.Parser(rules, + self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback, - resolve_ambiguity=get_ambiguity_resolver(options)) + resolve_ambiguity=get_ambiguity_resolver(options), + all_derivations = options.earley__all_derivations if options else True) def _prepare_expansion(self, expansion): return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 276d0ec..4225b4d 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -51,7 +51,7 @@ class Item(object): def advance(self, tree): assert self.tree.data == 'drv' new_tree = Derivation(self.rule, self.tree.children + [tree]) - return Item(self.rule, self.ptr+1, self.start, new_tree) + return self.__class__(self.rule, self.ptr+1, self.start, new_tree) def similar(self, other): return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule @@ -67,6 +67,9 @@ class Item(object): after = list(map(str, self.rule.expansion[self.ptr:])) return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after)) +class Item_JoinDerivations(Item): + __eq__ = Item.similar + class NewsList(list): "Keeps track of newly added items (append-only)" @@ -133,10 +136,16 @@ class Column: return bool(self.item_count) class Parser: - def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None): + def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, all_derivations=True): + """ + all_derivations: + True = Try every rule combination, and every possible derivation of each rule. (default) + False = Try every rule combination, but not every derivation of the same rule. + """ self.analysis = GrammarAnalyzer(rules, start_symbol) self.start_symbol = start_symbol self.resolve_ambiguity = resolve_ambiguity + self.all_derivations = all_derivations self.postprocess = {} self.predictions = {} @@ -150,9 +159,11 @@ class Parser: # Define parser functions start_symbol = start_symbol or self.start_symbol + _Item = Item if self.all_derivations else Item_JoinDerivations + def predict(nonterm, column): assert not isinstance(nonterm, Terminal), nonterm - return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] + return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] def complete(item): name = item.rule.origin diff --git a/tests/test_parser.py b/tests/test_parser.py index 40b4a0f..9fa05eb 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -140,6 +140,35 @@ def _make_full_earley_test(LEXER): self.assertEqual( res.data, '_ambig') self.assertEqual( len(res.children), 2) + def test_ambiguity1(self): + grammar = """ + start: cd+ "e" + + !cd: "c" + | "d" + | "cd" + + """ + l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) + x = l.parse('cde') + assert x.data == '_ambig', x + assert len(x.children) == 2 + + @unittest.skipIf(LEXER=='dynamic', "Not implemented in Dynamic Earley yet") # TODO + def test_not_all_derivations(self): + grammar = """ + start: cd+ "e" + + !cd: "c" + | "d" + | "cd" + + """ + l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER, earley__all_derivations=False) + x = l.parse('cde') + assert x.data != '_ambig', x + assert len(x.children) == 1 + _NAME = "TestFullEarley" + (LEXER or 'Scanless').capitalize() _TestFullEarley.__name__ = _NAME globals()[_NAME] = _TestFullEarley @@ -400,6 +429,7 @@ def _make_parser_test(LEXER, PARSER): self.assertSequenceEqual(x.children, ['HelloWorld']) + @unittest.skipIf(LEXER is None, "Known bug with scanless parsing") # TODO def test_token_collision2(self): # NOTE: This test reveals a bug in token reconstruction in Scanless Earley # I probably need to re-write grammar transformation @@ -625,32 +655,6 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(len(tree.children), 2) - @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules") - def test_earley_prioritization(self): - "Tests effect of priority on result" - - grammar = """ - start: a | b - a.1: "a" - b.2: "a" - """ - - # l = Lark(grammar, parser='earley', lexer='standard') - l = _Lark(grammar) - res = l.parse("a") - self.assertEqual(res.children[0].data, 'b') - - grammar = """ - start: a | b - a.2: "a" - b.1: "a" - """ - - l = _Lark(grammar) - # l = Lark(grammar, parser='earley', lexer='standard') - res = l.parse("a") - self.assertEqual(res.children[0].data, 'a') - @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority") def test_lexer_prioritization(self): "Tests effect of priority on result" @@ -680,22 +684,6 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(res.children, ['ab']) - @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports ambiguity") - def test_ambiguity1(self): - grammar = """ - start: cd+ "e" - - !cd: "c" - | "d" - | "cd" - - """ - # l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=None) - l = _Lark(grammar, ambiguity='explicit') - x = l.parse('cde') - assert x.data == '_ambig' - assert len(x.children) == 2 - def test_import(self): grammar = """ @@ -711,6 +699,33 @@ def _make_parser_test(LEXER, PARSER): x = l.parse('12 elephants') self.assertEqual(x.children, ['12', 'elephants']) + @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules") + def test_earley_prioritization(self): + "Tests effect of priority on result" + + grammar = """ + start: a | b + a.1: "a" + b.2: "a" + """ + + # l = Lark(grammar, parser='earley', lexer='standard') + l = _Lark(grammar) + res = l.parse("a") + self.assertEqual(res.children[0].data, 'b') + + grammar = """ + start: a | b + a.2: "a" + b.1: "a" + """ + + l = _Lark(grammar) + # l = Lark(grammar, parser='earley', lexer='standard') + res = l.parse("a") + self.assertEqual(res.children[0].data, 'a') + + @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules") def test_earley_prioritization_sum(self): "Tests effect of priority on result"