From f374e70b2c2787c5fe5c771f8e5a306dc56cfd01 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 18 Apr 2017 02:17:17 +0300 Subject: [PATCH] Feature: Added explicit ambiguity option for Earley --- examples/fruitflies.py | 39 +++++++++++++++++++++++++++++++++++++++ lark/__init__.py | 2 +- lark/lark.py | 25 +++++++++++++++++++------ lark/parser_frontends.py | 20 ++++++++++++++------ lark/parsers/earley.py | 22 ++++++++++++---------- tests/test_parser.py | 17 +++++++++++++++++ 6 files changed, 102 insertions(+), 23 deletions(-) create mode 100644 examples/fruitflies.py diff --git a/examples/fruitflies.py b/examples/fruitflies.py new file mode 100644 index 0000000..b7026b6 --- /dev/null +++ b/examples/fruitflies.py @@ -0,0 +1,39 @@ +# +# This example shows how to get explicit ambiguity from Lark's Earley parser. +# + +from lark import Lark + +g = """ + sentence: noun verb noun -> simple + | noun verb "like" noun -> comparative + + noun: ADJ? NOUN + verb: VERB + + NOUN: "flies" | "bananas" | "fruit" + VERB: "like" | "flies" + ADJ: "fruit" + + %import common.WS + %ignore WS +""" + +lark = Lark(g, start='sentence', ambiguity='explicit') + +print(lark.parse('fruit flies like bananas').pretty()) + +# Outputs: +# +# _ambig +# comparative +# noun fruit +# verb flies +# noun bananas +# simple +# noun +# fruit +# flies +# verb like +# noun bananas + diff --git a/lark/__init__.py b/lark/__init__.py index a37cdb7..e22a247 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -3,4 +3,4 @@ from .common import ParseError, GrammarError from .lark import Lark from .utils import inline_args -__version__ = "0.2.6" +__version__ = "0.2.7" diff --git a/lark/lark.py b/lark/lark.py index 7d434bf..b839650 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -27,6 +27,11 @@ class LarkOptions(object): "contextual": Stronger lexer (only works with parser="lalr") "auto" (default): Choose for me based on grammar and parser + ambiguity - Decides how to handle ambiguity in the parse. 
Only relevant if parser="earley" + "resolve": The parser will automatically choose the simplest derivation + (it chooses consistently: greedy for tokens, non-greedy for rules) + "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). + transformer - Applies the transformer to every parse tree debug - Affects verbosity (default: False) keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) @@ -49,6 +54,7 @@ class LarkOptions(object): self.transformer = o.pop('transformer', None) self.start = o.pop('start', 'start') self.profile = o.pop('profile', False) + self.ambiguity = o.pop('ambiguity', 'auto') assert self.parser in ('earley', 'lalr', None) @@ -119,13 +125,20 @@ class Lark: assert not self.options.profile, "Feature temporarily disabled" self.profiler = Profiler() if self.options.profile else None - lexer = self.options.lexer - if lexer == 'auto': + if self.options.lexer == 'auto': if self.options.parser == 'lalr': - lexer = 'standard' + self.options.lexer = 'standard' elif self.options.parser == 'earley': - lexer = None - self.options.lexer = lexer + self.options.lexer = None + lexer = self.options.lexer + assert lexer in ('standard', 'contextual', None) + + if self.options.ambiguity == 'auto': + if self.options.parser == 'earley': + self.options.ambiguity = 'resolve' + else: + assert self.options.parser == 'earley' + assert self.options.ambiguity in ('resolve', 'explicit', 'auto') self.grammar = load_grammar(grammar, source) tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer), start=self.options.start) @@ -155,7 +168,7 @@ class Lark: setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f))) parser_conf = ParserConf(rules, callback, self.options.start) - return self.parser_class(self.lexer_conf, parser_conf) + return self.parser_class(self.lexer_conf, parser_conf, options=self.options) def lex(self, text): diff --git 
a/lark/parser_frontends.py b/lark/parser_frontends.py index 0b36a75..1646726 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -20,7 +20,7 @@ class WithLexer: return stream class LALR(WithLexer): - def __init__(self, lexer_conf, parser_conf): + def __init__(self, lexer_conf, parser_conf, options=None): WithLexer.__init__(self, lexer_conf) self.parser_conf = parser_conf @@ -31,7 +31,7 @@ class LALR(WithLexer): return self.parser.parse(tokens) class LALR_ContextualLexer: - def __init__(self, lexer_conf, parser_conf): + def __init__(self, lexer_conf, parser_conf, options=None): self.lexer_conf = lexer_conf self.parser_conf = parser_conf @@ -126,12 +126,16 @@ class OldEarley_NoLex: return res[0] class Earley_NoLex: - def __init__(self, lexer_conf, parser_conf): + def __init__(self, lexer_conf, parser_conf, options=None): self.token_by_name = {t.name:t for t in lexer_conf.tokens} rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules] - self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback) + resolve_ambiguity = (options.ambiguity=='resolve') if options else True + self.parser = earley.Parser(rules, + parser_conf.start, + parser_conf.callback, + resolve_ambiguity=resolve_ambiguity) def _prepare_expansion(self, expansion): for sym in expansion: @@ -149,12 +153,16 @@ class Earley_NoLex: return self.parser.parse(new_text) class Earley(WithLexer): - def __init__(self, lexer_conf, parser_conf): + def __init__(self, lexer_conf, parser_conf, options=None): WithLexer.__init__(self, lexer_conf) rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules] - self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback) + resolve_ambiguity = (options.ambiguity=='resolve') if options else True + self.parser = earley.Parser(rules, + parser_conf.start, + parser_conf.callback, + resolve_ambiguity=resolve_ambiguity) def _prepare_expansion(self, expansion): return [Terminal_Token(sym) if 
is_terminal(sym) else sym for sym in expansion] diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 95061e1..20a6ee8 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -101,10 +101,10 @@ class Column: # XXX Potential bug: What happens if there's ambiguity in an empty rule? if item.rule.expansion and item in self.completed: old_tree = self.completed[item].tree - if old_tree.data != 'ambig': + if old_tree.data != '_ambig': new_tree = old_tree.copy() new_tree.rule = old_tree.rule - old_tree.set('ambig', [new_tree]) + old_tree.set('_ambig', [new_tree]) if item.tree.children[0] is old_tree: # XXX a little hacky! raise ParseError("Infinite recursion in grammar!") old_tree.children.append(item.tree) @@ -125,9 +125,10 @@ class Column: return bool(self.item_count) class Parser: - def __init__(self, rules, start, callback): + def __init__(self, rules, start, callback, resolve_ambiguity=True): self.analysis = GrammarAnalyzer(rules, start) self.start = start + self.resolve_ambiguity = resolve_ambiguity self.postprocess = {} self.predictions = {} @@ -197,9 +198,11 @@ class Parser: elif len(solutions) == 1: tree = solutions[0] else: - tree = Tree('ambig', solutions) + tree = Tree('_ambig', solutions) + + if self.resolve_ambiguity: + ResolveAmbig().visit(tree) - ResolveAmbig().visit(tree) return ApplyCallbacks(self.postprocess).transform(tree) @@ -220,9 +223,8 @@ def _compare_rules(rule1, rule2): assert rule1.origin == rule2.origin c = compare( len(rule1.expansion), len(rule2.expansion)) if rule1.origin.startswith('__'): # XXX hack! 
We need to set priority in parser, not here - return c - else: - return -c + c = -c + return c def _compare_drv(tree1, tree2): if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)): @@ -242,8 +244,8 @@ def _compare_drv(tree1, tree2): class ResolveAmbig(Visitor_NoRecurse): - def ambig(self, tree): - best = max(tree.children, key=cmp_to_key(_compare_drv)) + def _ambig(self, tree): + best = min(tree.children, key=cmp_to_key(_compare_drv)) assert best.data == 'drv' tree.set('drv', best.children) tree.rule = best.rule # needed for applying callbacks diff --git a/tests/test_parser.py b/tests/test_parser.py index 3e3ee14..96adb67 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -120,6 +120,23 @@ class TestEarley(unittest.TestCase): empty_tree = Tree('empty', [Tree('empty2', [])]) self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b']) + def test_earley_explicit_ambiguity(self): + # This was a sneaky bug! + + grammar = """ + start: a b | ab + a: "a" + b: "b" + ab: "ab" + """ + + parser = Lark(grammar, parser='earley', lexer=None, ambiguity='explicit') + res = parser.parse('ab') + + self.assertEqual( res.data, '_ambig') + self.assertEqual( len(res.children), 2) + + def _make_parser_test(LEXER, PARSER): def _Lark(grammar, **kwargs): return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)