@@ -0,0 +1,39 @@ | |||
# | |||
# This example shows how to get explicit ambiguity from Lark's Earley parser. | |||
# | |||
from lark import Lark | |||
g = """ | |||
sentence: noun verb noun -> simple | |||
| noun verb "like" noun -> comparative | |||
noun: ADJ? NOUN | |||
verb: VERB | |||
NOUN: "flies" | "bananas" | "fruit" | |||
VERB: "like" | "flies" | |||
ADJ: "fruit" | |||
%import common.WS | |||
%ignore WS | |||
""" | |||
lark = Lark(g, start='sentence', ambiguity='explicit') | |||
print(lark.parse('fruit flies like bananas').pretty()) | |||
# Outputs: | |||
# | |||
# _ambig | |||
# comparative | |||
# noun fruit | |||
# verb flies | |||
# noun bananas | |||
# simple | |||
# noun | |||
# fruit | |||
# flies | |||
# verb like | |||
# noun bananas | |||
@@ -3,4 +3,4 @@ from .common import ParseError, GrammarError | |||
from .lark import Lark | |||
from .utils import inline_args | |||
__version__ = "0.2.6" | |||
__version__ = "0.2.7" |
@@ -27,6 +27,11 @@ class LarkOptions(object): | |||
"contextual": Stronger lexer (only works with parser="lalr") | |||
"auto" (default): Choose for me based on grammar and parser | |||
ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley" | |||
"resolve": The parser will automatically choose the simplest derivation | |||
(it chooses consistently: greedy for tokens, non-greedy for rules) | |||
"explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). | |||
transformer - Applies the transformer to every parse tree | |||
debug - Affects verbosity (default: False) | |||
keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) | |||
@@ -49,6 +54,7 @@ class LarkOptions(object): | |||
self.transformer = o.pop('transformer', None) | |||
self.start = o.pop('start', 'start') | |||
self.profile = o.pop('profile', False) | |||
self.ambiguity = o.pop('ambiguity', 'auto') | |||
assert self.parser in ('earley', 'lalr', None) | |||
@@ -119,13 +125,20 @@ class Lark: | |||
assert not self.options.profile, "Feature temporarily disabled" | |||
self.profiler = Profiler() if self.options.profile else None | |||
lexer = self.options.lexer | |||
if lexer == 'auto': | |||
if self.options.lexer == 'auto': | |||
if self.options.parser == 'lalr': | |||
lexer = 'standard' | |||
self.options.lexer = 'standard' | |||
elif self.options.parser == 'earley': | |||
lexer = None | |||
self.options.lexer = lexer | |||
self.options.lexer = None | |||
lexer = self.options.lexer | |||
assert lexer in ('standard', 'contextual', None) | |||
if self.options.ambiguity == 'auto': | |||
if self.options.parser == 'earley': | |||
self.options.ambiguity = 'resolve' | |||
else: | |||
assert self.options.parser == 'earley' | |||
assert self.options.ambiguity in ('resolve', 'explicit', 'auto') | |||
self.grammar = load_grammar(grammar, source) | |||
tokens, self.rules, self.grammar_extra = self.grammar.compile(lexer=bool(lexer), start=self.options.start) | |||
@@ -155,7 +168,7 @@ class Lark: | |||
setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f))) | |||
parser_conf = ParserConf(rules, callback, self.options.start) | |||
return self.parser_class(self.lexer_conf, parser_conf) | |||
return self.parser_class(self.lexer_conf, parser_conf, options=self.options) | |||
def lex(self, text): | |||
@@ -20,7 +20,7 @@ class WithLexer: | |||
return stream | |||
class LALR(WithLexer): | |||
def __init__(self, lexer_conf, parser_conf): | |||
def __init__(self, lexer_conf, parser_conf, options=None): | |||
WithLexer.__init__(self, lexer_conf) | |||
self.parser_conf = parser_conf | |||
@@ -31,7 +31,7 @@ class LALR(WithLexer): | |||
return self.parser.parse(tokens) | |||
class LALR_ContextualLexer: | |||
def __init__(self, lexer_conf, parser_conf): | |||
def __init__(self, lexer_conf, parser_conf, options=None): | |||
self.lexer_conf = lexer_conf | |||
self.parser_conf = parser_conf | |||
@@ -126,12 +126,16 @@ class OldEarley_NoLex: | |||
return res[0] | |||
class Earley_NoLex: | |||
def __init__(self, lexer_conf, parser_conf): | |||
def __init__(self, lexer_conf, parser_conf, options=None): | |||
self.token_by_name = {t.name:t for t in lexer_conf.tokens} | |||
rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules] | |||
self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback) | |||
resolve_ambiguity = (options.ambiguity=='resolve') if options else True | |||
self.parser = earley.Parser(rules, | |||
parser_conf.start, | |||
parser_conf.callback, | |||
resolve_ambiguity=resolve_ambiguity) | |||
def _prepare_expansion(self, expansion): | |||
for sym in expansion: | |||
@@ -149,12 +153,16 @@ class Earley_NoLex: | |||
return self.parser.parse(new_text) | |||
class Earley(WithLexer): | |||
def __init__(self, lexer_conf, parser_conf): | |||
def __init__(self, lexer_conf, parser_conf, options=None): | |||
WithLexer.__init__(self, lexer_conf) | |||
rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules] | |||
self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback) | |||
resolve_ambiguity = (options.ambiguity=='resolve') if options else True | |||
self.parser = earley.Parser(rules, | |||
parser_conf.start, | |||
parser_conf.callback, | |||
resolve_ambiguity=resolve_ambiguity) | |||
def _prepare_expansion(self, expansion): | |||
return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] | |||
@@ -101,10 +101,10 @@ class Column: | |||
# XXX Potential bug: What happens if there's ambiguity in an empty rule? | |||
if item.rule.expansion and item in self.completed: | |||
old_tree = self.completed[item].tree | |||
if old_tree.data != 'ambig': | |||
if old_tree.data != '_ambig': | |||
new_tree = old_tree.copy() | |||
new_tree.rule = old_tree.rule | |||
old_tree.set('ambig', [new_tree]) | |||
old_tree.set('_ambig', [new_tree]) | |||
if item.tree.children[0] is old_tree: # XXX a little hacky! | |||
raise ParseError("Infinite recursion in grammar!") | |||
old_tree.children.append(item.tree) | |||
@@ -125,9 +125,10 @@ class Column: | |||
return bool(self.item_count) | |||
class Parser: | |||
def __init__(self, rules, start, callback): | |||
def __init__(self, rules, start, callback, resolve_ambiguity=True): | |||
self.analysis = GrammarAnalyzer(rules, start) | |||
self.start = start | |||
self.resolve_ambiguity = resolve_ambiguity | |||
self.postprocess = {} | |||
self.predictions = {} | |||
@@ -197,9 +198,11 @@ class Parser: | |||
elif len(solutions) == 1: | |||
tree = solutions[0] | |||
else: | |||
tree = Tree('ambig', solutions) | |||
tree = Tree('_ambig', solutions) | |||
if self.resolve_ambiguity: | |||
ResolveAmbig().visit(tree) | |||
ResolveAmbig().visit(tree) | |||
return ApplyCallbacks(self.postprocess).transform(tree) | |||
@@ -220,9 +223,8 @@ def _compare_rules(rule1, rule2): | |||
assert rule1.origin == rule2.origin | |||
c = compare( len(rule1.expansion), len(rule2.expansion)) | |||
if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here | |||
return c | |||
else: | |||
return -c | |||
c = -c | |||
return c | |||
def _compare_drv(tree1, tree2): | |||
if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)): | |||
@@ -242,8 +244,8 @@ def _compare_drv(tree1, tree2): | |||
class ResolveAmbig(Visitor_NoRecurse): | |||
def ambig(self, tree): | |||
best = max(tree.children, key=cmp_to_key(_compare_drv)) | |||
def _ambig(self, tree): | |||
best = min(tree.children, key=cmp_to_key(_compare_drv)) | |||
assert best.data == 'drv' | |||
tree.set('drv', best.children) | |||
tree.rule = best.rule # needed for applying callbacks | |||
@@ -120,6 +120,23 @@ class TestEarley(unittest.TestCase): | |||
empty_tree = Tree('empty', [Tree('empty2', [])]) | |||
self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b']) | |||
def test_earley_explicit_ambiguity(self): | |||
# This was a sneaky bug! | |||
grammar = """ | |||
start: a b | ab | |||
a: "a" | |||
b: "b" | |||
ab: "ab" | |||
""" | |||
parser = Lark(grammar, parser='earley', lexer=None, ambiguity='explicit') | |||
res = parser.parse('ab') | |||
self.assertEqual( res.data, '_ambig') | |||
self.assertEqual( len(res.children), 2) | |||
def _make_parser_test(LEXER, PARSER): | |||
def _Lark(grammar, **kwargs): | |||
return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs) | |||