diff --git a/lark/lark.py b/lark/lark.py index a39a847..9979ab1 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -139,7 +139,7 @@ class Lark: if self.options.parser == 'earley': self.options.ambiguity = 'resolve' else: - assert self.options.parser == 'earley' + assert self.options.parser == 'earley', "Only Earley supports disambiguation right now" assert self.options.ambiguity in ('resolve', 'explicit', 'auto') # Parse the grammar file and compose the grammars (TODO) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 91b8c85..3d29576 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -4,7 +4,7 @@ import sre_parse from .lexer import Lexer, ContextualLexer, Token from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token -from .parsers import lalr_parser, earley, xearley +from .parsers import lalr_parser, earley, xearley, resolve_ambig class WithLexer: def __init__(self, lexer_conf): @@ -48,6 +48,12 @@ class LALR_ContextualLexer: tokens = self.lexer_conf.postlex.process(tokens) return self.parser.parse(tokens, self.lexer.set_parser_state) +def get_ambiguity_resolver(options): + if not options or options.ambiguity == 'resolve': + return resolve_ambig.resolve_ambig + elif options.ambiguity == 'explicit': + return None + raise ValueError(options) def tokenize_text(text): new_text = [] @@ -66,11 +72,10 @@ class Earley_NoLex: rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] - resolve_ambiguity = (options.ambiguity=='resolve') if options else True - self.parser = earley.Parser(rules, + self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback, - resolve_ambiguity=resolve_ambiguity) + resolve_ambiguity=get_ambiguity_resolver(options)) def _prepare_expansion(self, expansion): for sym in expansion: @@ -93,11 +98,10 @@ class Earley(WithLexer): rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules] - resolve_ambiguity = (options.ambiguity=='resolve') if options else True self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback, - resolve_ambiguity=resolve_ambiguity) + resolve_ambiguity=get_ambiguity_resolver(options)) def _prepare_expansion(self, expansion): return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] @@ -113,13 +117,12 @@ class XEarley: rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] - resolve_ambiguity = (options.ambiguity=='resolve') if options else True ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore] self.parser = xearley.Parser(rules, parser_conf.start, parser_conf.callback, - resolve_ambiguity=resolve_ambiguity, + resolve_ambiguity=get_ambiguity_resolver(options), ignore=ignore, ) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index a00bb36..276d0ec 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -13,9 +13,6 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from functools import cmp_to_key - -from ..utils import compare from ..common import ParseError, UnexpectedToken, Terminal from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse from .grammar_analysis import GrammarAnalyzer @@ -136,7 +133,7 @@ class Column: return bool(self.item_count) class Parser: - def __init__(self, rules, start_symbol, callback, resolve_ambiguity=True): + def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None): self.analysis = GrammarAnalyzer(rules, start_symbol) self.start_symbol = start_symbol self.resolve_ambiguity = resolve_ambiguity @@ -213,10 +210,9 @@ class Parser: tree = Tree('_ambig', solutions) if self.resolve_ambiguity: - ResolveAmbig().visit(tree) + tree = self.resolve_ambiguity(tree) return ApplyCallbacks(self.postprocess).transform(tree) - class ApplyCallbacks(Transformer_NoRecurse): @@ -231,66 +227,6 @@ class ApplyCallbacks(Transformer_NoRecurse): else: return Tree(rule.origin, children) -def _compare_rules(rule1, rule2): - if rule1.origin != rule2.origin: - if rule1.options and rule2.options: - if rule1.options.priority is not None and rule2.options.priority is not None: - assert rule1.options.priority != rule2.options.priority, "Priority is the same between both rules: %s == %s" % (rule1, rule2) - return -compare(rule1.options.priority, rule2.options.priority) - - return 0 - - c = compare( len(rule1.expansion), len(rule2.expansion)) - if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here - c = -c - return c - -def _compare_drv(tree1, tree2): - if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)): - return -compare(tree1, tree2) - - try: - rule1, rule2 = tree1.rule, tree2.rule - except AttributeError: - # Probably trees that don't take part in this parse (better way to distinguish?) - return -compare(tree1, tree2) - - # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse, - # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be - # computationally inefficient. So we handle it here. - if tree1.data == '_ambig': - _resolve_ambig(tree1) - if tree2.data == '_ambig': - _resolve_ambig(tree2) - - c = _compare_rules(tree1.rule, tree2.rule) - if c: - return c - - # rules are "equal", so compare trees - for t1, t2 in zip(tree1.children, tree2.children): - c = _compare_drv(t1, t2) - if c: - return c - - return compare(len(tree1.children), len(tree2.children)) - - -def _resolve_ambig(tree): - assert tree.data == '_ambig' - - best = min(tree.children, key=cmp_to_key(_compare_drv)) - assert best.data == 'drv' - tree.set('drv', best.children) - tree.rule = best.rule # needed for applying callbacks - - assert tree.data != '_ambig' - -class ResolveAmbig(Visitor_NoRecurse): - def _ambig(self, tree): - _resolve_ambig(tree) - - # RULES = [ # ('a', ['d']), # ('d', ['b']), diff --git a/lark/parsers/resolve_ambig.py b/lark/parsers/resolve_ambig.py new file mode 100644 index 0000000..302223b --- /dev/null +++ b/lark/parsers/resolve_ambig.py @@ -0,0 +1,68 @@ +from ..utils import compare +from functools import cmp_to_key + +from ..tree import Tree, Visitor_NoRecurse + +def _compare_rules(rule1, rule2): + if rule1.origin != rule2.origin: + if rule1.options and rule2.options: + if rule1.options.priority is not None and rule2.options.priority is not None: + assert rule1.options.priority != rule2.options.priority, "Priority is the same between both rules: %s == %s" % (rule1, rule2) + return -compare(rule1.options.priority, rule2.options.priority) + + return 0 + + c = compare( len(rule1.expansion), len(rule2.expansion)) + if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here + c = -c + return c + +def _compare_drv(tree1, tree2): + if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)): + return -compare(tree1, tree2) + + try: + rule1, rule2 = tree1.rule, tree2.rule + except AttributeError: + # Probably trees that don't take part in this parse (better way to distinguish?) + return -compare(tree1, tree2) + + # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse, + # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be + # computationally inefficient. So we handle it here. + if tree1.data == '_ambig': + _resolve_ambig(tree1) + if tree2.data == '_ambig': + _resolve_ambig(tree2) + + c = _compare_rules(tree1.rule, tree2.rule) + if c: + return c + + # rules are "equal", so compare trees + for t1, t2 in zip(tree1.children, tree2.children): + c = _compare_drv(t1, t2) + if c: + return c + + return compare(len(tree1.children), len(tree2.children)) + + +def _resolve_ambig(tree): + assert tree.data == '_ambig' + + best = min(tree.children, key=cmp_to_key(_compare_drv)) + assert best.data == 'drv' + tree.set('drv', best.children) + tree.rule = best.rule # needed for applying callbacks + + assert tree.data != '_ambig' + +class ResolveAmbig(Visitor_NoRecurse): + def _ambig(self, tree): + _resolve_ambig(tree) + + +def resolve_ambig(tree): + ResolveAmbig().visit(tree) + return tree diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index a74852a..c12108f 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -25,10 +25,10 @@ from ..lexer import Token from ..tree import Tree from .grammar_analysis import GrammarAnalyzer -from .earley import ResolveAmbig, ApplyCallbacks, Item, NewsList, Derivation, END_TOKEN, Column +from .earley import ApplyCallbacks, Item, Column class Parser: - def __init__(self, rules, start_symbol, callback, resolve_ambiguity=True, ignore=()): + def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=()): self.analysis = GrammarAnalyzer(rules, start_symbol) self.start_symbol = start_symbol self.resolve_ambiguity = resolve_ambiguity @@ -132,7 +132,7 @@ class Parser: tree = Tree('_ambig', solutions) if self.resolve_ambiguity: - ResolveAmbig().visit(tree) + tree = self.resolve_ambiguity(tree) return ApplyCallbacks(self.postprocess).transform(tree)