From 4eec9244038eafaf31c1cda595e18321c8f9730b Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Fri, 4 Aug 2017 02:42:31 +0300
Subject: [PATCH] Added prioritization to Earley. Use the rule.1 syntax, etc.
 The highest-priority rule is selected in case of ambiguity.

---
 lark/common.py                   |  2 +-
 lark/load_grammar.py             | 20 +++++++++++++++-----
 lark/parse_tree_builder.py       |  2 +-
 lark/parser_frontends.py         |  4 ++--
 lark/parsers/earley.py           |  5 +++++
 lark/parsers/grammar_analysis.py | 11 ++++++-----
 lark/parsers/lalr_parser.py      |  1 +
 tests/test_parser.py             | 24 ++++++++++++++++++++++++
 8 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/lark/common.py b/lark/common.py
index f9b0990..f1b6784 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -41,7 +41,7 @@ class LexerConf:
 
 class ParserConf:
     def __init__(self, rules, callback, start):
-        assert all(len(r)==3 for r in rules)
+        assert all(len(r) == 4 for r in rules)
         self.rules = rules
         self.callback = callback
         self.start = start

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 460af8a..21c5a8b 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -75,6 +75,7 @@ TOKENS = {
     '_TO': '->',
     '_IGNORE': r'%ignore',
    '_IMPORT': r'%import',
+    'NUMBER': r'\d+',
 }
 
 RULES = {
@@ -82,7 +83,8 @@
     '_list':  ['_item', '_list _item'],
     '_item': ['rule', 'token', 'statement', '_NL'],
 
-    'rule': ['RULE _COLON expansions _NL'],
+    'rule': ['RULE _COLON expansions _NL',
+             'RULE _DOT NUMBER _COLON expansions _NL'],
     'expansions': ['alias',
                    'expansions _OR alias',
                    'expansions _NL _OR alias'],
@@ -470,21 +472,29 @@ class Grammar:
 
 
 class RuleOptions:
-    def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False):
+    def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None):
         self.keep_all_tokens = keep_all_tokens
         self.expand1 = expand1
         self.create_token = create_token  # used for scanless postprocessing
+        self.priority = priority
 
         self.filter_out = filter_out  # remove this rule from the tree
                                       # used for "token"-rules in scanless
 
     @classmethod
-    def from_rule(cls, name, expansions):
+    def from_rule(cls, name, *x):
+        if len(x) > 1:
+            priority, expansions = x
+            priority = int(priority)
+        else:
+            expansions, = x
+            priority = None
+
         keep_all_tokens = name.startswith('!')
         name = name.lstrip('!')
         expand1 = name.startswith('?')
         name = name.lstrip('?')
 
-        return name, expansions, cls(keep_all_tokens, expand1)
+        return name, expansions, cls(keep_all_tokens, expand1, priority=priority)
 
@@ -605,7 +615,7 @@ class GrammarLoader:
                 raise GrammarError("Token '%s' defined more than once" % name)
             token_names.add(name)
 
-        rules = [RuleOptions.from_rule(name, x) for name, x in rule_defs]
+        rules = [RuleOptions.from_rule(*x) for x in rule_defs]
 
         rule_names = set()
         for name, _x, _o in rules:

diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py
index 0c21bfb..601372e 100644
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -123,6 +123,6 @@ class ParseTreeBuilder:
                     raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin))
                 setattr(callback, callback_name, alias_handler)
 
-            new_rules.append(( _origin, expansion, callback_name ))
+            new_rules.append(( _origin, expansion, callback_name, options ))
 
         return new_rules, callback
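Note: the variadic from_rule above now accepts two call shapes, the old
(name, expansions) and the new (name, priority, expansions) produced by the
added "RULE _DOT NUMBER _COLON expansions _NL" production. A minimal sketch
of both shapes against the patched module; the '<expansions>' placeholder is
illustrative, since from_rule never inspects that argument:

    from lark.load_grammar import RuleOptions

    # Old shape: "?a: ..." -- no priority given, so it defaults to None.
    name, expansions, opts = RuleOptions.from_rule('?a', '<expansions>')
    assert (name, opts.expand1, opts.priority) == ('a', True, None)

    # New shape: "a.2: ..." -- the NUMBER token arrives before the
    # expansions and is converted to an int.
    name, expansions, opts = RuleOptions.from_rule('a', '2', '<expansions>')
    assert (name, opts.priority) == ('a', 2)
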
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 1646726..a9066f5 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -129,7 +129,7 @@ class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
 
-        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]
+        rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
 
         resolve_ambiguity = (options.ambiguity=='resolve') if options else True
         self.parser = earley.Parser(rules,
@@ -156,7 +156,7 @@ class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         WithLexer.__init__(self, lexer_conf)
 
-        rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]
+        rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules]
 
         resolve_ambiguity = (options.ambiguity=='resolve') if options else True
         self.parser = earley.Parser(rules,

diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py
index 3c02332..dbcbda3 100644
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -224,6 +224,11 @@ class ApplyCallbacks(Transformer_NoRecurse):
         return Tree(rule.origin, children)
 
 def _compare_rules(rule1, rule2):
+    if rule1.options and rule2.options:
+        if rule1.options.priority is not None and rule2.options.priority is not None:
+            assert rule1.options.priority != rule2.options.priority, "Both rules have the same priority: %s == %s" % (rule1, rule2)
+            return -compare(rule1.options.priority, rule2.options.priority)
+
     if rule1.origin != rule2.origin:
         return 0
     c = compare( len(rule1.expansion), len(rule2.expansion))

diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py
index f08a8bd..7dff9ce 100644
--- a/lark/parsers/grammar_analysis.py
+++ b/lark/parsers/grammar_analysis.py
@@ -7,10 +7,11 @@ class Rule(object):
         origin : a symbol
         expansion : a list of symbols
     """
-    def __init__(self, origin, expansion, alias=None):
+    def __init__(self, origin, expansion, alias=None, options=None):
         self.origin = origin
         self.expansion = expansion
         self.alias = alias
+        self.options = options
 
     def __repr__(self):
         return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))
@@ -111,12 +112,12 @@ class GrammarAnalyzer(object):
         self.debug = debug
         rule_tuples = list(rule_tuples)
         rule_tuples.append(('$root', [start_symbol, '$end']))
-        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]
+        rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples]
 
         self.rules = set()
-        self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
-        for origin, exp, alias in rule_tuples:
-            r = Rule( origin, exp, alias )
+        self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples}
+        for origin, exp, alias, options in rule_tuples:
+            r = Rule( origin, exp, alias, options )
             self.rules.add(r)
             self.rules_by_origin[origin].append(r)

diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 1420345..bd519d1 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -9,6 +9,7 @@ from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT
 
 class Parser(object):
     def __init__(self, parser_conf):
+        assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization"
         self.analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start)
         self.analysis.compute_lookahead()
         self.callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
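Note: most hunks above are plumbing for a single representational change --
internal rule tuples grow from (origin, expansion, alias) to (origin,
expansion, alias, options). That is exactly what the tightened len(r) == 4
assertion in ParserConf checks, and what the new LALR assertion inspects. A
sketch of the new shape; field values are illustrative, as if loaded from a
definition like `b.2: "a"`:

    from lark.load_grammar import RuleOptions

    options = RuleOptions(priority=2)  # parsed from the ".2" suffix
    rule = ('b',          # origin: the rule's name
            ['A'],        # expansion: a list of symbol names (illustrative)
            'b',          # alias used to look up the tree-building callback
            options)      # NEW: RuleOptions carrying the priority, or None

    assert len(rule) == 4             # the new ParserConf invariant
    assert rule[3].priority == 2      # consulted by earley._compare_rules
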
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 348993a..d7f6928 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -621,6 +621,30 @@ def _make_parser_test(LEXER, PARSER):
 
             self.assertEqual(len(tree.children), 2)
 
+        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
+        def test_earley_prioritization(self):
+            "Tests effect of priority on result"
+
+            grammar = """
+            start: a | b
+            a.1: "a"
+            b.2: "a"
+            """
+
+            l = Lark(grammar, parser='earley', lexer='standard')
+            res = l.parse("a")
+            self.assertEqual(res.children[0].data, 'b')
+
+            grammar = """
+            start: a | b
+            a.2: "a"
+            b.1: "a"
+            """
+
+            l = Lark(grammar, parser='earley', lexer='standard')
+            res = l.parse("a")
+            self.assertEqual(res.children[0].data, 'a')
+
 
     _NAME = "Test" + PARSER.capitalize() + (LEXER or 'Scanless').capitalize()
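Note: end to end, this is the behavior pinned down by
test_earley_prioritization, plus the failure mode the new LALR assertion
introduces. The LALR expectation is a sketch under the assumption that the
frontend builds its parser eagerly and nothing else rejects the ambiguous
grammar first:

    from lark import Lark

    grammar = """
    start: a | b
    a.1: "a"
    b.2: "a"
    """

    # Earley resolves the ambiguity in favor of the higher-priority rule.
    tree = Lark(grammar, parser='earley', lexer='standard').parse("a")
    assert tree.children[0].data == 'b'

    # LALR refuses prioritized rules up front, per the new assertion.
    try:
        Lark(grammar, parser='lalr')
    except AssertionError as err:
        print(err)   # -> LALR doesn't yet support prioritization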