diff --git a/lark/common.py b/lark/common.py
index 61f4950..8335919 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -89,10 +89,11 @@ class PatternRE(Pattern):
         return sre_parse.parse(self.to_regexp()).getwidth()[1]
 
 class TokenDef(object):
-    def __init__(self, name, pattern):
+    def __init__(self, name, pattern, priority=1):
         assert isinstance(pattern, Pattern), pattern
         self.name = name
         self.pattern = pattern
+        self.priority = priority
 
     def __repr__(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
diff --git a/lark/lexer.py b/lark/lexer.py
index 4e6d5b9..eb1fede 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -131,7 +131,7 @@ class Lexer(object):
         self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
         self.ignore_types = [t for t in ignore]
 
-        tokens.sort(key=lambda x:x.pattern.max_width, reverse=True)
+        tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, x.name))
 
         tokens, self.callback = _create_unless(tokens)
         assert all(self.callback.values())
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index ddcffcb..803fee9 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -110,7 +110,8 @@ RULES = {
     'maybe': ['_LBRA expansions _RBRA'],
     'range': ['STRING _DOT _DOT STRING'],
 
-    'token': ['TOKEN _COLON expansions _NL'],
+    'token': ['TOKEN _COLON expansions _NL',
+              'TOKEN _DOT NUMBER _COLON expansions _NL'],
     'statement': ['ignore', 'import'],
     'ignore': ['_IGNORE expansions _NL'],
     'import': ['_IMPORT import_args _NL',
@@ -373,7 +374,7 @@ class Grammar:
         # Implement the "%ignore" feature without a lexer..
         terms_to_ignore = {name:'__'+name for name in self.ignore}
         if terms_to_ignore:
-            assert set(terms_to_ignore) <= {name for name, t in term_defs}
+            assert set(terms_to_ignore) <= {name for name, _t in term_defs}
             term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
             expr = Token('RULE', '__ignore')
             for r, tree, _o in rule_defs:
@@ -388,7 +389,7 @@
             rule_defs.append(('__ignore', _ignore_tree, None))
 
         # Convert all tokens to rules
-        new_terminal_names = {name: '__token_'+name for name, tree in term_defs}
+        new_terminal_names = {name: '__token_'+name for name, _t in term_defs}
 
         for name, tree, options in rule_defs:
             for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
@@ -396,11 +397,11 @@
                     if sym in new_terminal_names:
                         exp.children[i] = Token(sym.type, new_terminal_names[sym])
 
-        for name, tree in term_defs:
+        for name, (tree, priority) in term_defs:    # TODO transfer priority to rule?
             if name.startswith('_'):
-                options = RuleOptions(filter_out=True)
+                options = RuleOptions(filter_out=True, priority=priority)
             else:
-                options = RuleOptions(keep_all_tokens=True, create_token=name)
+                options = RuleOptions(keep_all_tokens=True, create_token=name, priority=priority)
 
             name = new_terminal_names[name]
             inner_name = name + '_inner'
@@ -423,8 +424,8 @@
 
         # Convert token-trees to strings/regexps
         transformer = PrepareLiterals() * TokenTreeToPattern()
-        tokens = [TokenDef(name, transformer.transform(token_tree))
-                  for name, token_tree in token_defs]
+        tokens = [TokenDef(name, transformer.transform(token_tree), priority)
+                  for name, (token_tree, priority) in token_defs]
 
         # =================
         #  Compile Rules
@@ -504,7 +505,7 @@ def resolve_token_references(token_defs):
     while True:
         changed = False
 
-        for name, token_tree in token_defs:
+        for name, (token_tree, _p) in token_defs:
             for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
                 for i, item in enumerate(exp.children):
                     if isinstance(item, Token):
@@ -555,7 +556,9 @@ class GrammarLoader:
         statements = [c.children for c in tree.children if c.data=='statement']
        assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children)
 
-        token_defs = [(name.value, t) for name, t in token_defs]
+        token_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in token_defs]
+
+        token_defs = [(name.value, (t, int(p))) for name, p, t in token_defs]
 
        # Execute statements
        ignore = []
@@ -568,8 +571,9 @@
                name = stmt.children[1] if len(stmt.children)>1 else dotted_path[-1]
                grammar_path = os.path.join(*dotted_path[:-1]) + '.g'
                g = import_grammar(grammar_path)
-                token_tree = dict(g.token_defs)[dotted_path[-1]]
-                token_defs.append([name.value, token_tree])
+                token_options = dict(g.token_defs)[dotted_path[-1]]
+                assert isinstance(token_options, tuple) and len(token_options)==2
+                token_defs.append([name.value, token_options])
            else:
                assert False, stmt
 
@@ -594,7 +598,7 @@
 
            name = '__IGNORE_%d'% len(ignore_names)
            ignore_names.append(name)
-            token_defs.append((name, t))
+            token_defs.append((name, (t, 0)))
 
        # Verify correctness 2
diff --git a/tests/test_parser.py b/tests/test_parser.py
index cce6a37..3de98c6 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -635,7 +635,8 @@ def _make_parser_test(LEXER, PARSER):
                b.2: "a"
                """
 
-            l = Lark(grammar, parser='earley', lexer='standard')
+            # l = Lark(grammar, parser='earley', lexer='standard')
+            l = _Lark(grammar)
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'b')
 
@@ -645,10 +646,55 @@
                b.1: "a"
                """
 
-            l = Lark(grammar, parser='earley', lexer='standard')
+            l = _Lark(grammar)
+            # l = Lark(grammar, parser='earley', lexer='standard')
            res = l.parse("a")
            self.assertEqual(res.children[0].data, 'a')
 
 
+        @unittest.skipIf(LEXER != 'standard', "Only standard lexers care about token priority")
+        def test_lexer_prioritization(self):
+            "Tests effect of priority on result"
+
+            grammar = """
+            start: A B | AB
+            A.2: "a"
+            B: "b"
+            AB: "ab"
+            """
+            l = _Lark(grammar)
+            res = l.parse("ab")
+
+            self.assertEqual(res.children, ['a', 'b'])
+            self.assertNotEqual(res.children, ['ab'])
+
+            grammar = """
+            start: A B | AB
+            A: "a"
+            B: "b"
+            AB.3: "ab"
+            """
+            l = _Lark(grammar)
+            res = l.parse("ab")
+
+            self.assertNotEqual(res.children, ['a', 'b'])
+            self.assertEqual(res.children, ['ab'])
+
+
+        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports ambiguity")
+        def test_ambiguity1(self):
+            grammar = """
+            start: cd+ "e"
+
+            !cd: "c"
+               | "d"
+               | "cd"
+
+            """
+            l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=None)
+            x = l.parse('cde')
+            assert x.data == '_ambig'
+            assert len(x.children) == 2
+
        @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
        def test_earley_prioritization_sum(self):