diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 7045c36..ddcffcb 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -561,8 +561,8 @@ class GrammarLoader:
         ignore = []
         for (stmt,) in statements:
             if stmt.data == 'ignore':
-                expansions ,= stmt.children
-                ignore.append(expansions)
+                t ,= stmt.children
+                ignore.append(t)
             elif stmt.data == 'import':
                 dotted_path = stmt.children[0].children
                 name = stmt.children[1] if len(stmt.children)>1 else dotted_path[-1]
@@ -580,9 +580,22 @@ class GrammarLoader:
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
 
         # Handle ignore tokens
-        ignore_defs = [('__IGNORE_%d'%i, t) for i, t in enumerate(ignore)]
-        ignore_names = [name for name,_ in ignore_defs]
-        token_defs += ignore_defs
+        # XXX A slightly hacky solution. Recognition of %ignore TOKEN as separate comes from the lexer's
+        #     inability to handle duplicate tokens (two names, one value)
+        ignore_names = []
+        for t in ignore:
+            if t.data=='expansions' and len(t.children) == 1:
+                t2 ,= t.children
+                if t2.data=='expansion' and len(t2.children) == 1:
+                    item ,= t2.children
+                    if isinstance(item, Token) and item.type == 'TOKEN':
+                        ignore_names.append(item.value)
+                        continue
+
+            name = '__IGNORE_%d'% len(ignore_names)
+            ignore_names.append(name)
+            token_defs.append((name, t))
+
 
         # Verify correctness 2
         token_names = set()
diff --git a/tests/test_parser.py b/tests/test_parser.py
index f81d8eb..b50cd50 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -47,7 +47,10 @@ class TestParsers(unittest.TestCase):
 
         self.assertRaises(GrammarError, Lark, g, parser='lalr')
 
-        l = Lark(g, parser='earley')
+        l = Lark(g, parser='earley', lexer=None)
+        self.assertRaises(ParseError, l.parse, 'a')
+
+        l = Lark(g, parser='earley', lexer='dynamic')
         self.assertRaises(ParseError, l.parse, 'a')
 
 
@@ -385,6 +388,18 @@ def _make_parser_test(LEXER, PARSER):
         x = g.parse('Hello HelloWorld')
         self.assertSequenceEqual(x.children, ['HelloWorld'])
 
+    def test_token_collision_WS(self):
+        g = _Lark("""start: "Hello" NAME
+            NAME: /\w/+
+            %import common.WS
+            %ignore WS
+            """)
+        x = g.parse('Hello World')
+        self.assertSequenceEqual(x.children, ['World'])
+        x = g.parse('Hello HelloWorld')
+        self.assertSequenceEqual(x.children, ['HelloWorld'])
+
+
    def test_token_collision2(self):
         # NOTE: This test reveals a bug in token reconstruction in Scanless Earley
         # I probably need to re-write grammar transformation
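
Note (not part of the patch): a minimal usage sketch of what the load_grammar.py change enables, mirroring the new test_token_collision_WS test. Per the XXX comment above, the lexer cannot handle two token names sharing one value; the old code always minted a synthetic __IGNORE_n token, so "%ignore WS" after "%import common.WS" defined a duplicate of the WS terminal. The new code recognizes a bare TOKEN after %ignore and reuses its existing name instead. Assuming the Lark API at this revision:

    from lark import Lark

    grammar = r"""
    start: "Hello" NAME
    NAME: /\w/+
    %import common.WS
    %ignore WS
    """

    # %ignore WS now resolves to the WS terminal itself, so loading the
    # grammar no longer defines a second token with the same pattern.
    parser = Lark(grammar, parser='earley', lexer='dynamic')

    tree = parser.parse('Hello World')
    print(tree.children)   # compares equal to ['World'], per the new test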