
Bugfix: Fixed the %import TOKEN feature

Tag: tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan, 7 years ago
Commit 9b8ae7aecb
2 changed files with 34 additions and 6 deletions
  1. lark/load_grammar.py    +18  -5
  2. tests/test_parser.py    +16  -1
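
For context, the pattern this commit repairs is importing a terminal from the
common library and then naming it in %ignore. A minimal sketch based on the
test added below (the parser option here is one of several configurations the
test suite runs; treat it as an assumption):

    from lark import Lark

    # Grammar exercising the fixed path: %import brings in common.WS,
    # and %ignore then refers to that imported terminal by name.
    grammar = r'''
    start: "Hello" NAME
    NAME: /\w/+
    %import common.WS
    %ignore WS
    '''

    parser = Lark(grammar, parser='earley')
    tree = parser.parse('Hello World')
    print(tree.children)   # per the test below: just the NAME token, 'World'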

lark/load_grammar.py  (+18, -5)

@@ -561,8 +561,8 @@ class GrammarLoader:
         ignore = []
         for (stmt,) in statements:
             if stmt.data == 'ignore':
-                expansions ,= stmt.children
-                ignore.append(expansions)
+                t ,= stmt.children
+                ignore.append(t)
             elif stmt.data == 'import':
                 dotted_path = stmt.children[0].children
                 name = stmt.children[1] if len(stmt.children)>1 else dotted_path[-1]
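
Aside: the ",=" in both the removed and added lines is Python's one-element
unpack. A standalone illustration:

    # "t ,= stmt.children" requires the sequence to hold exactly one item,
    # raising ValueError otherwise.
    children = ['WS']
    t ,= children        # equivalent to (t,) = children
    assert t == 'WS'

    try:
        t ,= ['a', 'b']  # more than one item
    except ValueError:
        print('not exactly one element')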
@@ -580,9 +580,22 @@ class GrammarLoader:
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
 
         # Handle ignore tokens
-        ignore_defs = [('__IGNORE_%d'%i, t) for i, t in enumerate(ignore)]
-        ignore_names = [name for name,_ in ignore_defs]
-        token_defs += ignore_defs
+        # XXX A slightly hacky solution. Recognition of %ignore TOKEN as separate comes from the lexer's
+        #     inability to handle duplicate tokens (two names, one value)
+        ignore_names = []
+        for t in ignore:
+            if t.data=='expansions' and len(t.children) == 1:
+                t2 ,= t.children
+                if t2.data=='expansion' and len(t2.children) == 1:
+                    item ,= t2.children
+                    if isinstance(item, Token) and item.type == 'TOKEN':
+                        ignore_names.append(item.value)
+                        continue
+
+            name = '__IGNORE_%d'% len(ignore_names)
+            ignore_names.append(name)
+            token_defs.append((name, t))
+
 
         # Verify correctness 2
         token_names = set()
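
The XXX comment above is the heart of the change: when the ignored expression
is exactly one named terminal, the loader now reuses that terminal's name
instead of defining a duplicate __IGNORE_n token with the same value. A
standalone sketch of that shape check, with stand-in Tree/Token classes that
only mimic what the loader traverses (the real classes are lark's own):

    from collections import namedtuple

    Token = namedtuple('Token', ['type', 'value'])

    class Tree:
        def __init__(self, data, children):
            self.data = data
            self.children = children

    def bare_token_name(t):
        """Return the terminal's name if t is a bare, single TOKEN reference."""
        if t.data == 'expansions' and len(t.children) == 1:
            t2 ,= t.children
            if t2.data == 'expansion' and len(t2.children) == 1:
                item ,= t2.children
                if isinstance(item, Token) and item.type == 'TOKEN':
                    return item.value
        return None

    # %ignore WS parses roughly to: expansions -> expansion -> Token('TOKEN', 'WS')
    ws_expr = Tree('expansions', [Tree('expansion', [Token('TOKEN', 'WS')])])
    assert bare_token_name(ws_expr) == 'WS'   # reuse the existing name WS

    # Anything more complex falls through and gets a fresh __IGNORE_n name.
    multi = Tree('expansions', [Tree('expansion', [Token('TOKEN', 'A'), Token('TOKEN', 'B')])])
    assert bare_token_name(multi) is None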


tests/test_parser.py  (+16, -1)

@@ -47,7 +47,10 @@ class TestParsers(unittest.TestCase):
 
         self.assertRaises(GrammarError, Lark, g, parser='lalr')
 
-        l = Lark(g, parser='earley')
+        l = Lark(g, parser='earley', lexer=None)
+        self.assertRaises(ParseError, l.parse, 'a')
+
+        l = Lark(g, parser='earley', lexer='dynamic')
         self.assertRaises(ParseError, l.parse, 'a')


@@ -385,6 +388,18 @@ def _make_parser_test(LEXER, PARSER):
         x = g.parse('Hello HelloWorld')
         self.assertSequenceEqual(x.children, ['HelloWorld'])
 
+    def test_token_collision_WS(self):
+        g = _Lark("""start: "Hello" NAME
+            NAME: /\w/+
+            %import common.WS
+            %ignore WS
+            """)
+        x = g.parse('Hello World')
+        self.assertSequenceEqual(x.children, ['World'])
+        x = g.parse('Hello HelloWorld')
+        self.assertSequenceEqual(x.children, ['HelloWorld'])
+
+
     def test_token_collision2(self):
         # NOTE: This test reveals a bug in token reconstruction in Scanless Earley
         #       I probably need to re-write grammar transformation
