
Improved grammar validation and refactored the lexers

Commit 38c5fd244a by Erez Shinan, 6 years ago
Tag: tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.2
5 changed files, with 95 additions and 70 deletions
  1. lark/lexer.py (+51, -63)
  2. lark/load_grammar.py (+5, -1)
  3. lark/parse_tree_builder.py (+1, -1)
  4. lark/parsers/xearley.py (+1, -1)
  5. tests/test_parser.py (+37, -4)

lark/lexer.py (+51, -63)

@@ -111,12 +111,35 @@ def build_mres(tokens, match_whole=False):
return _build_mres(tokens, len(tokens), match_whole)


class Lexer(object):
class LineCounter:
def __init__(self):
self.newline_char = '\n'
self.char_pos = 0
self.line = 1
self.column = 0
self.line_start_pos = 0

def feed(self, token, test_newline=True):
"""Consume a token and calculat the new line & column.

As an optional optimization, set test_newline=False if token doesn't contain a newline.
"""
if test_newline:
newlines = token.count(self.newline_char)
if newlines:
self.line += newlines
self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1

self.char_pos += len(token)
self.column = self.char_pos - self.line_start_pos



class Lexer:
def __init__(self, tokens, ignore=()):
assert all(isinstance(t, TokenDef) for t in tokens), tokens

self.ignore = ignore
self.newline_char = '\n'
tokens = list(tokens)

# Sanitization
@@ -129,10 +152,7 @@ class Lexer(object):
if t.pattern.min_width == 0:
raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

token_names = {t.name for t in tokens}
for t in ignore:
if t not in token_names:
raise LexError("Token '%s' was marked to ignore but it is not defined!" % t)
assert set(ignore) <= {t.name for t in tokens}

# Init
self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
@@ -147,46 +167,8 @@ class Lexer(object):

self.mres = build_mres(tokens)


def lex(self, stream):
lex_pos = 0
line = 1
col_start_pos = 0
newline_types = list(self.newline_types)
ignore_types = list(self.ignore_types)
while True:
for mre, type_from_index in self.mres:
m = mre.match(stream, lex_pos)
if m:
value = m.group(0)
type_ = type_from_index[m.lastindex]
to_yield = type_ not in ignore_types

if to_yield:
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
end_col = t.column + len(value)
if t.type in self.callback:
t = self.callback[t.type](t)

if type_ in newline_types:
newlines = value.count(self.newline_char)
if newlines:
line += newlines
last_newline_index = value.rindex(self.newline_char) + 1
col_start_pos = lex_pos + last_newline_index
end_col = len(value) - last_newline_index

if to_yield:
t.end_line = line
t.end_col = end_col
yield t

lex_pos += len(value)
break
else:
if lex_pos < len(stream):
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
break
return _Lex(self).lex(stream, self.newline_types, self.ignore_types)


class ContextualLexer:
@@ -218,33 +200,39 @@ class ContextualLexer:
self.parser_state = state

def lex(self, stream):
lex_pos = 0
line = 1
col_start_pos = 0
newline_types = list(self.root_lexer.newline_types)
ignore_types = list(self.root_lexer.ignore_types)
l = _Lex(self.lexers[self.parser_state])
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
yield x
l.lexer = self.lexers[self.parser_state]


class _Lex:
"Built to serve both Lexer and ContextualLexer"
def __init__(self, lexer):
self.lexer = lexer

def lex(self, stream, newline_types, ignore_types):
newline_types = list(newline_types)
ignore_types = list(ignore_types)
line_ctr = LineCounter()

while True:
lexer = self.lexers[self.parser_state]
lexer = self.lexer
for mre, type_from_index in lexer.mres:
m = mre.match(stream, lex_pos)
m = mre.match(stream, line_ctr.char_pos)
if m:
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
yield t

if type_ in newline_types:
newlines = value.count(lexer.newline_char)
if newlines:
line += newlines
col_start_pos = lex_pos + value.rindex(lexer.newline_char)
lex_pos += len(value)
lexer = yield t

line_ctr.feed(value, type_ in newline_types)
break
else:
if lex_pos < len(stream):
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens)
if line_ctr.char_pos < len(stream):
raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
break
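
The position bookkeeping that used to be duplicated in Lexer.lex and ContextualLexer.lex now lives in LineCounter, and the matching loop itself moves into _Lex, which serves both lexer classes; ContextualLexer simply swaps l.lexer to the lexer for the current parser state after each yielded token. A minimal sketch of LineCounter on its own (not part of the commit; the token values below are made up for illustration):

from lark.lexer import LineCounter

lc = LineCounter()
for value in ["int", " ", "x", "\n  ", "y"]:
    # test_newline=False is safe only when the value is known not to contain a newline
    lc.feed(value, test_newline='\n' in value)

print(lc.line, lc.column, lc.char_pos)  # -> 2 3 9  (line is 1-based, column is 0-based)
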


lark/load_grammar.py (+5, -1)

@@ -411,6 +411,7 @@ class Grammar:
terms_to_ignore = {name:'__'+name for name in self.ignore}
if terms_to_ignore:
assert set(terms_to_ignore) <= {name for name, _t in term_defs}

term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
expr = Token('RULE', '__ignore')
for r, tree, _o in rule_defs:
@@ -562,6 +563,7 @@ class GrammarLoader:
d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
rules, callback = ParseTreeBuilder(d, T).apply()
lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])

parser_conf = ParserConf(rules, callback, 'start')
self.parser = LALR(lexer_conf, parser_conf)

@@ -636,7 +638,6 @@ class GrammarLoader:
ignore_names.append(name)
token_defs.append((name, (t, 0)))


# Verify correctness 2
token_names = set()
for name, _ in token_defs:
@@ -644,6 +645,9 @@ class GrammarLoader:
raise GrammarError("Token '%s' defined more than once" % name)
token_names.add(name)

if set(ignore_names) > token_names:
raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))

# Resolve token references
resolve_token_references(token_defs)
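
With this check, a grammar that marks an undefined token with %ignore now fails at load time with a GrammarError instead of slipping through to the lexer. A hedged sketch of the behaviour, mirroring the new test_undefined_ignore test in tests/test_parser.py (the lark.common import path is an assumption about where GrammarError lives in 0.5.x):

from lark import Lark
from lark.common import GrammarError  # assumed location of GrammarError in 0.5.x

grammar = '''!start: "A"

%ignore B
'''

try:
    Lark(grammar, parser='lalr')
except GrammarError as e:
    print(e)  # e.g. Tokens {'B'} were marked to ignore but were not defined!
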



lark/parse_tree_builder.py (+1, -1)

@@ -121,7 +121,7 @@ class ParseTreeBuilder:

for expansion, alias in expansions:
if alias and origin.startswith('_'):
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))

wrapper_chain = filter(None, [
(expand1 and not alias) and Expand1,


lark/parsers/xearley.py (+1, -1)

@@ -127,7 +127,7 @@ class Parser:

if token == '\n':
text_line += 1
text_column = 1
text_column = 0
else:
text_column += 1



tests/test_parser.py (+37, -4)

@@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase):
r = T().transform(g.parse("x"))
self.assertEqual( r.children, ["<b>"] )

g = Lark("""start: a
?a : b
b : "x"
@@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase):
r = T().transform(g.parse("xx"))
self.assertEqual( r.children, ["<c>"] )

g = Lark("""start: a
?a : b b -> c
b : "x"
""", parser='lalr', transformer=T())
r = g.parse("xx")
self.assertEqual( r.children, ["<c>"] )



@@ -796,6 +796,39 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(tree.children, ['a', 'A'])


def test_undefined_ignore(self):
g = """!start: "A"

%ignore B
"""
self.assertRaises( GrammarError, _Lark, g)

@unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO
def test_line_and_column(self):
g = r"""!start: "A" bc "D"
!bc: "B\nC"
"""
l = _Lark(g)
a, bc, d = l.parse("AB\nCD").children
self.assertEqual(a.line, 1)
self.assertEqual(a.column, 0)

bc ,= bc.children
self.assertEqual(bc.line, 1)
self.assertEqual(bc.column, 1)

self.assertEqual(d.line, 2)
self.assertEqual(d.column, 1)

# self.assertEqual(a.end_line, 1)
# self.assertEqual(a.end_col, 1)
# self.assertEqual(bc.end_line, 2)
# self.assertEqual(bc.end_col, 1)
# self.assertEqual(d.end_line, 2)
# self.assertEqual(d.end_col, 2)



def test_reduce_cycle(self):
"""Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
@@ -969,7 +1002,7 @@ def _make_parser_test(LEXER, PARSER):

parser = _Lark(grammar)

tree = parser.parse("int 1 ! This is a comment\n")
tree = parser.parse("int 1 ! This is a comment\n")
self.assertEqual(tree.children, ['1'])

tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky!
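
The new tests also pin down the position convention on tokens after the refactor: line is 1-based and column is 0-based, which is why xearley now resets text_column to 0 on a newline. A small usage sketch reusing the grammar from test_line_and_column above:

from lark import Lark

grammar = r'''!start: "A" bc "D"
!bc: "B\nC"
'''

parser = Lark(grammar, parser='lalr')
a, bc, d = parser.parse("AB\nCD").children

print(a.line, a.column)  # 1 0 -- "A" is the first character of the input
print(d.line, d.column)  # 2 1 -- "D" follows "C" on the second line
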

