
Improved grammar validation and refactored the lexers

Erez Shinan · 6 years ago
commit 38c5fd244a
5 changed files with 95 additions and 70 deletions
  1. lark/lexer.py (+51 -63)
  2. lark/load_grammar.py (+5 -1)
  3. lark/parse_tree_builder.py (+1 -1)
  4. lark/parsers/xearley.py (+1 -1)
  5. tests/test_parser.py (+37 -4)

lark/lexer.py (+51 -63)

@@ -111,12 +111,35 @@ def build_mres(tokens, match_whole=False):
     return _build_mres(tokens, len(tokens), match_whole)
 
 
-class Lexer(object):
+class LineCounter:
+    def __init__(self):
+        self.newline_char = '\n'
+        self.char_pos = 0
+        self.line = 1
+        self.column = 0
+        self.line_start_pos = 0
+
+    def feed(self, token, test_newline=True):
+        """Consume a token and calculate the new line & column.
+
+        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
+        """
+        if test_newline:
+            newlines = token.count(self.newline_char)
+            if newlines:
+                self.line += newlines
+                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
+
+        self.char_pos += len(token)
+        self.column = self.char_pos - self.line_start_pos
+
+
+class Lexer:
     def __init__(self, tokens, ignore=()):
         assert all(isinstance(t, TokenDef) for t in tokens), tokens
 
         self.ignore = ignore
-        self.newline_char = '\n'
         tokens = list(tokens)
 
         # Sanitization
@@ -129,10 +152,7 @@ class Lexer(object):
             if t.pattern.min_width == 0:
                 raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))
 
-        token_names = {t.name for t in tokens}
-        for t in ignore:
-            if t not in token_names:
-                raise LexError("Token '%s' was marked to ignore but it is not defined!" % t)
+        assert set(ignore) <= {t.name for t in tokens}
 
         # Init
         self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
@@ -147,46 +167,8 @@ class Lexer(object):
 
         self.mres = build_mres(tokens)
 
     def lex(self, stream):
-        lex_pos = 0
-        line = 1
-        col_start_pos = 0
-        newline_types = list(self.newline_types)
-        ignore_types = list(self.ignore_types)
-        while True:
-            for mre, type_from_index in self.mres:
-                m = mre.match(stream, lex_pos)
-                if m:
-                    value = m.group(0)
-                    type_ = type_from_index[m.lastindex]
-                    to_yield = type_ not in ignore_types
-
-                    if to_yield:
-                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
-                        end_col = t.column + len(value)
-                        if t.type in self.callback:
-                            t = self.callback[t.type](t)
-
-                    if type_ in newline_types:
-                        newlines = value.count(self.newline_char)
-                        if newlines:
-                            line += newlines
-                            last_newline_index = value.rindex(self.newline_char) + 1
-                            col_start_pos = lex_pos + last_newline_index
-                            end_col = len(value) - last_newline_index
-
-                    if to_yield:
-                        t.end_line = line
-                        t.end_col = end_col
-                        yield t
-
-                    lex_pos += len(value)
-                    break
-            else:
-                if lex_pos < len(stream):
-                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
-                break
+        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
 
 
 class ContextualLexer:
@@ -218,33 +200,39 @@ class ContextualLexer:
         self.parser_state = state
 
     def lex(self, stream):
-        lex_pos = 0
-        line = 1
-        col_start_pos = 0
-        newline_types = list(self.root_lexer.newline_types)
-        ignore_types = list(self.root_lexer.ignore_types)
+        l = _Lex(self.lexers[self.parser_state])
+        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
+            yield x
+            l.lexer = self.lexers[self.parser_state]
+
+
+class _Lex:
+    "Built to serve both Lexer and ContextualLexer"
+    def __init__(self, lexer):
+        self.lexer = lexer
+
+    def lex(self, stream, newline_types, ignore_types):
+        newline_types = list(newline_types)
+        ignore_types = list(ignore_types)
+        line_ctr = LineCounter()
+
         while True:
-            lexer = self.lexers[self.parser_state]
+            lexer = self.lexer
             for mre, type_from_index in lexer.mres:
-                m = mre.match(stream, lex_pos)
+                m = mre.match(stream, line_ctr.char_pos)
                 if m:
                     value = m.group(0)
                     type_ = type_from_index[m.lastindex]
                     if type_ not in ignore_types:
-                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
+                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                         if t.type in lexer.callback:
                             t = lexer.callback[t.type](t)
-                        yield t
-
-                    if type_ in newline_types:
-                        newlines = value.count(lexer.newline_char)
-                        if newlines:
-                            line += newlines
-                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
-                    lex_pos += len(value)
+                        lexer = yield t
+
+                    line_ctr.feed(value, type_ in newline_types)
                     break
             else:
-                if lex_pos < len(stream):
-                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens)
+                if line_ctr.char_pos < len(stream):
+                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                 break
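Taken together, the lexer.py changes move all position bookkeeping, previously duplicated (and slightly inconsistent) between Lexer.lex and ContextualLexer.lex, into LineCounter, and a single _Lex driver now serves both lexers by swapping its lexer attribute between tokens. A quick sanity sketch of LineCounter's behavior; it assumes the class is importable from lark.lexer as defined above, and the driver loop is illustrative, not lark's API:

    from lark.lexer import LineCounter   # assumption: module-level class, per the diff above

    lc = LineCounter()
    for chunk in ["AB", "\n", "CD"]:
        # line/column describe the position *before* the chunk is consumed
        print(repr(chunk), lc.line, lc.column)
        lc.feed(chunk, test_newline='\n' in chunk)

    # 'AB' 1 0
    # '\n' 1 2
    # 'CD' 2 0

Note that columns are 0-based (the first character of a line is column 0), which is the convention the new test_line_and_column test checks.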



lark/load_grammar.py (+5 -1)

@@ -411,6 +411,7 @@ class Grammar:
         terms_to_ignore = {name:'__'+name for name in self.ignore}
         if terms_to_ignore:
             assert set(terms_to_ignore) <= {name for name, _t in term_defs}
+
             term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
             expr = Token('RULE', '__ignore')
             for r, tree, _o in rule_defs:
@@ -562,6 +563,7 @@ class GrammarLoader:
         d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
         rules, callback = ParseTreeBuilder(d, T).apply()
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
+
         parser_conf = ParserConf(rules, callback, 'start')
         self.parser = LALR(lexer_conf, parser_conf)


@@ -636,7 +638,6 @@ class GrammarLoader:
                 ignore_names.append(name)
                 token_defs.append((name, (t, 0)))
 
-
         # Verify correctness 2
         token_names = set()
         for name, _ in token_defs:
@@ -644,6 +645,9 @@ class GrammarLoader:
                 raise GrammarError("Token '%s' defined more than once" % name)
             token_names.add(name)
 
+        if set(ignore_names) > token_names:
+            raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))
+
         # Resolve token references
         resolve_token_references(token_defs)
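The practical effect of the new check: a grammar that marks an undefined terminal with %ignore now fails at grammar-load time with a GrammarError instead of slipping through to the lexer. A minimal reproduction, mirroring the new test_undefined_ignore test below; the GrammarError import path is an assumption for lark 0.5.x:

    from lark import Lark
    from lark.common import GrammarError   # assumed location of GrammarError in this version

    g = """!start: "A"

        %ignore B
        """
    try:
        Lark(g)
    except GrammarError as e:
        print(e)   # ... were marked to ignore but were not defined!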




lark/parse_tree_builder.py (+1 -1)

@@ -121,7 +121,7 @@ class ParseTreeBuilder:
 
             for expansion, alias in expansions:
                 if alias and origin.startswith('_'):
-                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
+                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
 
             wrapper_chain = filter(None, [
                 (expand1 and not alias) and Expand1,
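For context, the exception above fires when an underscore-prefixed rule (which gets expanded into its parent) also carries an alias. A hypothetical grammar that would trigger it, not taken from the commit:

    from lark import Lark

    # _list is marked for expansion by its leading underscore, so the alias is rejected
    Lark('''start: _list
            _list: "x" "y" -> pair
            ''')
    # raises: Rule _list is marked for expansion (it starts with an underscore)
    #         and isn't allowed to have aliases (alias=pair)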


lark/parsers/xearley.py (+1 -1)

@@ -127,7 +127,7 @@ class Parser:
 
                 if token == '\n':
                     text_line += 1
-                    text_column = 1
+                    text_column = 0
                 else:
                     text_column += 1
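This one-character change aligns xearley's manually tracked text_column with LineCounter's 0-based columns: the first character of a line is column 0, as the new test_line_and_column test expects. A standalone sketch of the shared convention, not lark code:

    def positions(text):
        # 0-based column: number of characters since the most recent newline
        line, column = 1, 0
        out = []
        for ch in text:
            out.append((ch, line, column))
            if ch == '\n':
                line, column = line + 1, 0   # reset to 0, matching the xearley fix
            else:
                column += 1
        return out

    # positions("AB\nCD") -> [('A', 1, 0), ('B', 1, 1), ('\n', 1, 2), ('C', 2, 0), ('D', 2, 1)]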




tests/test_parser.py (+37 -4)

@@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase):
         r = T().transform(g.parse("x"))
         self.assertEqual( r.children, ["<b>"] )
-
+
         g = Lark("""start: a
                     ?a : b
                     b : "x"
@@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase):
         r = T().transform(g.parse("xx"))
         self.assertEqual( r.children, ["<c>"] )
-
+
         g = Lark("""start: a
                     ?a : b b -> c
                     b : "x"
                     """, parser='lalr', transformer=T())
         r = g.parse("xx")
         self.assertEqual( r.children, ["<c>"] )
-
+

@@ -796,6 +796,39 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(tree.children, ['a', 'A'])
 
+        def test_undefined_ignore(self):
+            g = """!start: "A"
+
+                %ignore B
+                """
+            self.assertRaises( GrammarError, _Lark, g)
+
+        @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO
+        def test_line_and_column(self):
+            g = r"""!start: "A" bc "D"
+                !bc: "B\nC"
+                """
+            l = _Lark(g)
+            a, bc, d = l.parse("AB\nCD").children
+            self.assertEqual(a.line, 1)
+            self.assertEqual(a.column, 0)
+
+            bc ,= bc.children
+            self.assertEqual(bc.line, 1)
+            self.assertEqual(bc.column, 1)
+
+            self.assertEqual(d.line, 2)
+            self.assertEqual(d.column, 1)
+
+            # self.assertEqual(a.end_line, 1)
+            # self.assertEqual(a.end_col, 1)
+            # self.assertEqual(bc.end_line, 2)
+            # self.assertEqual(bc.end_col, 1)
+            # self.assertEqual(d.end_line, 2)
+            # self.assertEqual(d.end_col, 2)
+
+
         def test_reduce_cycle(self):
             """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
             It seems that the correct solution is to explicitely distinguish finalization in the reduce() function.
@@ -969,7 +1002,7 @@ def _make_parser_test(LEXER, PARSER):
 
             parser = _Lark(grammar)
 
-            tree = parser.parse("int 1 ! This is a comment\n")
+            tree = parser.parse("int 1 ! This is a comment\n")
             self.assertEqual(tree.children, ['1'])
 
             tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky!

