@@ -111,12 +111,35 @@ def build_mres(tokens, match_whole=False):
     return _build_mres(tokens, len(tokens), match_whole)
 
-class Lexer(object):
+class LineCounter:
+    def __init__(self):
+        self.newline_char = '\n'
+        self.char_pos = 0
+        self.line = 1
+        self.column = 0
+        self.line_start_pos = 0
+
+    def feed(self, token, test_newline=True):
+        """Consume a token and calculate the new line & column.
+
+        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
+        """
+        if test_newline:
+            newlines = token.count(self.newline_char)
+            if newlines:
+                self.line += newlines
+                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
+
+        self.char_pos += len(token)
+        self.column = self.char_pos - self.line_start_pos
+
+
+class Lexer:
     def __init__(self, tokens, ignore=()):
         assert all(isinstance(t, TokenDef) for t in tokens), tokens
 
         self.ignore = ignore
-        self.newline_char = '\n'
         tokens = list(tokens)
 
         # Sanitization
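A note on the conventions `LineCounter` establishes: `line` is 1-based, `column` is 0-based, and the column is derived as `char_pos - line_start_pos`. A minimal sketch of how `feed` behaves (illustrative only, not part of the patch):

    lc = LineCounter()
    lc.feed("AB\nC")
    assert lc.line == 2                # one '\n' consumed, so line went 1 -> 2
    assert lc.char_pos == 4            # four characters consumed in total
    assert lc.column == 1              # 'C' sits at column 0, so the next char lands at column 1
    lc.feed("D", test_newline=False)   # optimization: caller guarantees "D" has no newline
    assert lc.column == 2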
@@ -129,10 +152,7 @@ class Lexer(object):
             if t.pattern.min_width == 0:
                 raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))
 
-        token_names = {t.name for t in tokens}
-        for t in ignore:
-            if t not in token_names:
-                raise LexError("Token '%s' was marked to ignore but it is not defined!" % t)
+        assert set(ignore) <= {t.name for t in tokens}
 
         # Init
         self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
@@ -147,46 +167,8 @@ class Lexer(object):
         self.mres = build_mres(tokens)
 
     def lex(self, stream):
-        lex_pos = 0
-        line = 1
-        col_start_pos = 0
-        newline_types = list(self.newline_types)
-        ignore_types = list(self.ignore_types)
-        while True:
-            for mre, type_from_index in self.mres:
-                m = mre.match(stream, lex_pos)
-                if m:
-                    value = m.group(0)
-                    type_ = type_from_index[m.lastindex]
-                    to_yield = type_ not in ignore_types
-
-                    if to_yield:
-                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
-                        end_col = t.column + len(value)
-                        if t.type in self.callback:
-                            t = self.callback[t.type](t)
-
-                    if type_ in newline_types:
-                        newlines = value.count(self.newline_char)
-                        if newlines:
-                            line += newlines
-                            last_newline_index = value.rindex(self.newline_char) + 1
-                            col_start_pos = lex_pos + last_newline_index
-                            end_col = len(value) - last_newline_index
-
-                    if to_yield:
-                        t.end_line = line
-                        t.end_col = end_col
-                        yield t
-
-                    lex_pos += len(value)
-                    break
-            else:
-                if lex_pos < len(stream):
-                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
-                break
+        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
 
 
 class ContextualLexer:
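Worth noting: the deleted loop also computed `end_line`/`end_col` for each token, and the rewrite drops that bookkeeping (which is why the new test further down keeps the corresponding assertions commented out). A caller who still needs end positions can recover them from the token itself; a hypothetical helper (not part of the patch) mirroring the deleted logic:

    def token_end(t, newline='\n'):
        # Recompute where token t ends, from its start position and value.
        newlines = t.value.count(newline)
        end_line = t.line + newlines
        if newlines:
            end_col = len(t.value) - (t.value.rindex(newline) + 1)
        else:
            end_col = t.column + len(t.value)
        return end_line, end_col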
@@ -218,33 +200,39 @@ class ContextualLexer:
         self.parser_state = state
 
     def lex(self, stream):
-        lex_pos = 0
-        line = 1
-        col_start_pos = 0
-        newline_types = list(self.root_lexer.newline_types)
-        ignore_types = list(self.root_lexer.ignore_types)
+        l = _Lex(self.lexers[self.parser_state])
+        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
+            yield x
+            l.lexer = self.lexers[self.parser_state]
+
+
+class _Lex:
+    "Built to serve both Lexer and ContextualLexer"
+
+    def __init__(self, lexer):
+        self.lexer = lexer
+
+    def lex(self, stream, newline_types, ignore_types):
+        newline_types = list(newline_types)
+        ignore_types = list(ignore_types)
+        line_ctr = LineCounter()
+
         while True:
-            lexer = self.lexers[self.parser_state]
+            lexer = self.lexer
             for mre, type_from_index in lexer.mres:
-                m = mre.match(stream, lex_pos)
+                m = mre.match(stream, line_ctr.char_pos)
                 if m:
                     value = m.group(0)
                     type_ = type_from_index[m.lastindex]
                     if type_ not in ignore_types:
-                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
+                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                         if t.type in lexer.callback:
                             t = lexer.callback[t.type](t)
-                        yield t
-
-                    if type_ in newline_types:
-                        newlines = value.count(lexer.newline_char)
-                        if newlines:
-                            line += newlines
-                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
-                    lex_pos += len(value)
+                        lexer = yield t
+
+                    line_ctr.feed(value, type_ in newline_types)
                     break
             else:
-                if lex_pos < len(stream):
-                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens)
+                if line_ctr.char_pos < len(stream):
+                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                 break
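The design hinge in `_Lex.lex` is that `lexer = self.lexer` is re-read at the top of every pass, so `ContextualLexer.lex` can retarget the generator between tokens simply by assigning `l.lexer` (the value sent into `lexer = yield t` is never used; it is overwritten on the next pass). A self-contained illustration of the same retargeting pattern, with hypothetical names (not part of the patch):

    class Retargetable:
        def __init__(self, fn):
            self.fn = fn                 # the caller may swap this between yields

        def run(self, items):
            for x in items:
                fn = self.fn             # re-read each iteration, like `lexer = self.lexer`
                yield fn(x)

    r = Retargetable(str.upper)
    out = []
    for y in r.run("ab"):
        out.append(y)
        r.fn = str.lower                 # retarget after each yield
    assert out == ['A', 'b']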
@@ -411,6 +411,7 @@ class Grammar:
         terms_to_ignore = {name:'__'+name for name in self.ignore}
         if terms_to_ignore:
+            assert set(terms_to_ignore) <= {name for name, _t in term_defs}
             term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
             expr = Token('RULE', '__ignore')
             for r, tree, _o in rule_defs:
@@ -562,6 +563,7 @@ class GrammarLoader:
         d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
         rules, callback = ParseTreeBuilder(d, T).apply()
 
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
+
         parser_conf = ParserConf(rules, callback, 'start')
         self.parser = LALR(lexer_conf, parser_conf)
@@ -636,7 +638,6 @@ class GrammarLoader:
             ignore_names.append(name)
             token_defs.append((name, (t, 0)))
-
         # Verify correctness 2
         token_names = set()
         for name, _ in token_defs:
@@ -644,6 +645,9 @@ class GrammarLoader:
                 raise GrammarError("Token '%s' defined more than once" % name)
             token_names.add(name)
 
+        if not set(ignore_names) <= token_names:
+            raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))
+
         # Resolve token references
         resolve_token_references(token_defs)
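With the check moved here, an undefined `%ignore` is reported as a `GrammarError` at grammar-load time rather than a `LexError` when the lexer is built (the `Lexer` now only asserts the invariant). Assuming the public `Lark` entry point, the error path looks roughly like this (illustrative only):

    from lark import Lark

    try:
        Lark('''!start: "A"
                %ignore B
             ''')
    except Exception as e:  # GrammarError in lark's exception hierarchy
        print(type(e).__name__, e)  # -> Tokens {'B'} were marked to ignore but were not defined!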
@@ -121,7 +121,7 @@ class ParseTreeBuilder:
             for expansion, alias in expansions:
                 if alias and origin.startswith('_'):
-                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
+                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
 
                 wrapper_chain = filter(None, [
                     (expand1 and not alias) and Expand1,
@@ -127,7 +127,7 @@ class Parser:
             if token == '\n':
                 text_line += 1
-                text_column = 1
+                text_column = 0
             else:
                 text_column += 1
@@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase):
         r = T().transform(g.parse("x"))
         self.assertEqual( r.children, ["<b>"] )
 
         g = Lark("""start: a
                     ?a : b
                     b : "x"
@@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase):
         r = T().transform(g.parse("xx"))
         self.assertEqual( r.children, ["<c>"] )
 
         g = Lark("""start: a
                     ?a : b b -> c
                     b : "x"
                  """, parser='lalr', transformer=T())
         r = g.parse("xx")
         self.assertEqual( r.children, ["<c>"] )
@@ -796,6 +796,39 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(tree.children, ['a', 'A'])
 
+        def test_undefined_ignore(self):
+            g = """!start: "A"
+                %ignore B
+                """
+            self.assertRaises( GrammarError, _Lark, g)
+
+        @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO
+        def test_line_and_column(self):
+            g = r"""!start: "A" bc "D"
+                !bc: "B\nC"
+                """
+            l = _Lark(g)
+            a, bc, d = l.parse("AB\nCD").children
+            self.assertEqual(a.line, 1)
+            self.assertEqual(a.column, 0)
+
+            bc, = bc.children
+            self.assertEqual(bc.line, 1)
+            self.assertEqual(bc.column, 1)
+
+            self.assertEqual(d.line, 2)
+            self.assertEqual(d.column, 1)
+
+            # self.assertEqual(a.end_line, 1)
+            # self.assertEqual(a.end_col, 1)
+            # self.assertEqual(bc.end_line, 2)
+            # self.assertEqual(bc.end_col, 1)
+            # self.assertEqual(d.end_line, 2)
+            # self.assertEqual(d.end_col, 2)
+
         def test_reduce_cycle(self):
             """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
             It seems that the correct solution is to explicitly distinguish finalization in the reduce() function.
@@ -969,7 +1002,7 @@ def _make_parser_test(LEXER, PARSER):
             parser = _Lark(grammar)
 
-            tree = parser.parse("int 1 ! This is a comment\n")
+            tree = parser.parse("int 1 ! This is a comment\n")
             self.assertEqual(tree.children, ['1'])
 
             tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky!