diff --git a/lark/lexer.py b/lark/lexer.py
index 66923b0..ba920c6 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -111,12 +111,35 @@ def build_mres(tokens, match_whole=False):
     return _build_mres(tokens, len(tokens), match_whole)


-class Lexer(object):
+class LineCounter:
+    def __init__(self):
+        self.newline_char = '\n'
+        self.char_pos = 0
+        self.line = 1
+        self.column = 0
+        self.line_start_pos = 0
+
+    def feed(self, token, test_newline=True):
+        """Consume a token and calculate the new line & column.
+
+        As an optional optimization, set test_newline=False if the token doesn't contain a newline.
+        """
+        if test_newline:
+            newlines = token.count(self.newline_char)
+            if newlines:
+                self.line += newlines
+                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
+
+        self.char_pos += len(token)
+        self.column = self.char_pos - self.line_start_pos
+
+
+
+class Lexer:
     def __init__(self, tokens, ignore=()):
         assert all(isinstance(t, TokenDef) for t in tokens), tokens

         self.ignore = ignore
-        self.newline_char = '\n'
         tokens = list(tokens)

         # Sanitization
@@ -129,10 +152,7 @@ class Lexer(object):
             if t.pattern.min_width == 0:
                 raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern))

-        token_names = {t.name for t in tokens}
-        for t in ignore:
-            if t not in token_names:
-                raise LexError("Token '%s' was marked to ignore but it is not defined!" % t)
+        assert set(ignore) <= {t.name for t in tokens}

         # Init
         self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())]
@@ -147,46 +167,8 @@ class Lexer(object):

         self.mres = build_mres(tokens)

-    def lex(self, stream):
-        lex_pos = 0
-        line = 1
-        col_start_pos = 0
-        newline_types = list(self.newline_types)
-        ignore_types = list(self.ignore_types)
-        while True:
-            for mre, type_from_index in self.mres:
-                m = mre.match(stream, lex_pos)
-                if m:
-                    value = m.group(0)
-                    type_ = type_from_index[m.lastindex]
-                    to_yield = type_ not in ignore_types
-
-                    if to_yield:
-                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
-                        end_col = t.column + len(value)
-                        if t.type in self.callback:
-                            t = self.callback[t.type](t)
-
-                    if type_ in newline_types:
-                        newlines = value.count(self.newline_char)
-                        if newlines:
-                            line += newlines
-                            last_newline_index = value.rindex(self.newline_char) + 1
-                            col_start_pos = lex_pos + last_newline_index
-                            end_col = len(value) - last_newline_index
-
-                    if to_yield:
-                        t.end_line = line
-                        t.end_col = end_col
-                        yield t
-
-                    lex_pos += len(value)
-                    break
-            else:
-                if lex_pos < len(stream):
-                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
-                break
+    def lex(self, stream):
+        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)


 class ContextualLexer:
@@ -218,33 +200,39 @@ class ContextualLexer:
         self.parser_state = state

     def lex(self, stream):
-        lex_pos = 0
-        line = 1
-        col_start_pos = 0
-        newline_types = list(self.root_lexer.newline_types)
-        ignore_types = list(self.root_lexer.ignore_types)
+        l = _Lex(self.lexers[self.parser_state])
+        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
+            yield x
+            l.lexer = self.lexers[self.parser_state]
+
+
+class _Lex:
+    "Built to serve both Lexer and ContextualLexer"
+    def __init__(self, lexer):
+        self.lexer = lexer
+
+    def lex(self, stream, newline_types, ignore_types):
+        newline_types = list(newline_types)
+        ignore_types = list(ignore_types)
+        line_ctr = LineCounter()
+
         while True:
-            lexer = self.lexers[self.parser_state]
+            lexer = self.lexer
             for mre, type_from_index in lexer.mres:
-                m = mre.match(stream, lex_pos)
+                m = mre.match(stream, line_ctr.char_pos)
                 if m:
                     value = m.group(0)
                     type_ = type_from_index[m.lastindex]
                     if type_ not in ignore_types:
-                        t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos)
+                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                         if t.type in lexer.callback:
                             t = lexer.callback[t.type](t)
-                        yield t
-
-                    if type_ in newline_types:
-                        newlines = value.count(lexer.newline_char)
-                        if newlines:
-                            line += newlines
-                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
-                    lex_pos += len(value)
+                        lexer = yield t
+
+                    line_ctr.feed(value, type_ in newline_types)
                     break
             else:
-                if lex_pos < len(stream):
-                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens)
+                if line_ctr.char_pos < len(stream):
+                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                 break
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 72e2e22..7726845 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -411,6 +411,7 @@ class Grammar:
         terms_to_ignore = {name:'__'+name for name in self.ignore}
         if terms_to_ignore:
             assert set(terms_to_ignore) <= {name for name, _t in term_defs}
+            term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]

             expr = Token('RULE', '__ignore')
             for r, tree, _o in rule_defs:
@@ -562,6 +563,7 @@ class GrammarLoader:
         d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules}
         rules, callback = ParseTreeBuilder(d, T).apply()
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
+
         parser_conf = ParserConf(rules, callback, 'start')
         self.parser = LALR(lexer_conf, parser_conf)

@@ -636,7 +638,6 @@ class GrammarLoader:
             ignore_names.append(name)
             token_defs.append((name, (t, 0)))

-
         # Verify correctness 2
         token_names = set()
         for name, _ in token_defs:
@@ -644,6 +645,9 @@ class GrammarLoader:
                 raise GrammarError("Token '%s' defined more than once" % name)
             token_names.add(name)

+        if not set(ignore_names) <= token_names:
+            raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names))
+
         # Resolve token references
         resolve_token_references(token_defs)

diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py
index 497af55..e26d287 100644
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -121,7 +121,7 @@ class ParseTreeBuilder:

             for expansion, alias in expansions:
                 if alias and origin.startswith('_'):
-                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))
+                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias))

                 wrapper_chain = filter(None, [
                     (expand1 and not alias) and Expand1,
diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py
index 3cc67f3..420c469 100644
--- a/lark/parsers/xearley.py
+++ b/lark/parsers/xearley.py
@@ -127,7 +127,7 @@ class Parser:

                 if token == '\n':
                     text_line += 1
-                    text_column = 1
+                    text_column = 0
                 else:
                     text_column += 1

diff --git a/tests/test_parser.py b/tests/test_parser.py
index d93e33b..db28834 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase):

         r = T().transform(g.parse("x"))
         self.assertEqual( r.children, [""] )
-
+
         g = Lark("""start: a
                     ?a : b
                     b : "x"
@@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase):

         r = T().transform(g.parse("xx"))
         self.assertEqual( r.children, [""] )
-
+
         g = Lark("""start: a
                     ?a : b
                     b -> c
                     b : "x"
                  """, parser='lalr', transformer=T())
         r = g.parse("xx")
         self.assertEqual( r.children, [""] )
-
+


@@ -796,6 +796,39 @@ def _make_parser_test(LEXER, PARSER):

         self.assertEqual(tree.children, ['a', 'A'])

+    def test_undefined_ignore(self):
+        g = """!start: "A"
+
+            %ignore B
+            """
+        self.assertRaises( GrammarError, _Lark, g)
+
+    @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO
+    def test_line_and_column(self):
+        g = r"""!start: "A" bc "D"
+            !bc: "B\nC"
+            """
+        l = _Lark(g)
+        a, bc, d = l.parse("AB\nCD").children
+        self.assertEqual(a.line, 1)
+        self.assertEqual(a.column, 0)
+
+        bc ,= bc.children
+        self.assertEqual(bc.line, 1)
+        self.assertEqual(bc.column, 1)
+
+        self.assertEqual(d.line, 2)
+        self.assertEqual(d.column, 1)
+
+        # self.assertEqual(a.end_line, 1)
+        # self.assertEqual(a.end_col, 1)
+        # self.assertEqual(bc.end_line, 2)
+        # self.assertEqual(bc.end_col, 1)
+        # self.assertEqual(d.end_line, 2)
+        # self.assertEqual(d.end_col, 2)
+
+
+
     def test_reduce_cycle(self):
         """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state.
         It seems that the correct solution is to explicitely distinguish finalization in the reduce() function.
@@ -969,7 +1002,7 @@ def _make_parser_test(LEXER, PARSER):

         parser = _Lark(grammar)

-        tree = parser.parse("int 1 ! This is a comment\n")
+        tree = parser.parse("int 1 ! This is a comment\n")
         self.assertEqual(tree.children, ['1'])

         tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky!
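
Illustration (not part of the patch): a minimal sketch of how the new LineCounter is expected to behave once this change is applied, assuming it is imported from lark.lexer as defined above. Columns are 0-based, matching the updated xearley counter and the new test_line_and_column test.

    from lark.lexer import LineCounter

    ctr = LineCounter()
    ctr.feed("AB\n")                       # token contains a newline: line advances, column resets to 0
    assert (ctr.line, ctr.column) == (2, 0)
    ctr.feed("CD", test_newline=False)     # optional optimization: caller promises there is no newline
    assert (ctr.line, ctr.column) == (2, 2)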