From a7e7b568ff5535a3becee9625ba469b5db444979 Mon Sep 17 00:00:00 2001
From: Erez Sh <erezshin@gmail.com>
Date: Sat, 28 Sep 2019 21:42:39 +0300
Subject: [PATCH] Fixed contextual lexer error that was confusing users (Issue
 #194)

---
 lark/lexer.py | 81 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 47 insertions(+), 34 deletions(-)

diff --git a/lark/lexer.py b/lark/lexer.py
index 9cd7adb..9ea224e 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -3,7 +3,7 @@
 import re
 
 from .utils import Str, classify, get_regexp_width, Py36, Serialize
-from .exceptions import UnexpectedCharacters, LexError
+from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
 
 ###{standalone
 
@@ -43,7 +43,7 @@ class PatternStr(Pattern):
     __serialize_fields__ = 'value', 'flags'
 
     type = "str"
-    
+
     def to_regexp(self):
         return self._get_flags(re.escape(self.value))
 
@@ -166,37 +166,32 @@ class _Lex:
 
         while line_ctr.char_pos < len(stream):
             lexer = self.lexer
-            for mre, type_from_index in lexer.mres:
-                m = mre.match(stream, line_ctr.char_pos)
-                if not m:
-                    continue
-
-                t = None
-                value = m.group(0)
-                type_ = type_from_index[m.lastindex]
-                if type_ not in ignore_types:
-                    t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                    if t.type in lexer.callback:
-                        t = lexer.callback[t.type](t)
-                        if not isinstance(t, Token):
-                            raise ValueError("Callbacks must return a token (returned %r)" % t)
-                    last_token = t
-                    yield t
-                else:
-                    if type_ in lexer.callback:
-                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                        lexer.callback[type_](t)
-
-                line_ctr.feed(value, type_ in newline_types)
-                if t:
-                    t.end_line = line_ctr.line
-                    t.end_column = line_ctr.column
-
-                break
-            else:
+            res = lexer.match(stream, line_ctr.char_pos)
+            if not res:
                 allowed = {v for m, tfi in lexer.mres for v in tfi.values()}
                 raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])
+
+            value, type_ = res
+
+            t = None
+            if type_ not in ignore_types:
+                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                if t.type in lexer.callback:
+                    t = lexer.callback[t.type](t)
+                    if not isinstance(t, Token):
+                        raise ValueError("Callbacks must return a token (returned %r)" % t)
+                last_token = t
+                yield t
+            else:
+                if type_ in lexer.callback:
+                    t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                    lexer.callback[type_](t)
+
+            line_ctr.feed(value, type_ in newline_types)
+            if t:
+                t.end_line = line_ctr.line
+                t.end_column = line_ctr.column
 
 
 class UnlessCallback:
     def __init__(self, mres):
@@ -330,6 +325,11 @@
 
         self.mres = build_mres(terminals)
 
+    def match(self, stream, pos):
+        for mre, type_from_index in self.mres:
+            m = mre.match(stream, pos)
+            if m:
+                return m.group(0), type_from_index[m.lastindex]
     def lex(self, stream):
         return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
 
@@ -367,9 +367,22 @@ class ContextualLexer(Lexer):
 
     def lex(self, stream):
         l = _Lex(self.lexers[self.parser_state], self.parser_state)
-        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
-            yield x
-            l.lexer = self.lexers[self.parser_state]
-            l.state = self.parser_state
+        try:
+            for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
+                yield x
+                l.lexer = self.lexers[self.parser_state]
+                l.state = self.parser_state
+        except UnexpectedCharacters as e:
+            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
+            # but not in the current context.
+            # This tests the input against the global context, to provide a nicer error.
+            root_match = self.root_lexer.match(stream, e.pos_in_stream)
+            if not root_match:
+                raise
+
+            value, type_ = root_match
+            t = Token(type_, value, e.pos_in_stream, e.line, e.column)
+            expected = {v for m, tfi in l.lexer.mres for v in tfi.values()}
+            raise UnexpectedToken(t, expected)
 
 ###}