Browse Source

Fixed contextual lexer error that was confusing users (Issue #194)

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.7.7
Erez Sh 5 years ago
parent
commit
a7e7b568ff
1 changed file with 47 additions and 34 deletions
  1. +47
    -34
      lark/lexer.py

+ 47
- 34
lark/lexer.py View File

@@ -3,7 +3,7 @@
import re

from .utils import Str, classify, get_regexp_width, Py36, Serialize
from .exceptions import UnexpectedCharacters, LexError
from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken

###{standalone

@@ -43,7 +43,7 @@ class PatternStr(Pattern):
__serialize_fields__ = 'value', 'flags'

type = "str"
def to_regexp(self):
    """Return this literal pattern as a regexp string.

    The stored value is regex-escaped (it is a plain string, not a
    pattern), then any inline flags are applied around it.
    """
    escaped = re.escape(self.value)
    return self._get_flags(escaped)

@@ -166,37 +166,32 @@ class _Lex:

while line_ctr.char_pos < len(stream):
lexer = self.lexer
for mre, type_from_index in lexer.mres:
m = mre.match(stream, line_ctr.char_pos)
if not m:
continue

t = None
value = m.group(0)
type_ = type_from_index[m.lastindex]
if type_ not in ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
if not isinstance(t, Token):
raise ValueError("Callbacks must return a token (returned %r)" % t)
last_token = t
yield t
else:
if type_ in lexer.callback:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
lexer.callback[type_](t)

line_ctr.feed(value, type_ in newline_types)
if t:
t.end_line = line_ctr.line
t.end_column = line_ctr.column

break
else:
res = lexer.match(stream, line_ctr.char_pos)
if not res:
allowed = {v for m, tfi in lexer.mres for v in tfi.values()}
raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])

value, type_ = res

t = None
if type_ not in ignore_types:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
if t.type in lexer.callback:
t = lexer.callback[t.type](t)
if not isinstance(t, Token):
raise ValueError("Callbacks must return a token (returned %r)" % t)
last_token = t
yield t
else:
if type_ in lexer.callback:
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
lexer.callback[type_](t)

line_ctr.feed(value, type_ in newline_types)
if t:
t.end_line = line_ctr.line
t.end_column = line_ctr.column


class UnlessCallback:
def __init__(self, mres):
@@ -330,6 +325,11 @@ class TraditionalLexer(Lexer):

self.mres = build_mres(terminals)

def match(self, stream, pos):
    """Try each compiled terminal regexp at *pos* in *stream*.

    Returns a ``(matched_text, token_type)`` pair for the first
    pattern that matches, or ``None`` when nothing matches.
    """
    for pattern, type_by_group in self.mres:
        found = pattern.match(stream, pos)
        if found is None:
            continue
        return found.group(0), type_by_group[found.lastindex]

def lex(self, stream):
    """Tokenize *stream*, yielding Token instances.

    Delegates to a _Lex driver configured with this lexer's
    newline and ignore terminal sets.
    """
    driver = _Lex(self)
    return driver.lex(stream, self.newline_types, self.ignore_types)
@@ -367,9 +367,22 @@ class ContextualLexer(Lexer):

def lex(self, stream):
    """Tokenize *stream*, switching sub-lexers to follow the parser state.

    After each yielded token, re-read ``self.parser_state`` (updated by
    the parser between tokens) and swap in the matching contextual
    sub-lexer before matching the next token.

    If no terminal matches in the current context, the input is retried
    against the root (global) lexer: a match there means the terminal
    exists but is simply not expected in this state, so an
    ``UnexpectedToken`` is raised instead of the confusing
    ``UnexpectedCharacters`` (issue #194).
    """
    l = _Lex(self.lexers[self.parser_state], self.parser_state)
    try:
        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
            yield x
            # Parser state may have advanced; track it so the next
            # token is matched in the right context.
            l.lexer = self.lexers[self.parser_state]
            l.state = self.parser_state
    except UnexpectedCharacters as e:
        # In the contextual lexer, UnexpectedCharacters can mean that the
        # terminal is defined, but not in the current context.
        # This tests the input against the global context, to provide a
        # nicer error.
        root_match = self.root_lexer.match(stream, e.pos_in_stream)
        if not root_match:
            raise

        value, type_ = root_match
        t = Token(type_, value, e.pos_in_stream, e.line, e.column)
        # Terminals the current context would have accepted.
        expected = {v for m, tfi in l.lexer.mres for v in tfi.values()}
        raise UnexpectedToken(t, expected)

###}

Loading…
Cancel
Save