@@ -3,7 +3,7 @@
 import re
 
 from .utils import Str, classify, get_regexp_width, Py36, Serialize
-from .exceptions import UnexpectedCharacters, LexError
+from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
 
 ###{standalone
@@ -43,7 +43,7 @@ class PatternStr(Pattern):
     __serialize_fields__ = 'value', 'flags'
 
     type = "str"
     def to_regexp(self):
         return self._get_flags(re.escape(self.value))
@@ -166,37 +166,32 @@ class _Lex:
         while line_ctr.char_pos < len(stream):
             lexer = self.lexer
-            for mre, type_from_index in lexer.mres:
-                m = mre.match(stream, line_ctr.char_pos)
-                if not m:
-                    continue
-
-                t = None
-                value = m.group(0)
-                type_ = type_from_index[m.lastindex]
-                if type_ not in ignore_types:
-                    t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                    if t.type in lexer.callback:
-                        t = lexer.callback[t.type](t)
-                        if not isinstance(t, Token):
-                            raise ValueError("Callbacks must return a token (returned %r)" % t)
-                    last_token = t
-                    yield t
-                else:
-                    if type_ in lexer.callback:
-                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                        lexer.callback[type_](t)
-
-                line_ctr.feed(value, type_ in newline_types)
-                if t:
-                    t.end_line = line_ctr.line
-                    t.end_column = line_ctr.column
-
-                break
-            else:
+            res = lexer.match(stream, line_ctr.char_pos)
+            if not res:
                 allowed = {v for m, tfi in lexer.mres for v in tfi.values()}
                 raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])
+
+            value, type_ = res
+
+            t = None
+            if type_ not in ignore_types:
+                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                if t.type in lexer.callback:
+                    t = lexer.callback[t.type](t)
+                    if not isinstance(t, Token):
+                        raise ValueError("Callbacks must return a token (returned %r)" % t)
+                last_token = t
+                yield t
+            else:
+                if type_ in lexer.callback:
+                    t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                    lexer.callback[type_](t)
+
+            line_ctr.feed(value, type_ in newline_types)
+            if t:
+                t.end_line = line_ctr.line
+                t.end_column = line_ctr.column
 
 
 class UnlessCallback:
     def __init__(self, mres):
@@ -330,6 +325,11 @@ class TraditionalLexer(Lexer):
         self.mres = build_mres(terminals)
 
+    def match(self, stream, pos):
+        for mre, type_from_index in self.mres:
+            m = mre.match(stream, pos)
+            if m:
+                return m.group(0), type_from_index[m.lastindex]
 
     def lex(self, stream):
         return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
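For reference, a minimal standalone sketch of the contract the new TraditionalLexer.match encapsulates: scan the compiled regex banks in order, return a (value, type_) pair for the first hit, and fall through to an implicit None when nothing matches. The hand-built mres table below is a hypothetical stand-in for the output of build_mres, for illustration only:

import re

# Hypothetical stand-in for build_mres(terminals): a list of
# (compiled_regex, {group_index: terminal_name}) pairs.
mres = [(re.compile('(?P<NUMBER>[0-9]+)|(?P<NAME>[a-z]+)'),
         {1: 'NUMBER', 2: 'NAME'})]

def match(stream, pos):
    # Same first-match-wins contract as TraditionalLexer.match above;
    # falling off the end of the loop returns None implicitly.
    for mre, type_from_index in mres:
        m = mre.match(stream, pos)
        if m:
            return m.group(0), type_from_index[m.lastindex]

print(match("abc123", 0))  # ('abc', 'NAME')
print(match("abc123", 3))  # ('123', 'NUMBER')
print(match("!?", 0))      # None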
@@ -367,9 +367,22 @@ class ContextualLexer(Lexer):
     def lex(self, stream):
         l = _Lex(self.lexers[self.parser_state], self.parser_state)
-        for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
-            yield x
-            l.lexer = self.lexers[self.parser_state]
-            l.state = self.parser_state
+        try:
+            for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
+                yield x
+                l.lexer = self.lexers[self.parser_state]
+                l.state = self.parser_state
+        except UnexpectedCharacters as e:
+            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
+            # but not in the current context.
+            # This tests the input against the global context, to provide a nicer error.
+            root_match = self.root_lexer.match(stream, e.pos_in_stream)
+            if not root_match:
+                raise
+
+            value, type_ = root_match
+            t = Token(type_, value, e.pos_in_stream, e.line, e.column)
+            expected = {v for m, tfi in l.lexer.mres for v in tfi.values()}
+            raise UnexpectedToken(t, expected)
 ###}
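The effect of the ContextualLexer change is easiest to see from the public API. A small end-to-end sketch, assuming a lark build that includes this patch (the grammar and printed attributes are illustrative): input that the root lexer accepts, but that matches no terminal allowed in the current parser state, now surfaces as an UnexpectedToken carrying the offending token and the set of terminals the state expected, rather than a bare UnexpectedCharacters.

from lark import Lark
from lark.exceptions import UnexpectedToken

# "a" is a defined terminal, but after the first "a" the parser state
# only accepts NUMBER -- the case the except-clause above upgrades
# from UnexpectedCharacters to UnexpectedToken.
parser = Lark(r"""
    start: "a" NUMBER
    NUMBER: /[0-9]+/
    %ignore " "
""", parser='lalr', lexer='contextual')

try:
    parser.parse("a a")
except UnexpectedToken as e:
    print(e.token, e.expected)  # the stray "a" token and the expected terminal set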