@@ -2,7 +2,7 @@
 import re
 
-from .utils import Str, classify, get_regexp_width, Py36, Serialize
+from .utils import Str, classify, get_regexp_width, Py36, Serialize, suppress
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
 
 ###{standalone
@@ -178,49 +178,6 @@ class LineCounter:
         self.char_pos += len(token)
         self.column = self.char_pos - self.line_start_pos + 1
 
-class _Lex:
-    "Built to serve both Lexer and ContextualLexer"
-    def __init__(self, lexer, state=None):
-        self.lexer = lexer
-        self.state = state
-
-    def lex(self, stream, newline_types, ignore_types):
-        newline_types = frozenset(newline_types)
-        ignore_types = frozenset(ignore_types)
-        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
-        last_token = None
-
-        while line_ctr.char_pos < len(stream):
-            lexer = self.lexer
-            res = lexer.match(stream, line_ctr.char_pos)
-            if not res:
-                allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types
-                if not allowed:
-                    allowed = {"<END-OF-FILE>"}
-                raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])
-
-            value, type_ = res
-
-            if type_ not in ignore_types:
-                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                line_ctr.feed(value, type_ in newline_types)
-                t.end_line = line_ctr.line
-                t.end_column = line_ctr.column
-                t.end_pos = line_ctr.char_pos
-                if t.type in lexer.callback:
-                    t = lexer.callback[t.type](t)
-                    if not isinstance(t, Token):
-                        raise ValueError("Callbacks must return a token (returned %r)" % t)
-                yield t
-                last_token = t
-            else:
-                if type_ in lexer.callback:
-                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                    lexer.callback[type_](t2)
-                line_ctr.feed(value, type_ in newline_types)
-
 class UnlessCallback:
     def __init__(self, mres):
@@ -308,7 +265,7 @@ class Lexer(object):
     """Lexer interface
 
     Method Signatures:
-        lex(self, stream) -> Iterator[Token]
+        lex(self, text) -> Iterator[Token]
     """
 
     lex = NotImplemented
@@ -335,8 +292,8 @@ class TraditionalLexer(Lexer):
         assert set(conf.ignore) <= {t.name for t in terminals}
 
         # Init
-        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
-        self.ignore_types = list(conf.ignore)
+        self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
+        self.ignore_types = frozenset(conf.ignore)
 
         terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
         self.terminals = terminals
@@ -345,7 +302,6 @@ class TraditionalLexer(Lexer):
         self.use_bytes = conf.use_bytes
 
         self._mres = None
-        # self.build(g_regex_flags)
 
     def _build(self):
         terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
@@ -366,16 +322,61 @@ class TraditionalLexer(Lexer):
             self._build()
         return self._mres
 
-    def match(self, stream, pos):
+    def match(self, text, pos):
         for mre, type_from_index in self.mres:
-            m = mre.match(stream, pos)
+            m = mre.match(text, pos)
             if m:
                 return m.group(0), type_from_index[m.lastindex]
 
-    def lex(self, stream):
-        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
+    def make_lexer_state(self, text):
+        line_ctr = LineCounter('\n' if not self.use_bytes else b'\n')
+        return LexerState(text, line_ctr)
+
+    def lex(self, text):
+        state = self.make_lexer_state(text)
+        with suppress(EOFError):
+            while True:
+                yield self.next_token(state)
+
+    def next_token(self, lex_state):
+        text = lex_state.text
+        line_ctr = lex_state.line_ctr
+        while line_ctr.char_pos < len(text):
+            res = self.match(text, line_ctr.char_pos)
+            if not res:
+                allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
+                if not allowed:
+                    allowed = {"<END-OF-FILE>"}
+                raise UnexpectedCharacters(text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+
+            value, type_ = res
+
+            if type_ not in self.ignore_types:
+                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                line_ctr.feed(value, type_ in self.newline_types)
+                t.end_line = line_ctr.line
+                t.end_column = line_ctr.column
+                t.end_pos = line_ctr.char_pos
+                if t.type in self.callback:
+                    t = self.callback[t.type](t)
+                    if not isinstance(t, Token):
+                        raise ValueError("Callbacks must return a token (returned %r)" % t)
+                lex_state.last_token = t
+                return t
+            else:
+                if type_ in self.callback:
+                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                    self.callback[type_](t2)
+                line_ctr.feed(value, type_ in self.newline_types)
+
+        # EOF
+        raise EOFError(self)
+
+
+class LexerState:
+    def __init__(self, text, line_ctr, last_token=None):
+        self.text = text
+        self.line_ctr = line_ctr
+        self.last_token = last_token
+
 class ContextualLexer(Lexer):
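
The hunk above replaces the old _Lex driver with a pull-style interface: lex() builds a LexerState once and then just repeats next_token(state), with EOFError as the end-of-input signal and contextlib.suppress ending the generator cleanly. As a minimal standalone sketch of that control flow (toy names, not part of lark or of this diff, just the same shape):

from contextlib import suppress

class ToyState:
    # illustrative stand-in for LexerState: a mutable cursor threaded through every call
    def __init__(self, text):
        self.text = text
        self.pos = 0

def next_word(state):
    # return the next whitespace-delimited word, or raise EOFError at end of input
    while state.pos < len(state.text) and state.text[state.pos] == ' ':
        state.pos += 1
    if state.pos >= len(state.text):
        raise EOFError()
    start = state.pos
    while state.pos < len(state.text) and state.text[state.pos] != ' ':
        state.pos += 1
    return state.text[start:state.pos]

def words(text):
    # same shape as the new TraditionalLexer.lex(): build the state once,
    # then yield next_word(state) until EOFError is suppressed
    state = ToyState(text)
    with suppress(EOFError):
        while True:
            yield next_word(state)

assert list(words("pull one token at a time")) == ['pull', 'one', 'token', 'at', 'a', 'time']

Keeping the cursor in a separate state object is what lets a caller hold on to the state and pull tokens from different matchers without losing position, which is exactly what the contextual lexer below relies on.
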
@@ -409,25 +410,24 @@ class ContextualLexer(Lexer):
         assert trad_conf.tokens is terminals
         self.root_lexer = TraditionalLexer(trad_conf)
 
-    def lex(self, stream, get_parser_state):
-        parser_state = get_parser_state()
-        l = _Lex(self.lexers[parser_state], parser_state)
+    def lex(self, text, get_parser_state):
+        state = self.root_lexer.make_lexer_state(text)
         try:
-            for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
-                yield x
-                parser_state = get_parser_state()
-                l.lexer = self.lexers[parser_state]
-                l.state = parser_state # For debug only, no need to worry about multithreading
+            while True:
+                lexer = self.lexers[get_parser_state()]
+                yield lexer.next_token(state)
+        except EOFError:
+            pass
         except UnexpectedCharacters as e:
             # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
             # but not in the current context.
             # This tests the input against the global context, to provide a nicer error.
-            root_match = self.root_lexer.match(stream, e.pos_in_stream)
+            root_match = self.root_lexer.match(text, e.pos_in_stream)
             if not root_match:
                 raise
 
             value, type_ = root_match
             t = Token(type_, value, e.pos_in_stream, e.line, e.column)
-            raise UnexpectedToken(t, e.allowed, state=e.state)
+            raise UnexpectedToken(t, e.allowed, state=get_parser_state())
 
 ###}
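
In this last hunk the contextual lexer shares a single LexerState across all of its per-context sub-lexers and re-reads get_parser_state() before every token, so switching context mid-stream keeps the character position and line counting consistent. A small self-contained sketch of the same feedback loop (invented names that only mirror the structure, not lark's API):

from contextlib import suppress

class Cursor:
    # illustrative stand-in for the shared LexerState: one position for all contexts
    def __init__(self, text):
        self.text = text
        self.pos = 0

def next_char_token(cursor, alphabet):
    # emit one character if the current context allows it, or signal end of input
    if cursor.pos >= len(cursor.text):
        raise EOFError()
    ch = cursor.text[cursor.pos]
    if ch not in alphabet:
        raise ValueError("unexpected %r at position %d" % (ch, cursor.pos))
    cursor.pos += 1
    return ch

def contextual_lex(text, get_state, alphabets):
    # same shape as the new ContextualLexer.lex(): one shared cursor,
    # and the active matcher is re-selected from the parser state before every token
    cursor = Cursor(text)
    with suppress(EOFError):
        while True:
            yield next_char_token(cursor, alphabets[get_state()])

# toy "parser state": letters are expected first, digits afterwards
alphabets = {'letters': set('ab'), 'digits': set('12')}
emitted = []
for tok in contextual_lex("ab12", lambda: 'letters' if len(emitted) < 2 else 'digits', alphabets):
    emitted.append(tok)
assert emitted == ['a', 'b', '1', '2']

Because the consumer advances its own state between yields, the matcher chosen for each token already reflects the tokens seen so far, which mirrors how the parser's state feeds back into lexer selection here.
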