diff --git a/lark/lexer.py b/lark/lexer.py
index 8fc9e4b..c1793de 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -2,7 +2,7 @@
 
 import re
 
-from .utils import Str, classify, get_regexp_width, Py36, Serialize
+from .utils import Str, classify, get_regexp_width, Py36, Serialize, suppress
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
 
 ###{standalone
@@ -178,49 +178,6 @@ class LineCounter:
         self.char_pos += len(token)
         self.column = self.char_pos - self.line_start_pos + 1
 
-class _Lex:
-    "Built to serve both Lexer and ContextualLexer"
-    def __init__(self, lexer, state=None):
-        self.lexer = lexer
-        self.state = state
-
-    def lex(self, stream, newline_types, ignore_types):
-        newline_types = frozenset(newline_types)
-        ignore_types = frozenset(ignore_types)
-        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
-        last_token = None
-
-        while line_ctr.char_pos < len(stream):
-            lexer = self.lexer
-            res = lexer.match(stream, line_ctr.char_pos)
-            if not res:
-                allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types
-                if not allowed:
-                    allowed = {""}
-                raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])
-
-            value, type_ = res
-
-            if type_ not in ignore_types:
-                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                line_ctr.feed(value, type_ in newline_types)
-                t.end_line = line_ctr.line
-                t.end_column = line_ctr.column
-                t.end_pos = line_ctr.char_pos
-                if t.type in lexer.callback:
-                    t = lexer.callback[t.type](t)
-                    if not isinstance(t, Token):
-                        raise ValueError("Callbacks must return a token (returned %r)" % t)
-                yield t
-                last_token = t
-            else:
-                if type_ in lexer.callback:
-                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                    lexer.callback[type_](t2)
-                line_ctr.feed(value, type_ in newline_types)
-
-
 
 class UnlessCallback:
     def __init__(self, mres):
@@ -308,7 +265,7 @@ class Lexer(object):
     """Lexer interface
 
     Method Signatures:
-        lex(self, stream) -> Iterator[Token]
+        lex(self, text) -> Iterator[Token]
     """
 
     lex = NotImplemented
@@ -335,8 +292,8 @@ class TraditionalLexer(Lexer):
         assert set(conf.ignore) <= {t.name for t in terminals}
 
         # Init
-        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
-        self.ignore_types = list(conf.ignore)
+        self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
+        self.ignore_types = frozenset(conf.ignore)
 
         terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
         self.terminals = terminals
@@ -345,7 +302,6 @@ class TraditionalLexer(Lexer):
         self.use_bytes = conf.use_bytes
 
         self._mres = None
-        # self.build(g_regex_flags)
 
     def _build(self):
         terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
@@ -366,16 +322,61 @@ class TraditionalLexer(Lexer):
             self._build()
         return self._mres
 
-    def match(self, stream, pos):
+    def match(self, text, pos):
         for mre, type_from_index in self.mres:
-            m = mre.match(stream, pos)
+            m = mre.match(text, pos)
             if m:
                 return m.group(0), type_from_index[m.lastindex]
 
-    def lex(self, stream):
-        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
+    def make_lexer_state(self, text):
+        line_ctr = LineCounter('\n' if not self.use_bytes else b'\n')
+        return LexerState(text, line_ctr)
+
+    def lex(self, text):
+        state = self.make_lexer_state(text)
+        with suppress(EOFError):
+            while True:
+                yield self.next_token(state)
+
+    def next_token(self, lex_state):
+        text = lex_state.text
+        line_ctr = lex_state.line_ctr
+        while line_ctr.char_pos < len(text):
+            res = self.match(text, line_ctr.char_pos)
+            if not res:
+                allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
+                if not allowed:
+                    allowed = {""}
+                raise UnexpectedCharacters(text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+
+            value, type_ = res
+
+            if type_ not in self.ignore_types:
+                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                line_ctr.feed(value, type_ in self.newline_types)
+                t.end_line = line_ctr.line
+                t.end_column = line_ctr.column
+                t.end_pos = line_ctr.char_pos
+                if t.type in self.callback:
+                    t = self.callback[t.type](t)
+                    if not isinstance(t, Token):
+                        raise ValueError("Callbacks must return a token (returned %r)" % t)
+                lex_state.last_token = t
+                return t
+            else:
+                if type_ in self.callback:
+                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                    self.callback[type_](t2)
+                line_ctr.feed(value, type_ in self.newline_types)
+
+        # EOF
+        raise EOFError(self)
+
+
+class LexerState:
+    def __init__(self, text, line_ctr, last_token=None):
+        self.text = text
+        self.line_ctr = line_ctr
+        self.last_token = last_token
 
 
 class ContextualLexer(Lexer):
@@ -409,25 +410,24 @@ class ContextualLexer(Lexer):
             assert trad_conf.tokens is terminals
             self.root_lexer = TraditionalLexer(trad_conf)
 
-    def lex(self, stream, get_parser_state):
-        parser_state = get_parser_state()
-        l = _Lex(self.lexers[parser_state], parser_state)
+    def lex(self, text, get_parser_state):
+        state = self.root_lexer.make_lexer_state(text)
         try:
-            for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
-                yield x
-                parser_state = get_parser_state()
-                l.lexer = self.lexers[parser_state]
-                l.state = parser_state # For debug only, no need to worry about multithreading
+            while True:
+                lexer = self.lexers[get_parser_state()]
+                yield lexer.next_token(state)
+        except EOFError:
+            pass
        except UnexpectedCharacters as e:
             # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
             # but not in the current context.
             # This tests the input against the global context, to provide a nicer error.
-            root_match = self.root_lexer.match(stream, e.pos_in_stream)
+            root_match = self.root_lexer.match(text, e.pos_in_stream)
             if not root_match:
                 raise
 
             value, type_ = root_match
             t = Token(type_, value, e.pos_in_stream, e.line, e.column)
-            raise UnexpectedToken(t, e.allowed, state=e.state)
+            raise UnexpectedToken(t, e.allowed, state=get_parser_state())
 
 ###}
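
Note on usage (not part of the diff): the refactor splits the old `_Lex` generator into an explicit `LexerState` plus a reentrant `next_token`, so a caller can pull one token at a time and swap the active lexer between calls, which is exactly what the new `ContextualLexer.lex` does. A minimal sketch of driving the new interface directly, assuming a `LexerConf` named `conf` that lark has already built from a grammar:

    lexer = TraditionalLexer(conf)           # `conf` is an assumed, pre-built LexerConf
    state = lexer.make_lexer_state("1 + 2")  # mutable text/position state now lives outside the lexer
    try:
        while True:
            tok = lexer.next_token(state)    # one token per call, instead of one generator per input
            print(tok.type, repr(tok))
    except EOFError:                         # end of input is signalled by raising EOFError
        pass

Equivalently, `lexer.lex("1 + 2")` wraps this same loop in a generator via `suppress(EOFError)`.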