Rewrite lexer to use LexerState

Tag: tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.0
Author: Erez Sh, 4 years ago
Commit: 115edbfb32

1 changed file with 62 additions and 62 deletions: lark/lexer.py
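
The gist of the change: the old _Lex helper owned the entire scanning loop, so the cursor (position, line, column, last token) existed only inside one running generator. After this commit that mutable cursor lives in a LexerState object, next_token() advances it by exactly one token and raises EOFError at the end of input, and lex() shrinks to a thin generator loop. Below is a minimal sketch of the pattern with a toy lexer; only the LexerState/next_token/lex shape comes from the diff, everything else is illustrative:

    from contextlib import suppress   # lark.utils.suppress plays the same role

    class LexerState:
        """All mutable scanning state, separated from the lexer itself."""
        def __init__(self, text, pos=0, last_token=None):
            self.text = text
            self.pos = pos
            self.last_token = last_token

    class ToyLexer:
        def next_token(self, state):
            # Advance the shared state by one token; skip "ignored" input.
            while state.pos < len(state.text):
                ch = state.text[state.pos]
                state.pos += 1
                if not ch.isspace():
                    state.last_token = ch
                    return ch
            raise EOFError(self)      # end of input, as in the diff

        def lex(self, text):
            # The public generator is now just a loop around next_token().
            state = LexerState(text)
            with suppress(EOFError):
                while True:
                    yield self.next_token(state)

    print(list(ToyLexer().lex("a b c")))  # -> ['a', 'b', 'c']

Because the state is external and shared, a driver can swap which lexer produces the next token between calls while the cursor keeps advancing over the same text; that is exactly what the new ContextualLexer.lex below does with self.lexers[get_parser_state()].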

--- a/lark/lexer.py
+++ b/lark/lexer.py

@@ -2,7 +2,7 @@
 
 import re
 
-from .utils import Str, classify, get_regexp_width, Py36, Serialize
+from .utils import Str, classify, get_regexp_width, Py36, Serialize, suppress
 from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken
 
 ###{standalone
@@ -178,49 +178,6 @@ class LineCounter:
         self.char_pos += len(token)
         self.column = self.char_pos - self.line_start_pos + 1
 
-class _Lex:
-    "Built to serve both Lexer and ContextualLexer"
-    def __init__(self, lexer, state=None):
-        self.lexer = lexer
-        self.state = state
-
-    def lex(self, stream, newline_types, ignore_types):
-        newline_types = frozenset(newline_types)
-        ignore_types = frozenset(ignore_types)
-        line_ctr = LineCounter('\n' if not self.lexer.use_bytes else b'\n')
-        last_token = None
-
-        while line_ctr.char_pos < len(stream):
-            lexer = self.lexer
-            res = lexer.match(stream, line_ctr.char_pos)
-            if not res:
-                allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types
-                if not allowed:
-                    allowed = {"<END-OF-FILE>"}
-                raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token])
-
-            value, type_ = res
-
-            if type_ not in ignore_types:
-                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                line_ctr.feed(value, type_ in newline_types)
-                t.end_line = line_ctr.line
-                t.end_column = line_ctr.column
-                t.end_pos = line_ctr.char_pos
-                if t.type in lexer.callback:
-                    t = lexer.callback[t.type](t)
-                    if not isinstance(t, Token):
-                        raise ValueError("Callbacks must return a token (returned %r)" % t)
-                yield t
-                last_token = t
-            else:
-                if type_ in lexer.callback:
-                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
-                    lexer.callback[type_](t2)
-                line_ctr.feed(value, type_ in newline_types)
-
-
-
 class UnlessCallback:
     def __init__(self, mres):
@@ -308,7 +265,7 @@ class Lexer(object):
     """Lexer interface
 
     Method Signatures:
-        lex(self, stream) -> Iterator[Token]
+        lex(self, text) -> Iterator[Token]
     """
 
     lex = NotImplemented

@@ -335,8 +292,8 @@ class TraditionalLexer(Lexer):
         assert set(conf.ignore) <= {t.name for t in terminals}
 
         # Init
-        self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
-        self.ignore_types = list(conf.ignore)
+        self.newline_types = frozenset(t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp()))
+        self.ignore_types = frozenset(conf.ignore)
 
         terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
         self.terminals = terminals
@@ -345,7 +302,6 @@ class TraditionalLexer(Lexer):
         self.use_bytes = conf.use_bytes
 
         self._mres = None
-        # self.build(g_regex_flags)
 
     def _build(self):
         terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re, use_bytes=self.use_bytes)
@@ -366,16 +322,61 @@ class TraditionalLexer(Lexer):
             self._build()
         return self._mres
 
-    def match(self, stream, pos):
+    def match(self, text, pos):
         for mre, type_from_index in self.mres:
-            m = mre.match(stream, pos)
+            m = mre.match(text, pos)
             if m:
                 return m.group(0), type_from_index[m.lastindex]
 
-    def lex(self, stream):
-        return _Lex(self).lex(stream, self.newline_types, self.ignore_types)
+    def make_lexer_state(self, text):
+        line_ctr = LineCounter('\n' if not self.use_bytes else b'\n')
+        return LexerState(text, line_ctr)
+
+    def lex(self, text):
+        state = self.make_lexer_state(text)
+        with suppress(EOFError):
+            while True:
+                yield self.next_token(state)
+
+    def next_token(self, lex_state):
+        text = lex_state.text
+        line_ctr = lex_state.line_ctr
+        while line_ctr.char_pos < len(text):
+            res = self.match(text, line_ctr.char_pos)
+            if not res:
+                allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
+                if not allowed:
+                    allowed = {"<END-OF-FILE>"}
+                raise UnexpectedCharacters(text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token])
+
+            value, type_ = res
+
+            if type_ not in self.ignore_types:
+                t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                line_ctr.feed(value, type_ in self.newline_types)
+                t.end_line = line_ctr.line
+                t.end_column = line_ctr.column
+                t.end_pos = line_ctr.char_pos
+                if t.type in self.callback:
+                    t = self.callback[t.type](t)
+                    if not isinstance(t, Token):
+                        raise ValueError("Callbacks must return a token (returned %r)" % t)
+                lex_state.last_token = t
+                return t
+            else:
+                if type_ in self.callback:
+                    t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                    self.callback[type_](t2)
+                line_ctr.feed(value, type_ in self.newline_types)
+
+        # EOF
+        raise EOFError(self)
+
+
+class LexerState:
+    def __init__(self, text, line_ctr, last_token=None):
+        self.text = text
+        self.line_ctr = line_ctr
+        self.last_token = last_token
 
 
 class ContextualLexer(Lexer):
@@ -409,25 +410,24 @@ class ContextualLexer(Lexer):
             assert trad_conf.tokens is terminals
         self.root_lexer = TraditionalLexer(trad_conf)
 
-    def lex(self, stream, get_parser_state):
-        parser_state = get_parser_state()
-        l = _Lex(self.lexers[parser_state], parser_state)
+    def lex(self, text, get_parser_state):
+        state = self.root_lexer.make_lexer_state(text)
         try:
-            for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types):
-                yield x
-                parser_state = get_parser_state()
-                l.lexer = self.lexers[parser_state]
-                l.state = parser_state # For debug only, no need to worry about multithreading
+            while True:
+                lexer = self.lexers[get_parser_state()]
+                yield lexer.next_token(state)
+        except EOFError:
+            pass
         except UnexpectedCharacters as e:
             # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined,
             # but not in the current context.
             # This tests the input against the global context, to provide a nicer error.
-            root_match = self.root_lexer.match(stream, e.pos_in_stream)
+            root_match = self.root_lexer.match(text, e.pos_in_stream)
             if not root_match:
                 raise
 
             value, type_ = root_match
             t = Token(type_, value, e.pos_in_stream, e.line, e.column)
-            raise UnexpectedToken(t, e.allowed, state=e.state)
+            raise UnexpectedToken(t, e.allowed, state=get_parser_state())
 
 ###}
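
For callers the public behavior is unchanged: tokens still come out of a generator with position info attached. As a quick usage check, something like the following should work against the lark 0.11 API tagged above (Lark.lex is only available with lexer='standard', i.e. the TraditionalLexer in this diff; the grammar and input are illustrative):

    from lark import Lark

    parser = Lark(r"""
        start: WORD+
        WORD: /[a-z]+/
        %ignore " "
    """, parser='lalr', lexer='standard')

    # Drives TraditionalLexer.lex, i.e. the new next_token/LexerState loop.
    for tok in parser.lex("hello world"):
        print(tok.type, tok, tok.line, tok.column)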
