|
|
@@ -111,12 +111,35 @@ def build_mres(tokens, match_whole=False): |
|
|
|
return _build_mres(tokens, len(tokens), match_whole) |
|
|
|
|
|
|
|
|
|
|
|
class Lexer(object): |
|
|
|
class LineCounter: |
|
|
|
def __init__(self): |
|
|
|
self.newline_char = '\n' |
|
|
|
self.char_pos = 0 |
|
|
|
self.line = 1 |
|
|
|
self.column = 0 |
|
|
|
self.line_start_pos = 0 |
|
|
|
|
|
|
|
def feed(self, token, test_newline=True): |
|
|
|
"""Consume a token and calculat the new line & column. |
|
|
|
|
|
|
|
As an optional optimization, set test_newline=False is token doesn't contain a newline. |
|
|
|
""" |
|
|
|
if test_newline: |
|
|
|
newlines = token.count(self.newline_char) |
|
|
|
if newlines: |
|
|
|
self.line += newlines |
|
|
|
self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 |
|
|
|
|
|
|
|
self.char_pos += len(token) |
|
|
|
self.column = self.char_pos - self.line_start_pos |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Lexer: |
|
|
|
def __init__(self, tokens, ignore=()): |
|
|
|
assert all(isinstance(t, TokenDef) for t in tokens), tokens |
|
|
|
|
|
|
|
self.ignore = ignore |
|
|
|
self.newline_char = '\n' |
|
|
|
tokens = list(tokens) |
|
|
|
|
|
|
|
# Sanitization |
|
|
@@ -129,10 +152,7 @@ class Lexer(object): |
|
|
|
if t.pattern.min_width == 0: |
|
|
|
raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern)) |
|
|
|
|
|
|
|
token_names = {t.name for t in tokens} |
|
|
|
for t in ignore: |
|
|
|
if t not in token_names: |
|
|
|
raise LexError("Token '%s' was marked to ignore but it is not defined!" % t) |
|
|
|
assert set(ignore) <= {t.name for t in tokens} |
|
|
|
|
|
|
|
# Init |
|
|
|
self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())] |
|
|
@@ -147,46 +167,8 @@ class Lexer(object): |
|
|
|
|
|
|
|
self.mres = build_mres(tokens) |
|
|
|
|
|
|
|
|
|
|
|
def lex(self, stream): |
|
|
|
lex_pos = 0 |
|
|
|
line = 1 |
|
|
|
col_start_pos = 0 |
|
|
|
newline_types = list(self.newline_types) |
|
|
|
ignore_types = list(self.ignore_types) |
|
|
|
while True: |
|
|
|
for mre, type_from_index in self.mres: |
|
|
|
m = mre.match(stream, lex_pos) |
|
|
|
if m: |
|
|
|
value = m.group(0) |
|
|
|
type_ = type_from_index[m.lastindex] |
|
|
|
to_yield = type_ not in ignore_types |
|
|
|
|
|
|
|
if to_yield: |
|
|
|
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) |
|
|
|
end_col = t.column + len(value) |
|
|
|
if t.type in self.callback: |
|
|
|
t = self.callback[t.type](t) |
|
|
|
|
|
|
|
if type_ in newline_types: |
|
|
|
newlines = value.count(self.newline_char) |
|
|
|
if newlines: |
|
|
|
line += newlines |
|
|
|
last_newline_index = value.rindex(self.newline_char) + 1 |
|
|
|
col_start_pos = lex_pos + last_newline_index |
|
|
|
end_col = len(value) - last_newline_index |
|
|
|
|
|
|
|
if to_yield: |
|
|
|
t.end_line = line |
|
|
|
t.end_col = end_col |
|
|
|
yield t |
|
|
|
|
|
|
|
lex_pos += len(value) |
|
|
|
break |
|
|
|
else: |
|
|
|
if lex_pos < len(stream): |
|
|
|
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos) |
|
|
|
break |
|
|
|
return _Lex(self).lex(stream, self.newline_types, self.ignore_types) |
|
|
|
|
|
|
|
|
|
|
|
class ContextualLexer: |
|
|
@@ -218,33 +200,39 @@ class ContextualLexer: |
|
|
|
self.parser_state = state |
|
|
|
|
|
|
|
def lex(self, stream): |
|
|
|
lex_pos = 0 |
|
|
|
line = 1 |
|
|
|
col_start_pos = 0 |
|
|
|
newline_types = list(self.root_lexer.newline_types) |
|
|
|
ignore_types = list(self.root_lexer.ignore_types) |
|
|
|
l = _Lex(self.lexers[self.parser_state]) |
|
|
|
for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): |
|
|
|
yield x |
|
|
|
l.lexer = self.lexers[self.parser_state] |
|
|
|
|
|
|
|
|
|
|
|
class _Lex: |
|
|
|
"Built to serve both Lexer and ContextualLexer" |
|
|
|
def __init__(self, lexer): |
|
|
|
self.lexer = lexer |
|
|
|
|
|
|
|
def lex(self, stream, newline_types, ignore_types): |
|
|
|
newline_types = list(newline_types) |
|
|
|
newline_types = list(newline_types) |
|
|
|
line_ctr = LineCounter() |
|
|
|
|
|
|
|
while True: |
|
|
|
lexer = self.lexers[self.parser_state] |
|
|
|
lexer = self.lexer |
|
|
|
for mre, type_from_index in lexer.mres: |
|
|
|
m = mre.match(stream, lex_pos) |
|
|
|
m = mre.match(stream, line_ctr.char_pos) |
|
|
|
if m: |
|
|
|
value = m.group(0) |
|
|
|
type_ = type_from_index[m.lastindex] |
|
|
|
if type_ not in ignore_types: |
|
|
|
t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) |
|
|
|
t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) |
|
|
|
if t.type in lexer.callback: |
|
|
|
t = lexer.callback[t.type](t) |
|
|
|
yield t |
|
|
|
|
|
|
|
if type_ in newline_types: |
|
|
|
newlines = value.count(lexer.newline_char) |
|
|
|
if newlines: |
|
|
|
line += newlines |
|
|
|
col_start_pos = lex_pos + value.rindex(lexer.newline_char) |
|
|
|
lex_pos += len(value) |
|
|
|
lexer = yield t |
|
|
|
|
|
|
|
line_ctr.feed(value, type_ in newline_types) |
|
|
|
break |
|
|
|
else: |
|
|
|
if lex_pos < len(stream): |
|
|
|
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens) |
|
|
|
if line_ctr.char_pos < len(stream): |
|
|
|
raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) |
|
|
|
break |
|
|
|
|