|
|
@@ -127,7 +127,7 @@ class Token(Str): |
|
|
|
end_column: The next column after the end of the token. For example, |
|
|
|
if the token is a single character with a column value of 4, |
|
|
|
end_column will be 5. |
|
|
|
end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``) |
|
|
|
end_pos: the index where the token ends (basically ``start_pos + len(token)``) |
|
|
|
""" |
|
|
|
__slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') |
|
|
|
|
|
|
@@ -214,15 +214,13 @@ class LineCounter: |
|
|
|
|
|
|
|
|
|
|
|
class UnlessCallback:
    """Token callback that re-types a token when a scanner matches its value.

    The wrapped ``scanner`` must expose ``match(text, pos)`` returning either
    a ``(value, type)`` pair or a falsy value when nothing matches.
    """

    def __init__(self, scanner):
        self.scanner = scanner

    def __call__(self, t):
        """Return ``t``, with ``t.type`` replaced if the scanner matches ``t.value``."""
        # Match from offset 0 of the token's own text, not of the input stream.
        res = self.scanner.match(t.value, 0)
        if res:
            _value, t.type = res
        return t
|
|
|
|
|
|
|
|
|
|
@@ -254,34 +252,51 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): |
|
|
|
if strtok.pattern.flags <= retok.pattern.flags: |
|
|
|
embedded_strs.add(strtok) |
|
|
|
if unless: |
|
|
|
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) |
|
|
|
callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) |
|
|
|
|
|
|
|
terminals = [t for t in terminals if t not in embedded_strs] |
|
|
|
return terminals, callback |
|
|
|
|
|
|
|
|
|
|
|
def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
    """Compile ``terminals`` into a list of ``(compiled_regex, {group_index: name})`` pairs.

    Terminals are joined, ``max_size`` at a time, into one alternation of
    named groups. When ``match_whole`` is true a ``$`` is appended to every
    terminal pattern. When ``use_bytes`` is true the joined pattern is
    encoded as latin-1 before compilation.

    Raises:
        AssertionError: if a chunk of a single terminal still cannot be
            compiled, so there is nothing left to split.
    """
    # Python sets an unreasonable group limit (currently 100) in its re module
    # Worse, the only way to know we reached it is by catching an AssertionError!
    # This function recursively tries less and less groups until it's successful.
    postfix = '$' if match_whole else ''
    mres = []
    while terminals:
        chunk = terminals[:max_size]
        pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in chunk)
        if use_bytes:
            pattern = pattern.encode('latin-1')
        try:
            mre = re_.compile(pattern, g_regex_flags)
        except AssertionError:  # Yes, this is what Python provides us.. :/
            if max_size <= 1:
                # A chunk size of 0 would never consume any terminal and loop forever.
                raise
            # Keep the chunks that already compiled; only the remaining
            # terminals are retried with a smaller chunk size.
            mres.extend(_build_mres(terminals, max_size // 2, g_regex_flags, match_whole, re_, use_bytes))
            return mres

        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
        terminals = terminals[max_size:]
    return mres
|
|
|
class Scanner:
    """Matches text against a set of terminals compiled into combined regexes.

    Args:
        terminals: objects exposing ``name`` and ``pattern.to_regexp()``.
        g_regex_flags: flags passed to every ``re_.compile`` call.
        re_: regex module (or compatible object) providing ``compile``.
        use_bytes: when true, patterns are encoded as latin-1 before compiling.
        match_whole: when true, ``$`` is appended to every terminal pattern.
    """

    def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
        self.terminals = terminals
        self.g_regex_flags = g_regex_flags
        self.re_ = re_
        self.use_bytes = use_bytes
        self.match_whole = match_whole

        self._mres = self._build_mres(terminals, len(terminals))

    def _build_mres(self, terminals, max_size):
        """Return a list of ``(compiled_regex, {group_index: name})`` pairs.

        Raises:
            AssertionError: if a chunk of a single terminal still cannot be
                compiled, so there is nothing left to split.
        """
        # Python sets an unreasonable group limit (currently 100) in its re module
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries less and less groups until it's successful.
        postfix = '$' if self.match_whole else ''
        mres = []
        while terminals:
            chunk = terminals[:max_size]
            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in chunk)
            if self.use_bytes:
                pattern = pattern.encode('latin-1')
            try:
                mre = self.re_.compile(pattern, self.g_regex_flags)
            except AssertionError:  # Yes, this is what Python provides us.. :/
                if max_size <= 1:
                    # A chunk size of 0 would never consume any terminal and loop forever.
                    raise
                # Keep the chunks that already compiled; only the remaining
                # terminals are retried with a smaller chunk size.
                mres.extend(self._build_mres(terminals, max_size // 2))
                return mres

            mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
            terminals = terminals[max_size:]
        return mres

    def match(self, text, pos):
        """Return ``(matched_text, terminal_name)`` for the first regex matching at ``pos``, else None."""
        for mre, type_from_index in self._mres:
            m = mre.match(text, pos)
            if m:
                return m.group(0), type_from_index[m.lastindex]
        return None

    @property
    def allowed_types(self):
        """Set of all terminal names this scanner can produce."""
        return {v for m, tfi in self._mres for v in tfi.values()}


def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
    """Module-level wrapper around ``_build_mres``; still called elsewhere in this file."""
    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)
|
|
|
|
|
|
|
|
|
|
|
def _regexp_has_newline(r): |
|
|
@@ -341,9 +356,9 @@ class TraditionalLexer(Lexer): |
|
|
|
self.use_bytes = conf.use_bytes |
|
|
|
self.terminals_by_name = conf.terminals_by_name |
|
|
|
|
|
|
|
self._mres = None |
|
|
|
self._scanner = None |
|
|
|
|
|
|
|
def _build(self): |
|
|
|
def _build_scanner(self): |
|
|
|
terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes) |
|
|
|
assert all(self.callback.values()) |
|
|
|
|
|
|
@@ -354,19 +369,16 @@ class TraditionalLexer(Lexer): |
|
|
|
else: |
|
|
|
self.callback[type_] = f |
|
|
|
|
|
|
|
self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes) |
|
|
|
self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes) |
|
|
|
|
|
|
|
@property
def mres(self):
    """Compiled regex list, built by ``self._build()`` on first access."""
    if self._mres is None:
        self._build()
    return self._mres

# ``scanner`` is read as an attribute elsewhere in this file
# (``self.scanner.match``, ``self.scanner.allowed_types``), so it needs its
# own ``@property``.
@property
def scanner(self):
    """Scanner instance, built by ``self._build_scanner()`` on first access."""
    if self._scanner is None:
        self._build_scanner()
    return self._scanner
|
|
|
|
|
|
|
def match(self, text, pos):
    """Return the result of matching ``text`` at ``pos`` with this lexer's scanner.

    The result is whatever ``self.scanner.match(text, pos)`` returns.
    """
    # Delegate only to the scanner; match no longer reads ``self.mres``.
    return self.scanner.match(text, pos)
|
|
|
|
|
|
|
def lex(self, state, parser_state): |
|
|
|
with suppress(EOFError): |
|
|
@@ -378,7 +390,7 @@ class TraditionalLexer(Lexer): |
|
|
|
while line_ctr.char_pos < len(lex_state.text): |
|
|
|
res = self.match(lex_state.text, line_ctr.char_pos) |
|
|
|
if not res: |
|
|
|
allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types |
|
|
|
allowed = self.scanner.allowed_types - self.ignore_types |
|
|
|
if not allowed: |
|
|
|
allowed = {"<END-OF-FILE>"} |
|
|
|
raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, |
|
|
|