ソースを参照

lexer.py: Refactored mres operations into a Scanner class.

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.12.0
Erez Sh 3年前
コミット
389e7fbf5c
1個のファイルの変更52行の追加40行の削除
  1. +52
    -40
      lark/lexer.py

+ 52
- 40
lark/lexer.py ファイルの表示

@@ -127,7 +127,7 @@ class Token(Str):
end_column: The next column after the end of the token. For example,
if the token is a single character with a column value of 4,
end_column will be 5.
end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``)
end_pos: the index where the token ends (basically ``start_pos + len(token)``)
"""
__slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')

@@ -214,15 +214,13 @@ class LineCounter:


class UnlessCallback:
    # NOTE(review): this span is a diff view with the +/- markers stripped — it
    # interleaves the pre-refactor lines (mres-based) with the post-refactor
    # lines (scanner-based). As plain Python the second __init__ would simply
    # shadow the first.

    # Old version (apparently removed by this commit): stores a list of
    # (compiled_regex, group_index -> terminal_name) pairs.
    def __init__(self, mres):
        self.mres = mres
    # New version (apparently added by this commit): delegates to a Scanner.
    def __init__(self, scanner):
        self.scanner = scanner

    def __call__(self, t):
        # Old body: scan the mres pairs by hand and rewrite t.type on the
        # first match, then stop.
        for mre, type_from_index in self.mres:
            m = mre.match(t.value)
            if m:
                t.type = type_from_index[m.lastindex]
                break
        # New body: Scanner.match returns (matched_text, terminal_name) or
        # None; only the terminal name is used to retype the token.
        res = self.scanner.match(t.value, 0)
        if res:
            _value, t.type = res
        return t


@@ -254,34 +252,51 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes):
if strtok.pattern.flags <= retok.pattern.flags:
embedded_strs.add(strtok)
if unless:
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))

terminals = [t for t in terminals if t not in embedded_strs]
return terminals, callback


def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
# Python sets an unreasonable group limit (currently 100) in its re module
# Worse, the only way to know we reached it is by catching an AssertionError!
# This function recursively tries less and less groups until it's successful.
postfix = '$' if match_whole else ''
mres = []
while terminals:
pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
if use_bytes:
pattern = pattern.encode('latin-1')
try:
mre = re_.compile(pattern, g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)

mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
terminals = terminals[max_size:]
return mres
class Scanner:
    """Matches text against a set of terminals via merged compiled regexes.

    Every terminal pattern becomes a named group in one big alternation;
    ``_mres`` holds a list of ``(compiled_regex, group_index -> name)``
    pairs, split into several regexes only when the re module's group
    limit forces it.
    """
    def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
        # Configuration is stored before _build_mres runs, since it reads
        # match_whole, use_bytes, re_ and g_regex_flags from self.
        self.terminals = terminals
        self.g_regex_flags = g_regex_flags
        self.re_ = re_
        self.use_bytes = use_bytes
        self.match_whole = match_whole

        self._mres = self._build_mres(terminals, len(terminals))

    def _build_mres(self, terminals, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries less and less groups until it's successful.
        postfix = '$' if self.match_whole else ''
        mres = []
        while terminals:
            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
            if self.use_bytes:
                pattern = pattern.encode('latin-1')
            try:
                mre = self.re_.compile(pattern, self.g_regex_flags)
            except AssertionError:  # Yes, this is what Python provides us.. :/
                # Bug fix: keep the chunks compiled so far instead of
                # discarding them — the original returned only the recursive
                # result, losing every terminal already in `mres`.
                return mres + self._build_mres(terminals, max_size//2)
            mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
            terminals = terminals[max_size:]
        return mres

def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
    # Thin wrapper: compile all terminals via _build_mres, starting with a
    # single chunk of len(terminals). Appears to be removed by this commit
    # in favour of the Scanner class.
    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)
def match(self, text, pos):
for mre, type_from_index in self._mres:
m = mre.match(text, pos)
if m:
return m.group(0), type_from_index[m.lastindex]

@property
def allowed_types(self):
return {v for m, tfi in self._mres for v in tfi.values()}


def _regexp_has_newline(r):
@@ -341,9 +356,9 @@ class TraditionalLexer(Lexer):
self.use_bytes = conf.use_bytes
self.terminals_by_name = conf.terminals_by_name

self._mres = None
self._scanner = None

def _build(self):
def _build_scanner(self):
terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
assert all(self.callback.values())

@@ -354,19 +369,16 @@ class TraditionalLexer(Lexer):
else:
self.callback[type_] = f

self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)
self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)

@property
def mres(self):
if self._mres is None:
self._build()
return self._mres
def scanner(self):
if self._scanner is None:
self._build_scanner()
return self._scanner

def match(self, text, pos):
for mre, type_from_index in self.mres:
m = mre.match(text, pos)
if m:
return m.group(0), type_from_index[m.lastindex]
return self.scanner.match(text, pos)

def lex(self, state, parser_state):
with suppress(EOFError):
@@ -378,7 +390,7 @@ class TraditionalLexer(Lexer):
while line_ctr.char_pos < len(lex_state.text):
res = self.match(lex_state.text, line_ctr.char_pos)
if not res:
allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
allowed = self.scanner.allowed_types - self.ignore_types
if not allowed:
allowed = {"<END-OF-FILE>"}
raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,


読み込み中…
キャンセル
保存