Browse files

Merge pull request #932 from lark-parser/refactor_jun2021

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.12.0
Erez Shinan, 3 years ago
committed by GitHub
parent
commit eb2b0b9166
No known key found for this signature in database. GPG key ID: 4AEE18F83AFDEB23
7 changed files with 118 additions and 109 deletions
1. lark-stubs/lark.pyi (+2 -2)
2. lark/exceptions.py (+9 -6)
3. lark/lark.py (+8 -8)
4. lark/lexer.py (+73 -59)
5. lark/parse_tree_builder.py (+16 -24)
6. lark/parser_frontends.py (+8 -8)
7. lark/parsers/lalr_parser.py (+2 -2)

lark-stubs/lark.pyi (+2 -2)

@@ -33,7 +33,7 @@ class LarkOptions:
     regex: bool
     debug: bool
     keep_all_tokens: bool
-    propagate_positions: Union[bool, str]
+    propagate_positions: Union[bool, Callable]
     maybe_placeholders: bool
     lexer_callbacks: Dict[str, Callable[[Token], Token]]
     cache: Union[bool, str]

@@ -77,7 +77,7 @@ class Lark:
         regex: bool = False,
         debug: bool = False,
         keep_all_tokens: bool = False,
-        propagate_positions: Union[bool, str] = False,
+        propagate_positions: Union[bool, Callable] = False,
         maybe_placeholders: bool = False,
         lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
         cache: Union[bool, str] = False,
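
The widened annotation means propagate_positions now accepts a callable where it previously accepted the "ignore_ws" string. A minimal sketch of what the new stub admits (toy grammar, for illustration only):

    from lark import Lark

    # Boolean form, unchanged by this PR:
    Lark("start: /x/+", propagate_positions=True)

    # New callable form: a per-node predicate, applied when positions
    # are propagated (see lark/parse_tree_builder.py below):
    Lark("start: /x/+", propagate_positions=lambda node: True)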


lark/exceptions.py (+9 -6)

@@ -129,6 +129,8 @@ class UnexpectedInput(LarkError):

 class UnexpectedEOF(ParseError, UnexpectedInput):
     def __init__(self, expected, state=None, terminals_by_name=None):
+        super(UnexpectedEOF, self).__init__()
+
         self.expected = expected
         self.state = state
         from .lexer import Token

@@ -138,7 +140,6 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
         self.column = -1
         self._terminals_by_name = terminals_by_name

-        super(UnexpectedEOF, self).__init__()

     def __str__(self):
         message = "Unexpected end-of-input. "

@@ -149,6 +150,8 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
                  terminals_by_name=None, considered_rules=None):
+        super(UnexpectedCharacters, self).__init__()
+
         # TODO considered_tokens and allowed can be figured out using state
         self.line = line
         self.column = column

@@ -167,7 +170,6 @@ class UnexpectedCharacters(LexError, UnexpectedInput):
         self.char = seq[lex_pos]
         self._context = self.get_context(seq)

-        super(UnexpectedCharacters, self).__init__()

     def __str__(self):
         message = "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column)

@@ -190,6 +192,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):
     """

     def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None):
+        super(UnexpectedToken, self).__init__()
+
         # TODO considered_rules and expected can be figured out using state
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')

@@ -204,7 +208,6 @@ class UnexpectedToken(ParseError, UnexpectedInput):
         self._terminals_by_name = terminals_by_name
         self.token_history = token_history

-        super(UnexpectedToken, self).__init__()

     @property
     def accepts(self):

@@ -236,10 +239,10 @@ class VisitError(LarkError):
     """

     def __init__(self, rule, obj, orig_exc):
+        self.obj = obj
+        self.orig_exc = orig_exc
+
         message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
         super(VisitError, self).__init__(message)

-        self.obj = obj
-        self.orig_exc = orig_exc
-
 ###}

lark/lark.py (+8 -8)

@@ -44,7 +44,7 @@ class LarkOptions(Serialize):
             Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)
     propagate_positions
             Propagates (line, column, end_line, end_column) attributes into all tree branches.
-            Accepts ``False``, ``True``, or "ignore_ws", which will trim the whitespace around your trees.
+            Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating.
     maybe_placeholders
             When ``True``, the ``[]`` operator returns ``None`` when not matched.

@@ -162,7 +162,7 @@ class LarkOptions(Serialize):
         assert_config(self.parser, ('earley', 'lalr', 'cyk', None))

         if self.parser == 'earley' and self.transformer:
-            raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm.'
+            raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. '
                                      'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)')

         if o:

@@ -451,11 +451,11 @@ class Lark(Serialize):
             d = f
         else:
             d = pickle.load(f)
-        memo = d['memo']
+        memo_json = d['memo']
         data = d['data']

-        assert memo
-        memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
+        assert memo_json
+        memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
         options = dict(data['options'])
         if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
             raise ConfigurationError("Some options are not allowed when loading a Parser: {}"

@@ -512,11 +512,11 @@ class Lark(Serialize):

             Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
         """
-        package = FromPackageLoader(package, search_paths)
-        full_path, text = package(None, grammar_path)
+        package_loader = FromPackageLoader(package, search_paths)
+        full_path, text = package_loader(None, grammar_path)
         options.setdefault('source_path', full_path)
         options.setdefault('import_paths', [])
-        options['import_paths'].append(package)
+        options['import_paths'].append(package_loader)
         return cls(text, **options)

     def __repr__(self):


lark/lexer.py (+73 -59)

@@ -127,26 +127,26 @@ class Token(Str):
         end_column: The next column after the end of the token. For example,
             if the token is a single character with a column value of 4,
             end_column will be 5.
-        end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``)
+        end_pos: the index where the token ends (basically ``start_pos + len(token)``)
     """
     __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos')

     def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None, pos_in_stream=None):
         try:
-            self = super(Token, cls).__new__(cls, value)
+            inst = super(Token, cls).__new__(cls, value)
         except UnicodeDecodeError:
             value = value.decode('latin1')
-            self = super(Token, cls).__new__(cls, value)
-        self.type = type_
-        self.start_pos = start_pos if start_pos is not None else pos_in_stream
-        self.value = value
-        self.line = line
-        self.column = column
-        self.end_line = end_line
-        self.end_column = end_column
-        self.end_pos = end_pos
-        return self
+            inst = super(Token, cls).__new__(cls, value)
+        inst.type = type_
+        inst.start_pos = start_pos if start_pos is not None else pos_in_stream
+        inst.value = value
+        inst.line = line
+        inst.column = column
+        inst.end_line = end_line
+        inst.end_column = end_column
+        inst.end_pos = end_pos
+        return inst

     @property
     def pos_in_stream(self):

@@ -214,15 +214,13 @@ class LineCounter:


 class UnlessCallback:
-    def __init__(self, mres):
-        self.mres = mres
+    def __init__(self, scanner):
+        self.scanner = scanner

     def __call__(self, t):
-        for mre, type_from_index in self.mres:
-            m = mre.match(t.value)
-            if m:
-                t.type = type_from_index[m.lastindex]
-                break
+        res = self.scanner.match(t.value, 0)
+        if res:
+            _value, t.type = res
         return t

@@ -237,6 +235,11 @@ class CallChain:
         return self.callback2(t) if self.cond(t2) else t2


+def _get_match(re_, regexp, s, flags):
+    m = re_.match(regexp, s, flags)
+    if m:
+        return m.group(0)
+
 def _create_unless(terminals, g_regex_flags, re_, use_bytes):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()

@@ -248,40 +251,54 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes):
             if strtok.priority > retok.priority:
                 continue
             s = strtok.pattern.value
-            m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
-            if m and m.group(0) == s:
+            if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags):
                 unless.append(strtok)
                 if strtok.pattern.flags <= retok.pattern.flags:
                     embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
-
-    terminals = [t for t in terminals if t not in embedded_strs]
-    return terminals, callback
-
-
-def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes):
-    # Python sets an unreasonable group limit (currently 100) in its re module
-    # Worse, the only way to know we reached it is by catching an AssertionError!
-    # This function recursively tries less and less groups until it's successful.
-    postfix = '$' if match_whole else ''
-    mres = []
-    while terminals:
-        pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
-        if use_bytes:
-            pattern = pattern.encode('latin-1')
-        try:
-            mre = re_.compile(pattern, g_regex_flags)
-        except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes)
-
-        mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
-        terminals = terminals[max_size:]
-    return mres
-
-
-def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False):
-    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes)
+            callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes))
+
+    new_terminals = [t for t in terminals if t not in embedded_strs]
+    return new_terminals, callback
+
+
+class Scanner:
+    def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False):
+        self.terminals = terminals
+        self.g_regex_flags = g_regex_flags
+        self.re_ = re_
+        self.use_bytes = use_bytes
+        self.match_whole = match_whole
+
+        self.allowed_types = {t.name for t in self.terminals}
+
+        self._mres = self._build_mres(terminals, len(terminals))
+
+    def _build_mres(self, terminals, max_size):
+        # Python sets an unreasonable group limit (currently 100) in its re module
+        # Worse, the only way to know we reached it is by catching an AssertionError!
+        # This function recursively tries less and less groups until it's successful.
+        postfix = '$' if self.match_whole else ''
+        mres = []
+        while terminals:
+            pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size])
+            if self.use_bytes:
+                pattern = pattern.encode('latin-1')
+            try:
+                mre = self.re_.compile(pattern, self.g_regex_flags)
+            except AssertionError:  # Yes, this is what Python provides us.. :/
+                return self._build_mres(terminals, max_size//2)
+
+            mres.append((mre, {i: n for n, i in mre.groupindex.items()}))
+            terminals = terminals[max_size:]
+        return mres
+
+    def match(self, text, pos):
+        for mre, type_from_index in self._mres:
+            m = mre.match(text, pos)
+            if m:
+                return m.group(0), type_from_index[m.lastindex]


 def _regexp_has_newline(r):

@@ -341,9 +358,9 @@ class TraditionalLexer(Lexer):
         self.use_bytes = conf.use_bytes
         self.terminals_by_name = conf.terminals_by_name

-        self._mres = None
+        self._scanner = None

-    def _build(self):
+    def _build_scanner(self):
         terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
         assert all(self.callback.values())

@@ -354,19 +371,16 @@ class TraditionalLexer(Lexer):
             else:
                 self.callback[type_] = f

-        self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes)
+        self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes)

     @property
-    def mres(self):
-        if self._mres is None:
-            self._build()
-        return self._mres
+    def scanner(self):
+        if self._scanner is None:
+            self._build_scanner()
+        return self._scanner

     def match(self, text, pos):
-        for mre, type_from_index in self.mres:
-            m = mre.match(text, pos)
-            if m:
-                return m.group(0), type_from_index[m.lastindex]
+        return self.scanner.match(text, pos)

     def lex(self, state, parser_state):
         with suppress(EOFError):

@@ -378,7 +392,7 @@ class TraditionalLexer(Lexer):
             while line_ctr.char_pos < len(lex_state.text):
                 res = self.match(lex_state.text, line_ctr.char_pos)
                 if not res:
-                    allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types
+                    allowed = self.scanner.allowed_types - self.ignore_types
                     if not allowed:
                         allowed = {"<END-OF-FILE>"}
                     raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
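
The new Scanner class packages what the removed build_mres/_build_mres free functions did, plus an allowed_types set that the error path above uses for reporting. A hedged usage sketch, assuming Scanner, TerminalDef and PatternRE are importable from lark.lexer as in this revision:

    import re
    from lark.lexer import Scanner, TerminalDef, PatternRE

    # Two terminals; Scanner joins them into one alternation regex and only
    # splits into several compiled patterns if Python's group limit is hit.
    terminals = [
        TerminalDef('NUMBER', PatternRE(r'[0-9]+')),
        TerminalDef('WORD', PatternRE(r'[a-z]+')),
    ]
    scanner = Scanner(terminals, g_regex_flags=0, re_=re, use_bytes=False)

    print(scanner.allowed_types)      # {'NUMBER', 'WORD'}
    print(scanner.match('42abc', 0))  # ('42', 'NUMBER')
    print(scanner.match('!', 0))      # None; the lexer raises UnexpectedCharacters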


lark/parse_tree_builder.py (+16 -24)

@@ -23,8 +23,9 @@ class ExpandSingleChild:


 class PropagatePositions:
-    def __init__(self, node_builder):
+    def __init__(self, node_builder, node_filter=None):
         self.node_builder = node_builder
+        self.node_filter = node_filter

     def __call__(self, children):
         res = self.node_builder(children)

@@ -33,44 +34,35 @@ class PropagatePositions:
         if isinstance(res, Tree):
             res_meta = res.meta

-            src_meta = self._pp_get_meta(children)
-            if src_meta is not None:
-                res_meta.line = src_meta.line
-                res_meta.column = src_meta.column
-                res_meta.start_pos = src_meta.start_pos
+            first_meta = self._pp_get_meta(children)
+            if first_meta is not None:
+                res_meta.line = first_meta.line
+                res_meta.column = first_meta.column
+                res_meta.start_pos = first_meta.start_pos
                 res_meta.empty = False

-            src_meta = self._pp_get_meta(reversed(children))
-            if src_meta is not None:
-                res_meta.end_line = src_meta.end_line
-                res_meta.end_column = src_meta.end_column
-                res_meta.end_pos = src_meta.end_pos
+            last_meta = self._pp_get_meta(reversed(children))
+            if last_meta is not None:
+                res_meta.end_line = last_meta.end_line
+                res_meta.end_column = last_meta.end_column
+                res_meta.end_pos = last_meta.end_pos
                 res_meta.empty = False

         return res

     def _pp_get_meta(self, children):
         for c in children:
+            if self.node_filter is not None and not self.node_filter(c):
+                continue
             if isinstance(c, Tree):
                 if not c.meta.empty:
                     return c.meta
             elif isinstance(c, Token):
                 return c

-class PropagatePositions_IgnoreWs(PropagatePositions):
-    def _pp_get_meta(self, children):
-        for c in children:
-            if isinstance(c, Tree):
-                if not c.meta.empty:
-                    return c.meta
-            elif isinstance(c, Token):
-                if c and not c.isspace():  # Disregard whitespace-only tokens
-                    return c
-

 def make_propagate_positions(option):
-    if option == "ignore_ws":
-        return PropagatePositions_IgnoreWs
+    if callable(option):
+        return partial(PropagatePositions, node_filter=option)
     elif option is True:
         return PropagatePositions
     elif option is False:
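
With the "ignore_ws" shortcut gone, its behavior can be recovered through the new node_filter hook: the callable receives each child and returns True if that child may contribute position info. A hedged sketch (the grammar is made up):

    from lark import Lark, Token

    def not_whitespace(node):
        # Mirrors the removed PropagatePositions_IgnoreWs: disregard
        # whitespace-only tokens when picking position sources.
        return not (isinstance(node, Token) and node.isspace())

    parser = Lark(r"""
        start: WORD (WS WORD)*
        WORD: /[a-z]+/
        WS: / +/
    """, parser="lalr", propagate_positions=not_whitespace)

    tree = parser.parse("hello world")
    print(tree.meta.line, tree.meta.column)  # 1 1, taken from the first WORD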


lark/parser_frontends.py (+8 -8)

@@ -92,26 +92,26 @@ class ParsingFrontend(Serialize):
     def _verify_start(self, start=None):
         if start is None:
-            start = self.parser_conf.start
-            if len(start) > 1:
-                raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
-            start ,= start
+            start_decls = self.parser_conf.start
+            if len(start_decls) > 1:
+                raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls)
+            start ,= start_decls
         elif start not in self.parser_conf.start:
             raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start))
         return start

     def parse(self, text, start=None, on_error=None):
-        start = self._verify_start(start)
+        chosen_start = self._verify_start(start)
         stream = text if self.skip_lexer else LexerThread(self.lexer, text)
         kw = {} if on_error is None else {'on_error': on_error}
-        return self.parser.parse(stream, start, **kw)
+        return self.parser.parse(stream, chosen_start, **kw)

     def parse_interactive(self, text=None, start=None):
-        start = self._verify_start(start)
+        chosen_start = self._verify_start(start)
         if self.parser_conf.parser_type != 'lalr':
             raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ")
         stream = text if self.skip_lexer else LexerThread(self.lexer, text)
-        return self.parser.parse_interactive(stream, start)
+        return self.parser.parse_interactive(stream, chosen_start)


 def get_frontend(parser, lexer):
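
As _verify_start above shows, a parser constructed with several start rules requires an explicit start= argument when parsing. A small illustration (hypothetical grammar):

    from lark import Lark

    parser = Lark("""
        greeting: "hello"
        farewell: "bye"
    """, parser="lalr", start=["greeting", "farewell"])

    parser.parse("hello", start="greeting")  # ok
    parser.parse("hello")  # ConfigurationError: must specify which start rule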


lark/parsers/lalr_parser.py (+2 -2)

@@ -178,8 +178,8 @@ class _Parser(object):
             for token in state.lexer.lex(state):
                 state.feed_token(token)

-            token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
-            return state.feed_token(token, True)
+            end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+            return state.feed_token(end_token, True)
         except UnexpectedInput as e:
             try:
                 e.interactive_parser = InteractiveParser(self, state, state.lexer)

