From 3bc070bc1dcbaa91a04f178b985c5250bafc492c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 15 Jun 2021 17:04:38 +0300 Subject: [PATCH] Change how propagate_positions works --- lark-stubs/lark.pyi | 4 ++-- lark/lark.py | 4 ++-- lark/lexer.py | 28 +++++++++++++------------- lark/parse_tree_builder.py | 40 +++++++++++++++----------------------- lark/parser_frontends.py | 16 +++++++-------- 5 files changed, 42 insertions(+), 50 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 27c6863..18748d1 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -33,7 +33,7 @@ class LarkOptions: regex: bool debug: bool keep_all_tokens: bool - propagate_positions: Union[bool, str] + propagate_positions: Union[bool, Callable] maybe_placeholders: bool lexer_callbacks: Dict[str, Callable[[Token], Token]] cache: Union[bool, str] @@ -77,7 +77,7 @@ class Lark: regex: bool = False, debug: bool = False, keep_all_tokens: bool = False, - propagate_positions: Union[bool, str] = False, + propagate_positions: Union[bool, Callable] = False, maybe_placeholders: bool = False, lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None, cache: Union[bool, str] = False, diff --git a/lark/lark.py b/lark/lark.py index 8e879cc..9863243 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -44,7 +44,7 @@ class LarkOptions(Serialize): Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster) propagate_positions Propagates (line, column, end_line, end_column) attributes into all tree branches. - Accepts ``False``, ``True``, or "ignore_ws", which will trim the whitespace around your trees. + Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating. maybe_placeholders When ``True``, the ``[]`` operator returns ``None`` when not matched. 
@@ -162,7 +162,7 @@ class LarkOptions(Serialize): assert_config(self.parser, ('earley', 'lalr', 'cyk', None)) if self.parser == 'earley' and self.transformer: - raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm.' + raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. ' 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)') if o: diff --git a/lark/lexer.py b/lark/lexer.py index 2925c35..7c2f979 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -133,20 +133,20 @@ class Token(Str): def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None, pos_in_stream=None): try: - self = super(Token, cls).__new__(cls, value) + inst = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: value = value.decode('latin1') - self = super(Token, cls).__new__(cls, value) - - self.type = type_ - self.start_pos = start_pos if start_pos is not None else pos_in_stream - self.value = value - self.line = line - self.column = column - self.end_line = end_line - self.end_column = end_column - self.end_pos = end_pos - return self + inst = super(Token, cls).__new__(cls, value) + + inst.type = type_ + inst.start_pos = start_pos if start_pos is not None else pos_in_stream + inst.value = value + inst.line = line + inst.column = column + inst.end_line = end_line + inst.end_column = end_column + inst.end_pos = end_pos + return inst @property def pos_in_stream(self): @@ -258,8 +258,8 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if unless: callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) - terminals = [t for t in terminals if t not in embedded_strs] - return terminals, callback + new_terminals = [t for t in terminals if t not in embedded_strs] + return new_terminals, callback diff --git a/lark/parse_tree_builder.py 
b/lark/parse_tree_builder.py index 7a854bc..b4929c6 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -23,8 +23,9 @@ class ExpandSingleChild: class PropagatePositions: - def __init__(self, node_builder): + def __init__(self, node_builder, node_filter=None): self.node_builder = node_builder + self.node_filter = node_filter def __call__(self, children): res = self.node_builder(children) @@ -33,44 +34,35 @@ class PropagatePositions: if isinstance(res, Tree): res_meta = res.meta - src_meta = self._pp_get_meta(children) - if src_meta is not None: - res_meta.line = src_meta.line - res_meta.column = src_meta.column - res_meta.start_pos = src_meta.start_pos + first_meta = self._pp_get_meta(children) + if first_meta is not None: + res_meta.line = first_meta.line + res_meta.column = first_meta.column + res_meta.start_pos = first_meta.start_pos res_meta.empty = False - src_meta = self._pp_get_meta(reversed(children)) - if src_meta is not None: - res_meta.end_line = src_meta.end_line - res_meta.end_column = src_meta.end_column - res_meta.end_pos = src_meta.end_pos + last_meta = self._pp_get_meta(reversed(children)) + if last_meta is not None: + res_meta.end_line = last_meta.end_line + res_meta.end_column = last_meta.end_column + res_meta.end_pos = last_meta.end_pos res_meta.empty = False return res def _pp_get_meta(self, children): for c in children: + if self.node_filter is not None and not self.node_filter(c): + continue if isinstance(c, Tree): if not c.meta.empty: return c.meta elif isinstance(c, Token): return c -class PropagatePositions_IgnoreWs(PropagatePositions): - def _pp_get_meta(self, children): - for c in children: - if isinstance(c, Tree): - if not c.meta.empty: - return c.meta - elif isinstance(c, Token): - if c and not c.isspace(): # Disregard whitespace-only tokens - return c - - def make_propagate_positions(option): - if option == "ignore_ws": - return PropagatePositions_IgnoreWs + if callable(option): + return 
partial(PropagatePositions, node_filter=option) elif option is True: return PropagatePositions elif option is False: diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index e066d9a..1818ca7 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -92,26 +92,26 @@ class ParsingFrontend(Serialize): def _verify_start(self, start=None): if start is None: - start = self.parser_conf.start - if len(start) > 1: - raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start) - start ,= start + start_decls = self.parser_conf.start + if len(start_decls) > 1: + raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls) + start ,= start_decls elif start not in self.parser_conf.start: raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start)) return start def parse(self, text, start=None, on_error=None): - start = self._verify_start(start) + chosen_start = self._verify_start(start) stream = text if self.skip_lexer else LexerThread(self.lexer, text) kw = {} if on_error is None else {'on_error': on_error} - return self.parser.parse(stream, start, **kw) + return self.parser.parse(stream, chosen_start, **kw) def parse_interactive(self, text=None, start=None): - start = self._verify_start(start) + chosen_start = self._verify_start(start) if self.parser_conf.parser_type != 'lalr': raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ") stream = text if self.skip_lexer else LexerThread(self.lexer, text) - return self.parser.parse_interactive(stream, start) + return self.parser.parse_interactive(stream, chosen_start) def get_frontend(parser, lexer):