diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 155b774..27c6863 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -33,7 +33,7 @@ class LarkOptions: regex: bool debug: bool keep_all_tokens: bool - propagate_positions: bool + propagate_positions: Union[bool, str] maybe_placeholders: bool lexer_callbacks: Dict[str, Callable[[Token], Token]] cache: Union[bool, str] @@ -77,7 +77,7 @@ class Lark: regex: bool = False, debug: bool = False, keep_all_tokens: bool = False, - propagate_positions: bool = False, + propagate_positions: Union[bool, str] = False, maybe_placeholders: bool = False, lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None, cache: Union[bool, str] = False, diff --git a/lark-stubs/lexer.pyi b/lark-stubs/lexer.pyi index fa42322..004865c 100644 --- a/lark-stubs/lexer.pyi +++ b/lark-stubs/lexer.pyi @@ -76,7 +76,7 @@ class TerminalDef: class Token(str): type: str - pos_in_stream: int + start_pos: int value: Any line: int column: int @@ -84,7 +84,7 @@ class Token(str): end_column: int end_pos: int - def __init__(self, type_: str, value: Any, pos_in_stream: int = None, line: int = None, column: int = None, end_line: int = None, end_column: int = None, end_pos: int = None) -> None: + def __init__(self, type_: str, value: Any, start_pos: int = None, line: int = None, column: int = None, end_line: int = None, end_column: int = None, end_pos: int = None) -> None: ... def update(self, type_: Optional[str] = None, value: Optional[Any] = None) -> Token: diff --git a/lark/__init__.py b/lark/__init__.py index 6ca88da..f056182 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -7,4 +7,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken, from .lexer import Token from .lark import Lark -__version__ = "0.11.3" +__version__ = "0.11.4" diff --git a/lark/lark.py b/lark/lark.py index 6c6a239..8e879cc 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -44,8 +44,9 @@ class LarkOptions(Serialize): Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster) propagate_positions Propagates (line, column, end_line, end_column) attributes into all tree branches. + Accepts ``False``, ``True``, or "ignore_ws", which will trim the whitespace around your trees. maybe_placeholders - When True, the ``[]`` operator returns ``None`` when not matched. + When ``True``, the ``[]`` operator returns ``None`` when not matched. When ``False``, ``[]`` behaves like the ``?`` operator, and returns no value at all. (default= ``False``. Recommended to set to ``True``) @@ -145,7 +146,7 @@ class LarkOptions(Serialize): for name, default in self._defaults.items(): if name in o: value = o.pop(name) - if isinstance(default, bool) and name not in ('cache', 'use_bytes'): + if isinstance(default, bool) and name not in ('cache', 'use_bytes', 'propagate_positions'): value = bool(value) else: value = default @@ -573,7 +574,7 @@ class Lark(Serialize): @property def source(self): - warn("Lark.source attribute has been renamed to Lark.source_path", DeprecationWarning) + warn("Attribute Lark.source was renamed to Lark.source_path", DeprecationWarning) return self.source_path @source.setter @@ -582,7 +583,7 @@ class Lark(Serialize): @property def grammar_source(self): - warn("Lark.grammar_source attribute has been renamed to Lark.source_grammar", DeprecationWarning) + warn("Attribute Lark.grammar_source was renamed to Lark.source_grammar", DeprecationWarning) return self.source_grammar @grammar_source.setter diff --git a/lark/lexer.py b/lark/lexer.py index 4d18704..a2aefd2 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -6,6 +6,7 @@ from .utils import Str, classify, get_regexp_width, Py36, Serialize, suppress from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken ###{standalone +from warnings import warn from copy import copy @@ -128,9 +129,9 @@ class Token(Str): end_column will be 5. end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``) """ - __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') + __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') - def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None, end_pos=None): + def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None, pos_in_stream=None): try: self = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: @@ -138,7 +139,7 @@ class Token(Str): self = super(Token, cls).__new__(cls, value) self.type = type_ - self.pos_in_stream = pos_in_stream + self.start_pos = start_pos if start_pos is not None else pos_in_stream self.value = value self.line = line self.column = column @@ -147,6 +148,11 @@ class Token(Str): self.end_pos = end_pos return self + @property + def pos_in_stream(self): + warn("Attribute Token.pos_in_stream was renamed to Token.start_pos", DeprecationWarning) + return self.start_pos + def update(self, type_=None, value=None): return Token.new_borrow_pos( type_ if type_ is not None else self.type, @@ -156,16 +162,16 @@ class Token(Str): @classmethod def new_borrow_pos(cls, type_, value, borrow_t): - return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos) + return cls(type_, value, borrow_t.start_pos, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos) def __reduce__(self): - return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column)) + return (self.__class__, (self.type, self.value, self.start_pos, self.line, self.column)) def __repr__(self): return 'Token(%r, %r)' % (self.type, self.value) def __deepcopy__(self, memo): - return Token(self.type, self.value, self.pos_in_stream, self.line, self.column) + return Token(self.type, self.value, self.start_pos, self.line, self.column) def __eq__(self, other): if isinstance(other, Token) and self.type != other.type: diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 569761a..7a854bc 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -1,4 +1,4 @@ -from .exceptions import GrammarError +from .exceptions import GrammarError, ConfigurationError from .lexer import Token from .tree import Tree from .visitors import InlineTransformer # XXX Deprecated @@ -21,6 +21,7 @@ class ExpandSingleChild: return self.node_builder(children) + class PropagatePositions: def __init__(self, node_builder): self.node_builder = node_builder @@ -31,40 +32,52 @@ class PropagatePositions: # local reference to Tree.meta reduces number of presence checks if isinstance(res, Tree): res_meta = res.meta - for c in children: - if isinstance(c, Tree): - child_meta = c.meta - if not child_meta.empty: - res_meta.line = child_meta.line - res_meta.column = child_meta.column - res_meta.start_pos = child_meta.start_pos - res_meta.empty = False - break - elif isinstance(c, Token): - res_meta.line = c.line - res_meta.column = c.column - res_meta.start_pos = c.pos_in_stream - res_meta.empty = False - break - - for c in reversed(children): - if isinstance(c, Tree): - child_meta = c.meta - if not child_meta.empty: - res_meta.end_line = child_meta.end_line - res_meta.end_column = child_meta.end_column - res_meta.end_pos = child_meta.end_pos - res_meta.empty = False - break - elif isinstance(c, Token): - res_meta.end_line = c.end_line - res_meta.end_column = c.end_column - res_meta.end_pos = c.end_pos - res_meta.empty = False - break + + src_meta = self._pp_get_meta(children) + if src_meta is not None: + res_meta.line = src_meta.line + res_meta.column = src_meta.column + res_meta.start_pos = src_meta.start_pos + res_meta.empty = False + + src_meta = self._pp_get_meta(reversed(children)) + if src_meta is not None: + res_meta.end_line = src_meta.end_line + res_meta.end_column = src_meta.end_column + res_meta.end_pos = src_meta.end_pos + res_meta.empty = False return res + def _pp_get_meta(self, children): + for c in children: + if isinstance(c, Tree): + if not c.meta.empty: + return c.meta + elif isinstance(c, Token): + return c + +class PropagatePositions_IgnoreWs(PropagatePositions): + def _pp_get_meta(self, children): + for c in children: + if isinstance(c, Tree): + if not c.meta.empty: + return c.meta + elif isinstance(c, Token): + if c and not c.isspace(): # Disregard whitespace-only tokens + return c + + +def make_propagate_positions(option): + if option == "ignore_ws": + return PropagatePositions_IgnoreWs + elif option is True: + return PropagatePositions + elif option is False: + return None + + raise ConfigurationError('Invalid option for propagate_positions: %r' % option) + class ChildFilter: def __init__(self, to_include, append_none, node_builder): @@ -320,6 +333,8 @@ class ParseTreeBuilder: self.rule_builders = list(self._init_builders(rules)) def _init_builders(self, rules): + propagate_positions = make_propagate_positions(self.propagate_positions) + for rule in rules: options = rule.options keep_all_tokens = options.keep_all_tokens @@ -328,7 +343,7 @@ class ParseTreeBuilder: wrapper_chain = list(filter(None, [ (expand_single_child and not rule.alias) and ExpandSingleChild, maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None), - self.propagate_positions and PropagatePositions, + propagate_positions, self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), self.ambiguous and partial(AmbiguousIntermediateExpander, self.tree_class) ]))