From ec2ba8826ea5f396abab063f47ceaf914333e04c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 28 Jun 2021 12:11:07 +0300 Subject: [PATCH 01/13] Docs fix + cleanup --- lark/lexer.py | 2 +- lark/utils.py | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index a2aefd2..4062c2d 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -120,7 +120,7 @@ class Token(Str): Attributes: type: Name of the token (as specified in grammar) value: Value of the token (redundant, as ``token.value == token`` will always be true) - pos_in_stream: The index of the token in the text + start_pos: The index of the token in the text line: The line of the token in the text (starting with 1) column: The column of the token in the text (starting with 1) end_line: The line where the token ends diff --git a/lark/utils.py b/lark/utils.py index 70516e6..b9d7ac3 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -241,17 +241,6 @@ except ImportError: pass -try: - compare = cmp -except NameError: - def compare(a, b): - if a == b: - return 0 - elif a > b: - return 1 - return -1 - - class Enumerator(Serialize): def __init__(self): self.enums = {} From 389e7fbf5cc4ff8973ceb36e9823e6984df0941b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 6 Jun 2021 17:02:41 +0300 Subject: [PATCH 02/13] lexer.py: Refactored mres operations into a Scanner class. --- lark/lexer.py | 92 +++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 40 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 4062c2d..7a30d6d 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -127,7 +127,7 @@ class Token(Str): end_column: The next column after the end of the token. For example, if the token is a single character with a column value of 4, end_column will be 5. - end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``) + end_pos: the index where the token ends (basically ``start_pos + len(token)``) """ __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') @@ -214,15 +214,13 @@ class LineCounter: class UnlessCallback: - def __init__(self, mres): - self.mres = mres + def __init__(self, scanner): + self.scanner = scanner def __call__(self, t): - for mre, type_from_index in self.mres: - m = mre.match(t.value) - if m: - t.type = type_from_index[m.lastindex] - break + res = self.scanner.match(t.value, 0) + if res: + _value, t.type = res return t @@ -254,34 +252,51 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) + callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) terminals = [t for t in terminals if t not in embedded_strs] return terminals, callback -def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes): - # Python sets an unreasonable group limit (currently 100) in its re module - # Worse, the only way to know we reached it is by catching an AssertionError! - # This function recursively tries less and less groups until it's successful. 
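# Illustrative sketch (outside the patch, stated assumption: an older CPython <= 3.10)
# of the group limit the comment above refers to -- there, exceeding ~100 capturing
# groups makes re.compile() raise a bare AssertionError:
import re

too_many_groups = '|'.join('(?P<T%d>x)' % i for i in range(200))  # 200 named groups
try:
    re.compile(too_many_groups)
except AssertionError:
    # e.g. "sorry, but this version only supports 100 named groups" -- the only
    # signal available, which is why _build_mres retries with smaller batches.
    pass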
- postfix = '$' if match_whole else '' - mres = [] - while terminals: - pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) - if use_bytes: - pattern = pattern.encode('latin-1') - try: - mre = re_.compile(pattern, g_regex_flags) - except AssertionError: # Yes, this is what Python provides us.. :/ - return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes) - mres.append((mre, {i: n for n, i in mre.groupindex.items()})) - terminals = terminals[max_size:] - return mres +class Scanner: + def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): + self.terminals = terminals + self.g_regex_flags = g_regex_flags + self.re_ = re_ + self.use_bytes = use_bytes + self.match_whole = match_whole + + self._mres = self._build_mres(terminals, len(terminals)) + + def _build_mres(self, terminals, max_size): + # Python sets an unreasonable group limit (currently 100) in its re module + # Worse, the only way to know we reached it is by catching an AssertionError! + # This function recursively tries less and less groups until it's successful. + postfix = '$' if self.match_whole else '' + mres = [] + while terminals: + pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) + if self.use_bytes: + pattern = pattern.encode('latin-1') + try: + mre = self.re_.compile(pattern, self.g_regex_flags) + except AssertionError: # Yes, this is what Python provides us.. :/ + return self._build_mres(terminals, max_size//2) + mres.append((mre, {i: n for n, i in mre.groupindex.items()})) + terminals = terminals[max_size:] + return mres -def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False): - return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes) + def match(self, text, pos): + for mre, type_from_index in self._mres: + m = mre.match(text, pos) + if m: + return m.group(0), type_from_index[m.lastindex] + + @property + def allowed_types(self): + return {v for m, tfi in self._mres for v in tfi.values()} def _regexp_has_newline(r): @@ -341,9 +356,9 @@ class TraditionalLexer(Lexer): self.use_bytes = conf.use_bytes self.terminals_by_name = conf.terminals_by_name - self._mres = None + self._scanner = None - def _build(self): + def _build_scanner(self): terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes) assert all(self.callback.values()) @@ -354,19 +369,16 @@ class TraditionalLexer(Lexer): else: self.callback[type_] = f - self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes) + self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes) @property - def mres(self): - if self._mres is None: - self._build() - return self._mres + def scanner(self): + if self._scanner is None: + self._build_scanner() + return self._scanner def match(self, text, pos): - for mre, type_from_index in self.mres: - m = mre.match(text, pos) - if m: - return m.group(0), type_from_index[m.lastindex] + return self.scanner.match(text, pos) def lex(self, state, parser_state): with suppress(EOFError): @@ -378,7 +390,7 @@ class TraditionalLexer(Lexer): while line_ctr.char_pos < len(lex_state.text): res = self.match(lex_state.text, line_ctr.char_pos) if not res: - allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types + allowed = self.scanner.allowed_types - self.ignore_types if not allowed: allowed = {""} raise 
UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, From e5991739ee5a1d5bd6f78b84a495a2d7e17ce406 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 6 Jun 2021 18:23:46 +0300 Subject: [PATCH 03/13] lexer.py: Small refactor --- lark/lexer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 7a30d6d..591943b 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -235,6 +235,11 @@ class CallChain: return self.callback2(t) if self.cond(t2) else t2 +def _get_match(re_, regexp, s, flags): + m = re_.match(regexp, s, flags) + if m: + return m.group(0) + def _create_unless(terminals, g_regex_flags, re_, use_bytes): tokens_by_type = classify(terminals, lambda t: type(t.pattern)) assert len(tokens_by_type) <= 2, tokens_by_type.keys() @@ -246,8 +251,7 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if strtok.priority > retok.priority: continue s = strtok.pattern.value - m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags) - if m and m.group(0) == s: + if s == _get_match(re_, retok.pattern.to_regexp(), s, g_regex_flags): unless.append(strtok) if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) From da3a993d025d7f3463c5564ba6fed2c0f1146adf Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 6 Jun 2021 18:32:59 +0300 Subject: [PATCH 04/13] lexer.py: Small simplification --- lark/lexer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 591943b..2925c35 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -271,6 +271,8 @@ class Scanner: self.use_bytes = use_bytes self.match_whole = match_whole + self.allowed_types = {t.name for t in self.terminals} + self._mres = self._build_mres(terminals, len(terminals)) def _build_mres(self, terminals, max_size): @@ -298,10 +300,6 @@ class Scanner: if m: return m.group(0), type_from_index[m.lastindex] - @property - def allowed_types(self): - return {v for m, tfi in self._mres for v in tfi.values()} - def _regexp_has_newline(r): r"""Expressions that may indicate newlines in a regexp: From 3bc070bc1dcbaa91a04f178b985c5250bafc492c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 15 Jun 2021 17:04:38 +0300 Subject: [PATCH 05/13] Change how propagate_positions work --- lark-stubs/lark.pyi | 4 ++-- lark/lark.py | 4 ++-- lark/lexer.py | 28 +++++++++++++------------- lark/parse_tree_builder.py | 40 +++++++++++++++----------------------- lark/parser_frontends.py | 16 +++++++-------- 5 files changed, 42 insertions(+), 50 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 27c6863..18748d1 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -33,7 +33,7 @@ class LarkOptions: regex: bool debug: bool keep_all_tokens: bool - propagate_positions: Union[bool, str] + propagate_positions: Union[bool, Callable] maybe_placeholders: bool lexer_callbacks: Dict[str, Callable[[Token], Token]] cache: Union[bool, str] @@ -77,7 +77,7 @@ class Lark: regex: bool = False, debug: bool = False, keep_all_tokens: bool = False, - propagate_positions: Union[bool, str] = False, + propagate_positions: Union[bool, Callable] = False, maybe_placeholders: bool = False, lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None, cache: Union[bool, str] = False, diff --git a/lark/lark.py b/lark/lark.py index 8e879cc..9863243 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -44,7 +44,7 @@ class LarkOptions(Serialize): Applies the transformer to every parse tree (equivalent to applying 
it after the parse, but faster) propagate_positions Propagates (line, column, end_line, end_column) attributes into all tree branches. - Accepts ``False``, ``True``, or "ignore_ws", which will trim the whitespace around your trees. + Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating. maybe_placeholders When ``True``, the ``[]`` operator returns ``None`` when not matched. @@ -162,7 +162,7 @@ class LarkOptions(Serialize): assert_config(self.parser, ('earley', 'lalr', 'cyk', None)) if self.parser == 'earley' and self.transformer: - raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm.' + raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. ' 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)') if o: diff --git a/lark/lexer.py b/lark/lexer.py index 2925c35..7c2f979 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -133,20 +133,20 @@ class Token(Str): def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None, pos_in_stream=None): try: - self = super(Token, cls).__new__(cls, value) + inst = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: value = value.decode('latin1') - self = super(Token, cls).__new__(cls, value) - - self.type = type_ - self.start_pos = start_pos if start_pos is not None else pos_in_stream - self.value = value - self.line = line - self.column = column - self.end_line = end_line - self.end_column = end_column - self.end_pos = end_pos - return self + inst = super(Token, cls).__new__(cls, value) + + inst.type = type_ + inst.start_pos = start_pos if start_pos is not None else pos_in_stream + inst.value = value + inst.line = line + inst.column = column + inst.end_line = end_line + inst.end_column = end_column + inst.end_pos = end_pos + return inst @property def pos_in_stream(self): @@ -258,8 +258,8 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if unless: callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) - terminals = [t for t in terminals if t not in embedded_strs] - return terminals, callback + new_terminals = [t for t in terminals if t not in embedded_strs] + return new_terminals, callback diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 7a854bc..b4929c6 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -23,8 +23,9 @@ class ExpandSingleChild: class PropagatePositions: - def __init__(self, node_builder): + def __init__(self, node_builder, node_filter=None): self.node_builder = node_builder + self.node_filter = node_filter def __call__(self, children): res = self.node_builder(children) @@ -33,44 +34,35 @@ class PropagatePositions: if isinstance(res, Tree): res_meta = res.meta - src_meta = self._pp_get_meta(children) - if src_meta is not None: - res_meta.line = src_meta.line - res_meta.column = src_meta.column - res_meta.start_pos = src_meta.start_pos + first_meta = self._pp_get_meta(children) + if first_meta is not None: + res_meta.line = first_meta.line + res_meta.column = first_meta.column + res_meta.start_pos = first_meta.start_pos res_meta.empty = False - src_meta = self._pp_get_meta(reversed(children)) - if src_meta is not None: - res_meta.end_line = src_meta.end_line - res_meta.end_column = src_meta.end_column - res_meta.end_pos = src_meta.end_pos + last_meta = 
self._pp_get_meta(reversed(children)) + if last_meta is not None: + res_meta.end_line = last_meta.end_line + res_meta.end_column = last_meta.end_column + res_meta.end_pos = last_meta.end_pos res_meta.empty = False return res def _pp_get_meta(self, children): for c in children: + if self.node_filter is not None and not self.node_filter(c): + continue if isinstance(c, Tree): if not c.meta.empty: return c.meta elif isinstance(c, Token): return c -class PropagatePositions_IgnoreWs(PropagatePositions): - def _pp_get_meta(self, children): - for c in children: - if isinstance(c, Tree): - if not c.meta.empty: - return c.meta - elif isinstance(c, Token): - if c and not c.isspace(): # Disregard whitespace-only tokens - return c - - def make_propagate_positions(option): - if option == "ignore_ws": - return PropagatePositions_IgnoreWs + if callable(option): + return partial(PropagatePositions, node_filter=option) elif option is True: return PropagatePositions elif option is False: diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index e066d9a..1818ca7 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -92,26 +92,26 @@ class ParsingFrontend(Serialize): def _verify_start(self, start=None): if start is None: - start = self.parser_conf.start - if len(start) > 1: - raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start) - start ,= start + start_decls = self.parser_conf.start + if len(start_decls) > 1: + raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls) + start ,= start_decls elif start not in self.parser_conf.start: raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start)) return start def parse(self, text, start=None, on_error=None): - start = self._verify_start(start) + chosen_start = self._verify_start(start) stream = text if self.skip_lexer else LexerThread(self.lexer, text) kw = {} if on_error is None else {'on_error': on_error} - return self.parser.parse(stream, start, **kw) + return self.parser.parse(stream, chosen_start, **kw) def parse_interactive(self, text=None, start=None): - start = self._verify_start(start) + chosen_start = self._verify_start(start) if self.parser_conf.parser_type != 'lalr': raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ") stream = text if self.skip_lexer else LexerThread(self.lexer, text) - return self.parser.parse_interactive(stream, start) + return self.parser.parse_interactive(stream, chosen_start) def get_frontend(parser, lexer): From 24f653080f1118471934dba1d2ebc133c992305b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 16 Jun 2021 10:48:56 +0300 Subject: [PATCH 06/13] More minor refactorings --- lark/exceptions.py | 15 +++++++++------ lark/lark.py | 12 ++++++------ lark/parsers/lalr_parser.py | 4 ++-- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 26ffce3..9d326b8 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -129,6 +129,8 @@ class UnexpectedInput(LarkError): class UnexpectedEOF(ParseError, UnexpectedInput): def __init__(self, expected, state=None, terminals_by_name=None): + super(UnexpectedEOF, self).__init__() + self.expected = expected self.state = state from .lexer import Token @@ -138,7 +140,6 @@ class UnexpectedEOF(ParseError, UnexpectedInput): self.column = -1 self._terminals_by_name = terminals_by_name - 
super(UnexpectedEOF, self).__init__() def __str__(self): message = "Unexpected end-of-input. " @@ -149,6 +150,8 @@ class UnexpectedEOF(ParseError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, terminals_by_name=None, considered_rules=None): + super(UnexpectedCharacters, self).__init__() + # TODO considered_tokens and allowed can be figured out using state self.line = line self.column = column @@ -167,7 +170,6 @@ class UnexpectedCharacters(LexError, UnexpectedInput): self.char = seq[lex_pos] self._context = self.get_context(seq) - super(UnexpectedCharacters, self).__init__() def __str__(self): message = "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column) @@ -190,6 +192,8 @@ class UnexpectedToken(ParseError, UnexpectedInput): """ def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): + super(UnexpectedToken, self).__init__() + # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') @@ -204,7 +208,6 @@ class UnexpectedToken(ParseError, UnexpectedInput): self._terminals_by_name = terminals_by_name self.token_history = token_history - super(UnexpectedToken, self).__init__() @property def accepts(self): @@ -236,10 +239,10 @@ class VisitError(LarkError): """ def __init__(self, rule, obj, orig_exc): - self.obj = obj - self.orig_exc = orig_exc - message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) + self.obj = obj + self.orig_exc = orig_exc + ###} diff --git a/lark/lark.py b/lark/lark.py index 9863243..9a4b2d5 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -451,11 +451,11 @@ class Lark(Serialize): d = f else: d = pickle.load(f) - memo = d['memo'] + memo_json = d['memo'] data = d['data'] - assert memo - memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) + assert memo_json + memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) options = dict(data['options']) if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults): raise ConfigurationError("Some options are not allowed when loading a Parser: {}" @@ -512,11 +512,11 @@ class Lark(Serialize): Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...) 
""" - package = FromPackageLoader(package, search_paths) - full_path, text = package(None, grammar_path) + package_loader = FromPackageLoader(package, search_paths) + full_path, text = package_loader(None, grammar_path) options.setdefault('source_path', full_path) options.setdefault('import_paths', []) - options['import_paths'].append(package) + options['import_paths'].append(package_loader) return cls(text, **options) def __repr__(self): diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index fe40791..d916b46 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -178,8 +178,8 @@ class _Parser(object): for token in state.lexer.lex(state): state.feed_token(token) - token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) - return state.feed_token(token, True) + end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) + return state.feed_token(end_token, True) except UnexpectedInput as e: try: e.interactive_parser = InteractiveParser(self, state, state.lexer) From a13cfcef55f6460b9b8897e9c313b9bcb4c80b33 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 5 Jul 2021 12:08:38 +0300 Subject: [PATCH 07/13] Bugfix in propagate_positions: Corrected to account for 'container nodes' --- lark/parse_tree_builder.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index b4929c6..39d3510 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -30,23 +30,36 @@ class PropagatePositions: def __call__(self, children): res = self.node_builder(children) - # local reference to Tree.meta reduces number of presence checks if isinstance(res, Tree): + # Calculate positions while the tree is streaming, according to the rule: + # - nodes start at the start of their first child's container, + # and end at the end of their last child's container. + # Containers are nodes that take up space in text, but have been inlined in the tree. + res_meta = res.meta first_meta = self._pp_get_meta(children) if first_meta is not None: - res_meta.line = first_meta.line - res_meta.column = first_meta.column - res_meta.start_pos = first_meta.start_pos - res_meta.empty = False + # meta was already set, probably because the rule has been inlined (e.g. 
`?rule`) + if not hasattr(res_meta, 'line'): + res_meta.line = getattr(first_meta, 'container_line', first_meta.line) + res_meta.column = getattr(first_meta, 'container_column', first_meta.column) + res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos) + res_meta.empty = False + + res_meta.container_line = getattr(first_meta, 'container_line', first_meta.line) + res_meta.container_column = getattr(first_meta, 'container_column', first_meta.column) last_meta = self._pp_get_meta(reversed(children)) if last_meta is not None: - res_meta.end_line = last_meta.end_line - res_meta.end_column = last_meta.end_column - res_meta.end_pos = last_meta.end_pos - res_meta.empty = False + if not hasattr(res_meta, 'end_line'): + res_meta.end_line = getattr(last_meta, 'container_end_line', last_meta.end_line) + res_meta.end_column = getattr(last_meta, 'container_end_column', last_meta.end_column) + res_meta.end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos) + res_meta.empty = False + + res_meta.container_end_line = getattr(last_meta, 'container_end_line', last_meta.end_line) + res_meta.container_end_column = getattr(last_meta, 'container_end_column', last_meta.end_column) return res From d7d02e930899048a18b094d798080e59c5b9af9b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 5 Jul 2021 12:11:03 +0300 Subject: [PATCH 08/13] Tiny comment fix --- lark/parse_tree_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 39d3510..286038e 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -40,8 +40,8 @@ class PropagatePositions: first_meta = self._pp_get_meta(children) if first_meta is not None: - # meta was already set, probably because the rule has been inlined (e.g. `?rule`) if not hasattr(res_meta, 'line'): + # meta was already set, probably because the rule has been inlined (e.g. 
`?rule`) res_meta.line = getattr(first_meta, 'container_line', first_meta.line) res_meta.column = getattr(first_meta, 'container_column', first_meta.column) res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos) From c953dd9505dbba1bd8fbded0077a040a1ce0e5b5 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 5 Jul 2021 13:15:29 +0300 Subject: [PATCH 09/13] Tests: Added a test case demonstrating the need for calculating containers --- tests/test_parser.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_parser.py b/tests/test_parser.py index ff4e064..40ed131 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -94,6 +94,26 @@ class TestParsers(unittest.TestCase): r = g.parse('a') self.assertEqual( r.children[0].meta.line, 1 ) + def test_propagate_positions2(self): + g = Lark("""start: a + a: b + ?b: "(" t ")" + !t: "t" + """, propagate_positions=True) + + start = g.parse("(t)") + a ,= start.children + t ,= a.children + assert t.children[0] == "t" + + assert t.meta.column == 2 + assert t.meta.end_column == 3 + + assert start.column == a.column == 1 + assert start.end_column == a.end_column == 4 + + + def test_expand1(self): g = Lark("""start: a From f14ff6d4d14b500410b8d0d5e14fd2908be95dd9 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 5 Jul 2021 14:33:28 +0300 Subject: [PATCH 10/13] Fixed tests to use meta (Tree.column is deprecated) --- tests/test_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index 40ed131..8fec82d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -109,8 +109,8 @@ class TestParsers(unittest.TestCase): assert t.meta.column == 2 assert t.meta.end_column == 3 - assert start.column == a.column == 1 - assert start.end_column == a.end_column == 4 + assert start.meta.column == a.meta.column == 1 + assert start.meta.end_column == a.meta.end_column == 4 From 688c581949b94eccd7ba30baa092a3e4189af008 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Tue, 13 Jul 2021 16:12:09 +0200 Subject: [PATCH 11/13] Updated a few links I believe that the changed link from `examples` to `/examples` isn't a problem on readthedocs, but we should check. If it works, this PR fixes #941 . --- docs/json_tutorial.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/json_tutorial.md b/docs/json_tutorial.md index 65c6c78..668d9de 100644 --- a/docs/json_tutorial.md +++ b/docs/json_tutorial.md @@ -427,9 +427,9 @@ I measured memory consumption using a little script called [memusg](https://gist | Lark - Earley *(with lexer)* | 42s | 4s | 1167M | 608M | | Lark - LALR(1) | 8s | 1.53s | 453M | 266M | | Lark - LALR(1) tree-less | 4.76s | 1.23s | 70M | 134M | -| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 3.53s | 443M | 225M | -| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 8.5s | 1.3s | 483M | 293M | -| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 5.7s | ? | 1545M | +| PyParsing ([Parser](https://github.com/pyparsing/pyparsing/blob/master/examples/jsonParser.py)) | 32s | 3.53s | 443M | 225M | +| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/tests/json.py)) | 8.5s | 1.3s | 483M | 293M | +| Parsimonious ([Parser](https://gist.github.com/reclosedev/5222560)) | ? | 5.7s | ? 
| 1545M | I added a few other parsers for comparison. PyParsing and funcparselib fair pretty well in their memory usage (they don't build a tree), but they can't compete with the run-time speed of LALR(1). @@ -442,7 +442,7 @@ Once again, shout-out to PyPy for being so effective. This is the end of the tutorial. I hoped you liked it and learned a little about Lark. -To see what else you can do with Lark, check out the [examples](examples). +To see what else you can do with Lark, check out the [examples](/examples). For questions or any other subject, feel free to email me at erezshin at gmail dot com. From 7cb8acbe54eb108b6e99859adfd41717df43e032 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 9 Jul 2021 22:44:31 +0300 Subject: [PATCH 12/13] Bugfix for deepcopy + small unrelated refactor (issue #938) --- lark/common.py | 12 ++++++++++++ lark/utils.py | 14 +++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/lark/common.py b/lark/common.py index 467acf8..cb408d9 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,4 +1,5 @@ from warnings import warn +from copy import deepcopy from .utils import Serialize from .lexer import TerminalDef @@ -31,6 +32,17 @@ class LexerConf(Serialize): def _deserialize(self): self.terminals_by_name = {t.name: t for t in self.terminals} + def __deepcopy__(self, memo=None): + return type(self)( + deepcopy(self.terminals, memo), + self.re_module, + deepcopy(self.ignore, memo), + deepcopy(self.postlex, memo), + deepcopy(self.callbacks, memo), + deepcopy(self.g_regex_flags, memo), + deepcopy(self.skip_validation, memo), + deepcopy(self.use_bytes, memo), + ) class ParserConf(Serialize): diff --git a/lark/utils.py b/lark/utils.py index b9d7ac3..ea78801 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -73,14 +73,13 @@ class Serialize(object): fields = getattr(self, '__serialize_fields__') res = {f: _serialize(getattr(self, f), memo) for f in fields} res['__type__'] = type(self).__name__ - postprocess = getattr(self, '_serialize', None) - if postprocess: - postprocess(res, memo) + if hasattr(self, '_serialize'): + self._serialize(res, memo) return res @classmethod def deserialize(cls, data, memo): - namespace = getattr(cls, '__serialize_namespace__', {}) + namespace = getattr(cls, '__serialize_namespace__', []) namespace = {c.__name__:c for c in namespace} fields = getattr(cls, '__serialize_fields__') @@ -94,9 +93,10 @@ class Serialize(object): setattr(inst, f, _deserialize(data[f], namespace, memo)) except KeyError as e: raise KeyError("Cannot find key for class", cls, e) - postprocess = getattr(inst, '_deserialize', None) - if postprocess: - postprocess() + + if hasattr(inst, '_deserialize'): + inst._deserialize() + return inst From 87a18a098e306dbe0f4258732ad8944832dc4a39 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 15 Jul 2021 17:00:15 +0300 Subject: [PATCH 13/13] Tiny fix: MakeParsingFrontend is a regular method, not a classmethod --- lark/parser_frontends.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 1818ca7..0e53dd5 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -39,8 +39,7 @@ class MakeParsingFrontend: lexer_conf.lexer_type = self.lexer_type return ParsingFrontend(lexer_conf, parser_conf, options) - @classmethod - def deserialize(cls, data, memo, lexer_conf, callbacks, options): + def deserialize(self, data, memo, lexer_conf, callbacks, options): parser_conf = ParserConf.deserialize(data['parser_conf'], memo) parser = 
LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug) parser_conf.callbacks = callbacks
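A minimal usage sketch of the callable form of ``propagate_positions`` introduced in PATCH 05 (the grammar and the filter below are illustrative assumptions, not taken from the patches): the callable is applied to each child node, a ``Tree`` or ``Token``, and returning False excludes that node from contributing position info to its parent.

from lark import Lark, Token

# Roughly replicates the removed "ignore_ws" mode: whitespace-only tokens are
# skipped when computing a parent node's line/column/end_column.
def not_whitespace(node):
    return not (isinstance(node, Token) and node.isspace())

parser = Lark(r"""
    start: WORD (" " WORD)*
    WORD: /\w+/
""", propagate_positions=not_whitespace)

tree = parser.parse("hello world")
print(tree.meta.line, tree.meta.column, tree.meta.end_column)  # e.g. 1 1 12

Compared with the old string flag, the callable keeps the same "first/last informative child" propagation rule (refined for container nodes in PATCH 07) while letting the caller decide which children count.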