From bdcd2e0011bc0cd4fa3c35f59f28c78a1fa61a78 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Tue, 29 Jun 2021 22:32:56 +0200 Subject: [PATCH 01/48] fix tree_matcher when keep_all_tokens=True by setting sym.filter_out correctly. --- lark/load_grammar.py | 5 ++++- tests/test_reconstructor.py | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index dcb4c81..c7b98a7 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -630,7 +630,10 @@ class Grammar: else: exp_options = options - assert all(isinstance(x, Symbol) for x in expansion), expansion + for sym in expansion: + assert isinstance(sym, Symbol) + if sym.is_term and exp_options and exp_options.keep_all_tokens: + sym.filter_out = False rule = Rule(NonTerminal(name), expansion, i, alias, exp_options) compiled_rules.append(rule) diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py index f132312..e2f2dbe 100644 --- a/tests/test_reconstructor.py +++ b/tests/test_reconstructor.py @@ -3,6 +3,7 @@ import json import sys import unittest +from itertools import product from unittest import TestCase from lark import Lark @@ -20,8 +21,8 @@ def _remove_ws(s): class TestReconstructor(TestCase): - def assert_reconstruct(self, grammar, code): - parser = Lark(grammar, parser='lalr', maybe_placeholders=False) + def assert_reconstruct(self, grammar, code, **options): + parser = Lark(grammar, parser='lalr', maybe_placeholders=False, **options) tree = parser.parse(code) new = Reconstructor(parser).reconstruct(tree) self.assertEqual(_remove_ws(code), _remove_ws(new)) @@ -142,6 +143,17 @@ class TestReconstructor(TestCase): new_json = Reconstructor(json_parser).reconstruct(tree) self.assertEqual(json.loads(new_json), json.loads(test_json)) + def test_keep_all_tokens(self): + g = """ + start: "a"? _B? c? _d? 
+ _B: "b" + c: "c" + _d: "d" + """ + examples = list(map(''.join, product(('', 'a'), ('', 'b'), ('', 'c'), ('', 'd'), ))) + for code in examples: + self.assert_reconstruct(g, code, keep_all_tokens=True) + @unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.") def test_switch_grammar_unicode_terminal(self): """ From b37519b7c882d3fbfbf44822d8f3e72898a2c2c3 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 9 Jul 2021 22:44:31 +0300 Subject: [PATCH 02/48] Bugfix for deepcopy + small unrelated refactor (issue #938) --- lark/common.py | 12 ++++++++++++ lark/utils.py | 14 +++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/lark/common.py b/lark/common.py index 467acf8..cb408d9 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,4 +1,5 @@ from warnings import warn +from copy import deepcopy from .utils import Serialize from .lexer import TerminalDef @@ -31,6 +32,17 @@ class LexerConf(Serialize): def _deserialize(self): self.terminals_by_name = {t.name: t for t in self.terminals} + def __deepcopy__(self, memo=None): + return type(self)( + deepcopy(self.terminals, memo), + self.re_module, + deepcopy(self.ignore, memo), + deepcopy(self.postlex, memo), + deepcopy(self.callbacks, memo), + deepcopy(self.g_regex_flags, memo), + deepcopy(self.skip_validation, memo), + deepcopy(self.use_bytes, memo), + ) class ParserConf(Serialize): diff --git a/lark/utils.py b/lark/utils.py index b9d7ac3..ea78801 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -73,14 +73,13 @@ class Serialize(object): fields = getattr(self, '__serialize_fields__') res = {f: _serialize(getattr(self, f), memo) for f in fields} res['__type__'] = type(self).__name__ - postprocess = getattr(self, '_serialize', None) - if postprocess: - postprocess(res, memo) + if hasattr(self, '_serialize'): + self._serialize(res, memo) return res @classmethod def deserialize(cls, data, memo): - namespace = getattr(cls, '__serialize_namespace__', {}) + namespace = getattr(cls, '__serialize_namespace__', []) namespace = {c.__name__:c for c in namespace} fields = getattr(cls, '__serialize_fields__') @@ -94,9 +93,10 @@ class Serialize(object): setattr(inst, f, _deserialize(data[f], namespace, memo)) except KeyError as e: raise KeyError("Cannot find key for class", cls, e) - postprocess = getattr(inst, '_deserialize', None) - if postprocess: - postprocess() + + if hasattr(inst, '_deserialize'): + inst._deserialize() + return inst From 5e5bd187a6fed1d94ff253dbd4f7d908e1d72476 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 22 Jul 2021 11:21:13 +0300 Subject: [PATCH 03/48] Docs: Improved documentation of exceptions --- docs/classes.rst | 2 ++ docs/visitors.rst | 5 +++++ lark/ast_utils.py | 4 ++-- lark/exceptions.py | 31 +++++++++++++++++++++++++------ lark/lark.py | 8 +++++++- 5 files changed, 41 insertions(+), 9 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index 7b18460..1287896 100644 --- a/docs/classes.rst +++ b/docs/classes.rst @@ -66,6 +66,8 @@ UnexpectedInput .. autoclass:: lark.exceptions.UnexpectedCharacters +.. autoclass:: lark.exceptions.UnexpectedEOF + InteractiveParser ----------------- diff --git a/docs/visitors.rst b/docs/visitors.rst index a0e1711..f263712 100644 --- a/docs/visitors.rst +++ b/docs/visitors.rst @@ -107,3 +107,8 @@ Discard ------- .. autoclass:: lark.visitors.Discard + +VisitError +------- + +.. 
autoclass:: lark.exceptions.VisitError
\ No newline at end of file
diff --git a/lark/ast_utils.py b/lark/ast_utils.py
index 0f2e498..b5463a2 100644
--- a/lark/ast_utils.py
+++ b/lark/ast_utils.py
@@ -36,8 +36,8 @@ def create_transformer(ast_module, transformer=None):
     Classes starting with an underscore (`_`) will be skipped.
 
     Parameters:
-        ast_module - A Python module containing all the subclasses of `ast_utils.Ast`
-        transformer (Optional[Transformer]) - An initial transformer. Its attributes may be overwritten.
+        ast_module: A Python module containing all the subclasses of ``ast_utils.Ast``
+        transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten.
     """
     t = transformer or Transformer()
 
diff --git a/lark/exceptions.py b/lark/exceptions.py
index 9d326b8..fdcd52b 100644
--- a/lark/exceptions.py
+++ b/lark/exceptions.py
@@ -36,8 +36,9 @@ class UnexpectedInput(LarkError):
     Used as a base class for the following exceptions:
 
-    - ``UnexpectedToken``: The parser received an unexpected token
     - ``UnexpectedCharacters``: The lexer encountered an unexpected string
+    - ``UnexpectedToken``: The parser received an unexpected token
+    - ``UnexpectedEOF``: The parser expected a token, but the input ended
 
     After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
     """
@@ -128,6 +129,9 @@ class UnexpectedInput(LarkError):
 
 
 class UnexpectedEOF(ParseError, UnexpectedInput):
+    """An exception that is raised by the parser, when the input ends while it still expects a token.
+    """
+
     def __init__(self, expected, state=None, terminals_by_name=None):
         super(UnexpectedEOF, self).__init__()
 
@@ -148,6 +152,10 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
 
 
 class UnexpectedCharacters(LexError, UnexpectedInput):
+    """An exception that is raised by the lexer, when it cannot match the next
+    string of characters to any of its terminals.
+    """
+
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
                  terminals_by_name=None, considered_rules=None):
         super(UnexpectedCharacters, self).__init__()
@@ -185,10 +193,15 @@ class UnexpectedToken(ParseError, UnexpectedInput):
     """An exception that is raised by the parser, when the token it received
     doesn't match any valid step forward.
 
-    The parser provides an interactive instance through `interactive_parser`,
-    which is initialized to the point of failture, and can be used for debugging and error handling.
+    Parameters:
+        token: The mismatched token
+        expected: The set of expected tokens
+        considered_rules: Which rules were considered, to deduce the expected tokens
+        state: A value representing the parser state. Do not rely on its value or type.
+        interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failure,
+            and can be used for debugging and error handling.
 
-    see: ``InteractiveParser``.
+    Note: These parameters are available as attributes of the instance.
""" def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): @@ -234,14 +247,20 @@ class VisitError(LarkError): """VisitError is raised when visitors are interrupted by an exception It provides the following attributes for inspection: - - obj: the tree node or token it was processing when the exception was raised - - orig_exc: the exception that cause it to fail + + Parameters: + rule: the name of the visit rule that failed + obj: the tree-node or token that was being processed + orig_exc: the exception that cause it to fail + + Note: These parameters are available as attributes """ def __init__(self, rule, obj, orig_exc): message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) + self.rule = rule self.obj = obj self.orig_exc = orig_exc diff --git a/lark/lark.py b/lark/lark.py index 9a4b2d5..45dec4d 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -102,7 +102,7 @@ class LarkOptions(Serialize): A List of either paths or loader functions to specify from where grammars are imported source_path Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading - **=== End Options ===** + **=== End of Options ===** """ if __doc__: __doc__ += OPTIONS_DOC @@ -527,6 +527,8 @@ class Lark(Serialize): """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. + + :raises UnexpectedCharacters: In case the lexer cannot find a suitable match. """ if not hasattr(self, 'lexer') or dont_ignore: lexer = self._build_lexer(dont_ignore) @@ -569,6 +571,10 @@ class Lark(Serialize): If a transformer is supplied to ``__init__``, returns whatever is the result of the transformation. Otherwise, returns a Tree instance. + :raises UnexpectedInput: On a parse error, one of these sub-exceptions will rise: + ``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``. + For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``. 
+ """ return self.parser.parse(text, start=start, on_error=on_error) From 55642be13c1a5ac36a999124ae3c875492d574d1 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 25 Jul 2021 17:13:23 +0300 Subject: [PATCH 04/48] Tiny adjustments --- examples/standalone/json_parser_main.py | 4 +++- lark/parsers/lalr_interactive_parser.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/standalone/json_parser_main.py b/examples/standalone/json_parser_main.py index 503b249..3d9b5a6 100644 --- a/examples/standalone/json_parser_main.py +++ b/examples/standalone/json_parser_main.py @@ -10,7 +10,9 @@ Standalone Parser import sys -from json_parser import Lark_StandAlone, Transformer, inline_args +from json_parser import Lark_StandAlone, Transformer, v_args + +inline_args = v_args(inline=True) class TreeToJson(Transformer): @inline_args diff --git a/lark/parsers/lalr_interactive_parser.py b/lark/parsers/lalr_interactive_parser.py index ce596b5..d6780cb 100644 --- a/lark/parsers/lalr_interactive_parser.py +++ b/lark/parsers/lalr_interactive_parser.py @@ -65,7 +65,7 @@ class InteractiveParser(object): """Print the output of ``choices()`` in a way that's easier to read.""" out = ["Parser choices:"] for k, v in self.choices().items(): - out.append('\t- %s -> %s' % (k, v)) + out.append('\t- %s -> %r' % (k, v)) out.append('stack size: %s' % len(self.parser_state.state_stack)) return '\n'.join(out) From b0a9afb287eaaeb139140d088cccbd6167f92aa1 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Sun, 25 Jul 2021 23:07:08 +0200 Subject: [PATCH 05/48] Split up repeats from tilde into different rules. --- lark/load_grammar.py | 24 ++++++++++++++++++++++-- lark/utils.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index dbf4a1f..569e67d 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -9,7 +9,7 @@ import pkgutil from ast import literal_eval from numbers import Integral -from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique +from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -196,6 +196,26 @@ class EBNF_to_BNF(Transformer_InPlace): self.rules_by_expr[expr] = t return t + def _add_repeat_rule(self, a, b, target, atom): + if (a, b, target, atom) in self.rules_by_expr: + return self.rules_by_expr[(a, b, target, atom)] + new_name = '__%s_a%d_b%d_%d' % (self.prefix, a, b, self.i) + self.i += 1 + t = NonTerminal(new_name) + tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) + self.new_rules.append((new_name, tree, self.rule_options)) + self.rules_by_expr[(a, b, target, atom)] = t + return t + + def _generate_repeats(self, rule, mn, mx): + factors = small_factors(mn) + target = rule + for a, b in factors: + target = self._add_repeat_rule(a, b, target, rule) + + # return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + return ST('expansions', [ST('expansion', [target] + [rule] * n) for n in range(0, mx - mn + 1)]) + def expr(self, rule, op, *args): if op.value == '?': empty = ST('expansion', []) @@ -220,7 +240,7 @@ class EBNF_to_BNF(Transformer_InPlace): mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - return ST('expansions', [ST('expansion', [rule] * n) for n in 
range(mn, mx+1)]) + return self._generate_repeats(rule, mn, mx) assert False, op def maybe(self, rule): diff --git a/lark/utils.py b/lark/utils.py index ea78801..a3a077f 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -359,3 +359,33 @@ def _serialize(value, memo): return {key:_serialize(elem, memo) for key, elem in value.items()} # assert value is None or isinstance(value, (int, float, str, tuple)), value return value + + +def small_factors(n): + """ + Splits n up into smaller factors and summands <= 10. + Returns a list of [(a, b), ...] + so that the following code returns n: + + n = 1 + for a, b in values: + n = n * a + b + + Currently, we also keep a + b <= 10, but that might change + """ + assert n > 0 + if n < 10: + return [(n, 0)] + # TODO: Think of better algorithms (Prime factors should minimize the number of steps) + for a in range(10, 1, -1): + b = n % a + if a + b > 10: + continue + r = n // a + assert r * a + b == n # Sanity check + if r <= 10: + return [(r, 0), (a, b)] + else: + return [*small_factors(r), (a, b)] + # This should be unreachable, since 2 + 1 <= 10 + assert False, "Failed to factorize %s" % n From 845b6fa477827d6ee77a21eaced1c3f3a4a8d8b0 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 01:14:46 +0200 Subject: [PATCH 06/48] Refactor + tests + additional splitting up. --- lark/load_grammar.py | 100 ++++++++++++++++++++++++++++++++----------- lark/utils.py | 2 +- tests/test_parser.py | 29 +++++++++++++ 3 files changed, 105 insertions(+), 26 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 569e67d..2f51ff6 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -179,42 +179,87 @@ RULES = { class EBNF_to_BNF(Transformer_InPlace): def __init__(self): self.new_rules = [] - self.rules_by_expr = {} + self.rules_cache = {} self.prefix = 'anon' self.i = 0 self.rule_options = None - def _add_recurse_rule(self, type_, expr): - if expr in self.rules_by_expr: - return self.rules_by_expr[expr] - - new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) + def _name_rule(self, inner): + new_name = '__%s_%s_%d' % (self.prefix, inner, self.i) self.i += 1 - t = NonTerminal(new_name) - tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) - self.new_rules.append((new_name, tree, self.rule_options)) - self.rules_by_expr[expr] = t + return new_name + + def _add_rule(self, key, name, expansions): + t = NonTerminal(name) + self.new_rules.append((name, expansions, self.rule_options)) + self.rules_cache[key] = t return t + def _add_recurse_rule(self, type_, expr): + try: + return self.rules_cache[expr] + except KeyError: + new_name = self._name_rule(type_) + t = NonTerminal(new_name) + tree = ST('expansions', [ + ST('expansion', [expr]), + ST('expansion', [t, expr]) + ]) + return self._add_rule(expr, new_name, tree) + def _add_repeat_rule(self, a, b, target, atom): - if (a, b, target, atom) in self.rules_by_expr: - return self.rules_by_expr[(a, b, target, atom)] - new_name = '__%s_a%d_b%d_%d' % (self.prefix, a, b, self.i) - self.i += 1 - t = NonTerminal(new_name) - tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) - self.new_rules.append((new_name, tree, self.rule_options)) - self.rules_by_expr[(a, b, target, atom)] = t - return t + """ + When target matches n times atom + This builds a rule that matches atom (a*n + b) times + """ + key = (a, b, target, atom) + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('a%d_b%d' % (a, b)) + tree = ST('expansions', 
[ST('expansion', [target] * a + [atom] * b)]) + return self._add_rule(key, new_name, tree) + + def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): + """ + When target matches n times atom, and target_opt 0 to n-1 times target_opt, + This builds a rule that matches atom 0 to (a*n+b)-1 times + """ + key = (a, b, target, atom, "opt") + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('a%d_b%d_opt' % (a, b)) + tree = ST('expansions', [ + ST('expansion', [target] * i + [target_opt]) + for i in range(a) + ] + [ + ST('expansion', [target] * a + [atom] * i) + for i in range(1, b) + ]) + return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): - factors = small_factors(mn) - target = rule - for a, b in factors: - target = self._add_repeat_rule(a, b, target, rule) + mn_factors = small_factors(mn) + mn_target = rule + for a, b in mn_factors: + mn_target = self._add_repeat_rule(a, b, mn_target, rule) + if mx == mn: + return mn_target + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule needs it. + diff_factors = small_factors(diff) + diff_target = rule + diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 1-1 times) + for a, b in diff_factors[:-1]: + new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) + diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) + diff_target = new_diff_target + a, b = diff_factors[-1] + diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) # return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) - return ST('expansions', [ST('expansion', [target] + [rule] * n) for n in range(0, mx - mn + 1)]) + # return ST('expansions', [ST('expansion', [mn_target] + [rule] * n) for n in range(0, mx - mn + 1)]) + return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) def expr(self, rule, op, *args): if op.value == '?': @@ -240,7 +285,12 @@ class EBNF_to_BNF(Transformer_InPlace): mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - return self._generate_repeats(rule, mn, mx) + # For small number of repeats, we don't need to build new rules. 
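+            # (e.g. "a"~2..3 expands in place to: a a | a a a)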
+ # Value 20 is arbitrarily chosen + if mx > 20: + return self._generate_repeats(rule, mn, mx) + else: + return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) assert False, op def maybe(self, rule): diff --git a/lark/utils.py b/lark/utils.py index a3a077f..2fa5f43 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -373,7 +373,7 @@ def small_factors(n): Currently, we also keep a + b <= 10, but that might change """ - assert n > 0 + assert n >= 0 if n < 10: return [(n, 0)] # TODO: Think of better algorithms (Prime factors should minimize the number of steps) diff --git a/tests/test_parser.py b/tests/test_parser.py index 8fec82d..6c00fbb 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2226,6 +2226,35 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') + @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") + def test_ranged_repeat_large(self): + # Large is currently arbitrarily chosen to be large than 20 + g = u"""!start: "A"~30 + """ + l = _Lark(g) + self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") + self.assertEqual(l.parse(u'A'*30), Tree('start', ["A"]*30)) + self.assertRaises(ParseError, l.parse, u'A'*29) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A'*31) + + + g = u"""!start: "A"~0..100 + """ + l = _Lark(g) + self.assertEqual(l.parse(u''), Tree('start', [])) + self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) + self.assertEqual(l.parse(u'A'*100), Tree('start', ['A']*100)) + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101) + + # 8191 is a Mersenne prime + g = u"""start: "A"~8191 + """ + l = _Lark(g) + self.assertEqual(l.parse(u'A'*8191), Tree('start', [])) + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192) + + @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX def test_priority_vs_embedded(self): g = """ From b4fe22a27dd67bca414be767b92ab2960798f0d6 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 10:50:37 +0200 Subject: [PATCH 07/48] Python2.7 + comments + Magic constants --- lark/load_grammar.py | 48 ++++++++++++++++++++++++++++++++++++-------- lark/utils.py | 23 ++++++++++++--------- tests/test_parser.py | 11 +++++----- 3 files changed, 58 insertions(+), 24 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 2f51ff6..2b1030f 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -174,6 +174,10 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } +REPEAT_BREAK_THRESHOLD = 20 +# The Threshold whether repeat via ~ are split up into different rules +# For the moment 20 is arbitrarily chosen + @inline_args class EBNF_to_BNF(Transformer_InPlace): @@ -211,25 +215,50 @@ class EBNF_to_BNF(Transformer_InPlace): """ When target matches n times atom This builds a rule that matches atom (a*n + b) times + + The rule is of the form: + + The rules are of the form: (Example a = 3, b = 4) + + new_rule: target target target atom atom atom atom + + e.g. 
we use target * a and atom * b """ key = (a, b, target, atom) try: return self.rules_cache[key] except KeyError: - new_name = self._name_rule('a%d_b%d' % (a, b)) + new_name = self._name_rule('repeat_a%d_b%d' % (a, b)) tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) return self._add_rule(key, new_name, tree) def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): """ When target matches n times atom, and target_opt 0 to n-1 times target_opt, - This builds a rule that matches atom 0 to (a*n+b)-1 times + This builds a rule that matches atom 0 to (a*n+b)-1 times. + The created rule will not have any shift/reduce conflicts so that it can be used with lalr + + The rules are of the form: (Example a = 3, b = 4) + + new_rule: target_opt + | target target_opt + | target target target_opt + + | target target target atom + | target target target atom atom + | target target target atom atom atom + + First we generate target * i followed by target_opt for i from 0 to a-1 + These match 0 to n*a - 1 times atom + + Then we generate target * a followed by atom * i for i from 1 to b-1 + These match n*a to n*a + b-1 times atom """ key = (a, b, target, atom, "opt") try: return self.rules_cache[key] except KeyError: - new_name = self._name_rule('a%d_b%d_opt' % (a, b)) + new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) tree = ST('expansions', [ ST('expansion', [target] * i + [target_opt]) for i in range(a) @@ -240,13 +269,19 @@ class EBNF_to_BNF(Transformer_InPlace): return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): + """ + We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn). + We then use small_factors to split up mn and diff up into values [(a, b), ...] + This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt + to generate a complete rule/expression that matches the corresponding number of repeats + """ mn_factors = small_factors(mn) mn_target = rule for a, b in mn_factors: mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target - diff = mx - mn + 1 # We add one because _add_repeat_opt_rule needs it. + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less diff_factors = small_factors(diff) diff_target = rule diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 1-1 times) @@ -257,8 +292,6 @@ class EBNF_to_BNF(Transformer_InPlace): a, b = diff_factors[-1] diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) - # return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) - # return ST('expansions', [ST('expansion', [mn_target] + [rule] * n) for n in range(0, mx - mn + 1)]) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) def expr(self, rule, op, *args): @@ -286,8 +319,7 @@ class EBNF_to_BNF(Transformer_InPlace): if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) # For small number of repeats, we don't need to build new rules. 
- # Value 20 is arbitrarily chosen - if mx > 20: + if mx > REPEAT_BREAK_THRESHOLD: return self._generate_repeats(rule, mn, mx) else: return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) diff --git a/lark/utils.py b/lark/utils.py index 2fa5f43..1648720 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -187,7 +187,7 @@ def get_regexp_width(expr): return 1, sre_constants.MAXREPEAT else: return 0, sre_constants.MAXREPEAT - + ###} @@ -288,7 +288,7 @@ except ImportError: class FS: exists = os.path.exists - + @staticmethod def open(name, mode="r", **kwargs): if atomicwrites and "w" in mode: @@ -361,9 +361,13 @@ def _serialize(value, memo): return value +# 10 is arbitrarily chosen +SMALL_FACTOR_THRESHOLD = 10 + + def small_factors(n): """ - Splits n up into smaller factors and summands <= 10. + Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD. Returns a list of [(a, b), ...] so that the following code returns n: @@ -371,21 +375,20 @@ def small_factors(n): for a, b in values: n = n * a + b - Currently, we also keep a + b <= 10, but that might change + Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change """ assert n >= 0 - if n < 10: + if n < SMALL_FACTOR_THRESHOLD: return [(n, 0)] # TODO: Think of better algorithms (Prime factors should minimize the number of steps) - for a in range(10, 1, -1): + for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): b = n % a - if a + b > 10: + if a + b > SMALL_FACTOR_THRESHOLD: continue r = n // a assert r * a + b == n # Sanity check - if r <= 10: + if r <= SMALL_FACTOR_THRESHOLD: return [(r, 0), (a, b)] else: - return [*small_factors(r), (a, b)] - # This should be unreachable, since 2 + 1 <= 10 + return small_factors(r) + [(a, b)] assert False, "Failed to factorize %s" % n diff --git a/tests/test_parser.py b/tests/test_parser.py index 6c00fbb..2247b46 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2233,24 +2233,23 @@ def _make_parser_test(LEXER, PARSER): """ l = _Lark(g) self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") - self.assertEqual(l.parse(u'A'*30), Tree('start', ["A"]*30)) - self.assertRaises(ParseError, l.parse, u'A'*29) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A'*31) - + self.assertEqual(l.parse(u'A' * 30), Tree('start', ["A"] * 30)) + self.assertRaises(ParseError, l.parse, u'A' * 29) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 31) g = u"""!start: "A"~0..100 """ l = _Lark(g) self.assertEqual(l.parse(u''), Tree('start', [])) self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) - self.assertEqual(l.parse(u'A'*100), Tree('start', ['A']*100)) + self.assertEqual(l.parse(u'A' * 100), Tree('start', ['A'] * 100)) self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101) # 8191 is a Mersenne prime g = u"""start: "A"~8191 """ l = _Lark(g) - self.assertEqual(l.parse(u'A'*8191), Tree('start', [])) + self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192) From 6872404f1123bc6dcabb4f1735622747999b2bdc Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 26 Jul 2021 12:20:15 +0300 Subject: [PATCH 08/48] Improvements to the Python3 grammar --- examples/advanced/python3.lark | 142 +++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 53 deletions(-) diff --git a/examples/advanced/python3.lark 
b/examples/advanced/python3.lark index 0fc5949..e54eb69 100644 --- a/examples/advanced/python3.lark +++ b/examples/advanced/python3.lark @@ -21,7 +21,7 @@ decorators: decorator+ decorated: decorators (classdef | funcdef | async_funcdef) async_funcdef: "async" funcdef -funcdef: "def" NAME "(" parameters? ")" ["->" test] ":" suite +funcdef: "def" NAME "(" [parameters] ")" ["->" test] ":" suite parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams]] | starparams @@ -29,25 +29,36 @@ parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams SLASH: "/" // Otherwise the it will completely disappear and it will be undisguisable in the result starparams: "*" typedparam? ("," paramvalue)* ["," kwparams] -kwparams: "**" typedparam +kwparams: "**" typedparam ","? -?paramvalue: typedparam ["=" test] -?typedparam: NAME [":" test] +?paramvalue: typedparam ("=" test)? +?typedparam: NAME (":" test)? -varargslist: (vfpdef ["=" test] ("," vfpdef ["=" test])* ["," [ "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | "**" vfpdef [","]]] - | "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] - | "**" vfpdef [","]) -vfpdef: NAME +lambdef: "lambda" [lambda_params] ":" test +lambdef_nocond: "lambda" [lambda_params] ":" test_nocond +lambda_params: lambda_paramvalue ("," lambda_paramvalue)* ["," [lambda_starparams | lambda_kwparams]] + | lambda_starparams + | lambda_kwparams +?lambda_paramvalue: NAME ("=" test)? +lambda_starparams: "*" [NAME] ("," lambda_paramvalue)* ["," [lambda_kwparams]] +lambda_kwparams: "**" NAME ","? + ?stmt: simple_stmt | compound_stmt ?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE -?small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) -?expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) - | ("=" (yield_expr|testlist_star_expr))*) -annassign: ":" test ["=" test] -?testlist_star_expr: (test|star_expr) ("," (test|star_expr))* [","] -!augassign: ("+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=") +?small_stmt: (expr_stmt | assign_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) +expr_stmt: testlist_star_expr +assign_stmt: annassign | augassign | assign + +annassign: testlist_star_expr ":" test ["=" test] +assign: testlist_star_expr ("=" (yield_expr|testlist_star_expr))+ +augassign: testlist_star_expr augassign_op (yield_expr|testlist) +!augassign_op: "+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=" +?testlist_star_expr: test_or_star_expr + | test_or_star_expr ("," test_or_star_expr)+ ","? 
-> tuple + | test_or_star_expr "," -> tuple + // For normal and annotated assignments, additional restrictions enforced by the interpreter del_stmt: "del" exprlist pass_stmt: "pass" @@ -71,43 +82,52 @@ global_stmt: "global" NAME ("," NAME)* nonlocal_stmt: "nonlocal" NAME ("," NAME)* assert_stmt: "assert" test ["," test] -compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt +?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt async_stmt: "async" (funcdef | with_stmt | for_stmt) -if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite] +if_stmt: "if" test ":" suite elifs ["else" ":" suite] +elifs: elif_* +elif_: "elif" test ":" suite while_stmt: "while" test ":" suite ["else" ":" suite] for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] -try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite)) -with_stmt: "with" with_item ("," with_item)* ":" suite +try_stmt: "try" ":" suite except_clauses ["else" ":" suite] [finally] + | "try" ":" suite finally -> try_finally +finally: "finally" ":" suite +except_clauses: except_clause+ +except_clause: "except" [test ["as" NAME]] ":" suite + +with_stmt: "with" with_items ":" suite +with_items: with_item ("," with_item)* with_item: test ["as" expr] // NB compile.c makes sure that the default except clause is last -except_clause: "except" [test ["as" NAME]] suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT -?test: or_test ("if" or_test "else" test)? | lambdef +?test: or_test ("if" or_test "else" test)? + | lambdef ?test_nocond: or_test | lambdef_nocond -lambdef: "lambda" [varargslist] ":" test -lambdef_nocond: "lambda" [varargslist] ":" test_nocond + ?or_test: and_test ("or" and_test)* ?and_test: not_test ("and" not_test)* -?not_test: "not" not_test -> not +?not_test: "not" not_test -> not_test | comparison -?comparison: expr (_comp_op expr)* +?comparison: expr (comp_op expr)* star_expr: "*" expr -?expr: xor_expr ("|" xor_expr)* + +?expr: or_expr +?or_expr: xor_expr ("|" xor_expr)* ?xor_expr: and_expr ("^" and_expr)* ?and_expr: shift_expr ("&" shift_expr)* ?shift_expr: arith_expr (_shift_op arith_expr)* ?arith_expr: term (_add_op term)* ?term: factor (_mul_op factor)* -?factor: _factor_op factor | power +?factor: _unary_op factor | power -!_factor_op: "+"|"-"|"~" +!_unary_op: "+"|"-"|"~" !_add_op: "+"|"-" !_shift_op: "<<"|">>" !_mul_op: "*"|"@"|"/"|"%"|"//" // <> isn't actually a valid comparison operator in Python. It's here for the // sake of a __future__ import described in PEP 401 (which really works :-) -!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" +!comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" ?power: await_expr ("**" factor)? ?await_expr: AWAIT? atom_expr @@ -118,61 +138,76 @@ AWAIT: "await" | atom_expr "." NAME -> getattr | atom -?atom: "(" [yield_expr|tuplelist_comp] ")" -> tuple - | "[" [testlist_comp] "]" -> list - | "{" [dict_comp] "}" -> dict - | "{" set_comp "}" -> set +?atom: "(" yield_expr ")" + | "(" _tuple_inner? ")" -> tuple + | "(" comprehension{test_or_star_expr} ")" -> tuple_comprehension + | "[" _testlist_comp? "]" -> list + | "[" comprehension{test_or_star_expr} "]" -> list_comprehension + | "{" _dict_exprlist? 
"}" -> dict + | "{" comprehension{key_value} "}" -> dict_comprehension + | "{" _set_exprlist "}" -> set + | "{" comprehension{test} "}" -> set_comprehension | NAME -> var - | number | string+ + | TEMPLATE_NAME -> template_var + | number + | string_concat | "(" test ")" | "..." -> ellipsis | "None" -> const_none | "True" -> const_true | "False" -> const_false -?testlist_comp: test | tuplelist_comp -tuplelist_comp: (test|star_expr) (comp_for | ("," (test|star_expr))+ [","] | ",") + +?string_concat: string+ + +_testlist_comp: test | _tuple_inner +_tuple_inner: test_or_star_expr (("," test_or_star_expr)+ [","] | ",") + + +?test_or_star_expr: test + | star_expr + ?subscriptlist: subscript | subscript (("," subscript)+ [","] | ",") -> subscript_tuple -subscript: test | ([test] ":" [test] [sliceop]) -> slice +?subscript: test | ([test] ":" [test] [sliceop]) -> slice sliceop: ":" [test] -exprlist: (expr|star_expr) - | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") -> exprlist_tuple -testlist: test | testlist_tuple +?exprlist: (expr|star_expr) + | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") +?testlist: test | testlist_tuple testlist_tuple: test (("," test)+ [","] | ",") -dict_comp: key_value comp_for - | (key_value | "**" expr) ("," (key_value | "**" expr))* [","] +_dict_exprlist: (key_value | "**" expr) ("," (key_value | "**" expr))* [","] key_value: test ":" test -set_comp: test comp_for - | (test|star_expr) ("," (test | star_expr))* [","] +_set_exprlist: test_or_star_expr ("," test_or_star_expr)* [","] classdef: "class" NAME ["(" [arguments] ")"] ":" suite + + arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])? | starargs | kwargs - | test comp_for + | comprehension{test} -starargs: "*" test ("," "*" test)* ("," argvalue)* ["," kwargs] +starargs: stararg ("," stararg)* ("," argvalue)* ["," kwargs] +stararg: "*" test kwargs: "**" test ?argvalue: test ("=" test)? 
- -comp_iter: comp_for | comp_if | async_for -async_for: "async" "for" exprlist "in" or_test [comp_iter] -comp_for: "for" exprlist "in" or_test [comp_iter] -comp_if: "if" test_nocond [comp_iter] +comprehension{comp_result}: comp_result comp_fors [comp_if] +comp_fors: comp_for+ +comp_for: [ASYNC] "for" exprlist "in" or_test +ASYNC: "async" +?comp_if: "if" test_nocond // not used in grammar, but may appear in "node" passed from Parser to Compiler encoding_decl: NAME -yield_expr: "yield" [yield_arg] -yield_arg: "from" test | testlist - +yield_expr: "yield" [testlist] + | "yield" "from" test -> yield_from number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER string: STRING | LONG_STRING @@ -181,6 +216,7 @@ string: STRING | LONG_STRING %import python (NAME, COMMENT, STRING, LONG_STRING) %import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER) + // Other terminals _NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+ From 6a027982c7f8c014dc8403f9d22e52d1c9cb5a21 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 26 Jul 2021 14:54:41 +0300 Subject: [PATCH 09/48] Tiny fix to PR --- examples/advanced/python3.lark | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/advanced/python3.lark b/examples/advanced/python3.lark index e54eb69..7fb5ae5 100644 --- a/examples/advanced/python3.lark +++ b/examples/advanced/python3.lark @@ -148,7 +148,6 @@ AWAIT: "await" | "{" _set_exprlist "}" -> set | "{" comprehension{test} "}" -> set_comprehension | NAME -> var - | TEMPLATE_NAME -> template_var | number | string_concat | "(" test ")" From fa8565366be027df0f788cb1432d90b2b94aa264 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 18:53:43 +0200 Subject: [PATCH 10/48] Off-by-one fix + Change of thresholds + fix tests --- lark/load_grammar.py | 29 ++++++++++++++++++++++------- lark/utils.py | 8 +++++--- tests/test_parser.py | 19 ++++++++++--------- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 2b1030f..36f6e2c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -174,9 +174,21 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } -REPEAT_BREAK_THRESHOLD = 20 +REPEAT_BREAK_THRESHOLD = 50 # The Threshold whether repeat via ~ are split up into different rules -# For the moment 20 is arbitrarily chosen +# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, +# while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. 
+# For a grammar of the form start: "A"~0..N, these are the timing stats: +# N t +# 10 0.000 +# 20 0.004 +# 30 0.016 +# 40 0.049 +# 50 0.109 +# 60 0.215 +# 70 0.383 +# 80 0.631 +# (See PR #949) @inline_args @@ -244,6 +256,7 @@ class EBNF_to_BNF(Transformer_InPlace): | target target_opt | target target target_opt + | target target target | target target target atom | target target target atom atom | target target target atom atom atom @@ -251,7 +264,7 @@ class EBNF_to_BNF(Transformer_InPlace): First we generate target * i followed by target_opt for i from 0 to a-1 These match 0 to n*a - 1 times atom - Then we generate target * a followed by atom * i for i from 1 to b-1 + Then we generate target * a followed by atom * i for i from 0 to b-1 These match n*a to n*a + b-1 times atom """ key = (a, b, target, atom, "opt") @@ -264,7 +277,7 @@ class EBNF_to_BNF(Transformer_InPlace): for i in range(a) ] + [ ST('expansion', [target] * a + [atom] * i) - for i in range(1, b) + for i in range(b) ]) return self._add_rule(key, new_name, tree) @@ -281,15 +294,17 @@ class EBNF_to_BNF(Transformer_InPlace): mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less diff_factors = small_factors(diff) - diff_target = rule - diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 1-1 times) + diff_target = rule # Match rule 1 times + diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times) for a, b in diff_factors[:-1]: new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) diff_target = new_diff_target - a, b = diff_factors[-1] + + a, b = diff_factors[-1] # We do the last on separately since we don't need to call self._add_repeat_rule diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) diff --git a/lark/utils.py b/lark/utils.py index 1648720..f447b9e 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -361,8 +361,9 @@ def _serialize(value, memo): return value -# 10 is arbitrarily chosen -SMALL_FACTOR_THRESHOLD = 10 +# Value 5 keeps the number of states in the lalr parser somewhat minimal +# It isn't optimal, but close to it. See PR #949 +SMALL_FACTOR_THRESHOLD = 5 def small_factors(n): @@ -380,7 +381,8 @@ def small_factors(n): assert n >= 0 if n < SMALL_FACTOR_THRESHOLD: return [(n, 0)] - # TODO: Think of better algorithms (Prime factors should minimize the number of steps) + # While this does not provide an optimal solution, it produces a pretty good one. 
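+    # (Worked example: with SMALL_FACTOR_THRESHOLD = 5, small_factors(8191) returns
+    #  [(3, 0), (4, 0), (5, 0), (3, 2), (3, 0), (5, 0), (3, 1)]; folding these back
+    #  with n = n * a + b, starting from n = 1, reconstructs 8191.)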
+ # See above comment and PR #949 for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): b = n % a if a + b > SMALL_FACTOR_THRESHOLD: diff --git a/tests/test_parser.py b/tests/test_parser.py index 2247b46..b55f848 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2229,21 +2229,22 @@ def _make_parser_test(LEXER, PARSER): @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") def test_ranged_repeat_large(self): # Large is currently arbitrarily chosen to be large than 20 - g = u"""!start: "A"~30 + g = u"""!start: "A"~60 """ l = _Lark(g) self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") - self.assertEqual(l.parse(u'A' * 30), Tree('start', ["A"] * 30)) - self.assertRaises(ParseError, l.parse, u'A' * 29) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 31) + self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) + self.assertRaises(ParseError, l.parse, u'A' * 59) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) - g = u"""!start: "A"~0..100 + g = u"""!start: "A"~15..100 """ l = _Lark(g) - self.assertEqual(l.parse(u''), Tree('start', [])) - self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) - self.assertEqual(l.parse(u'A' * 100), Tree('start', ['A'] * 100)) - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101) + for i in range(0, 110): + if 15 <= i <= 100: + self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) + else: + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i) # 8191 is a Mersenne prime g = u"""start: "A"~8191 From 3436b3388546cfd8c802ab09d59d6e9e82cb1c7e Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 21:13:29 +0200 Subject: [PATCH 11/48] Refactor small_factors --- lark/utils.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/lark/utils.py b/lark/utils.py index f447b9e..610d160 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -379,18 +379,12 @@ def small_factors(n): Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change """ assert n >= 0 - if n < SMALL_FACTOR_THRESHOLD: + if n <= SMALL_FACTOR_THRESHOLD: return [(n, 0)] # While this does not provide an optimal solution, it produces a pretty good one. # See above comment and PR #949 for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): - b = n % a - if a + b > SMALL_FACTOR_THRESHOLD: - continue - r = n // a - assert r * a + b == n # Sanity check - if r <= SMALL_FACTOR_THRESHOLD: - return [(r, 0), (a, b)] - else: + r, b = divmod(n, a) + if a + b <= SMALL_FACTOR_THRESHOLD: return small_factors(r) + [(a, b)] assert False, "Failed to factorize %s" % n From 90460f31d98da5a08ec14c0ad7062756dcc82668 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 27 Jul 2021 11:45:01 +0300 Subject: [PATCH 12/48] Refactored PR #949 and edited the comments/docstrings --- lark/load_grammar.py | 100 ++++++++++++++++++++----------------------- lark/utils.py | 21 ++++----- tests/test_parser.py | 8 ++-- 3 files changed, 60 insertions(+), 69 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 36f6e2c..d1d06cc 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -174,21 +174,15 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } -REPEAT_BREAK_THRESHOLD = 50 + +# Value 5 keeps the number of states in the lalr parser somewhat minimal +# It isn't optimal, but close to it. 
See PR #949 +SMALL_FACTOR_THRESHOLD = 5 # The Threshold whether repeat via ~ are split up into different rules # 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, # while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. -# For a grammar of the form start: "A"~0..N, these are the timing stats: -# N t -# 10 0.000 -# 20 0.004 -# 30 0.016 -# 40 0.049 -# 50 0.109 -# 60 0.215 -# 70 0.383 -# 80 0.631 # (See PR #949) +REPEAT_BREAK_THRESHOLD = 50 @inline_args @@ -224,17 +218,16 @@ class EBNF_to_BNF(Transformer_InPlace): return self._add_rule(expr, new_name, tree) def _add_repeat_rule(self, a, b, target, atom): - """ - When target matches n times atom - This builds a rule that matches atom (a*n + b) times + """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times. - The rule is of the form: + When called recursively (into target), it repeats atom for x(n) times, where: + x(0) = 1 + x(n) = a(n) * x(n-1) + b - The rules are of the form: (Example a = 3, b = 4) + Example rule when a=3, b=4: - new_rule: target target target atom atom atom atom + new_rule: target target target atom atom atom atom - e.g. we use target * a and atom * b """ key = (a, b, target, atom) try: @@ -245,27 +238,29 @@ class EBNF_to_BNF(Transformer_InPlace): return self._add_rule(key, new_name, tree) def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): - """ + """Creates a rule that matches atom 0 to (a*n+b)-1 times. + When target matches n times atom, and target_opt 0 to n-1 times target_opt, - This builds a rule that matches atom 0 to (a*n+b)-1 times. - The created rule will not have any shift/reduce conflicts so that it can be used with lalr - The rules are of the form: (Example a = 3, b = 4) + First we generate target * i followed by target_opt, for i from 0 to a-1 + These match 0 to n*a - 1 times atom + + Then we generate target * a followed by atom * i, for i from 0 to b-1 + These match n*a to n*a + b-1 times atom - new_rule: target_opt - | target target_opt - | target target target_opt + The created rule will not have any shift/reduce conflicts so that it can be used with lalr - | target target target - | target target target atom - | target target target atom atom - | target target target atom atom atom + Example rule when a=3, b=4: - First we generate target * i followed by target_opt for i from 0 to a-1 - These match 0 to n*a - 1 times atom + new_rule: target_opt + | target target_opt + | target target target_opt + + | target target target + | target target target atom + | target target target atom atom + | target target target atom atom atom - Then we generate target * a followed by atom * i for i from 0 to b-1 - These match n*a to n*a + b-1 times atom """ key = (a, b, target, atom, "opt") try: @@ -273,38 +268,39 @@ class EBNF_to_BNF(Transformer_InPlace): except KeyError: new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) tree = ST('expansions', [ - ST('expansion', [target] * i + [target_opt]) - for i in range(a) + ST('expansion', [target]*i + [target_opt]) for i in range(a) ] + [ - ST('expansion', [target] * a + [atom] * i) - for i in range(b) + ST('expansion', [target]*a + [atom]*i) for i in range(b) ]) return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): + """Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times. """ - We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn). 
- We then use small_factors to split up mn and diff up into values [(a, b), ...] - This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt - to generate a complete rule/expression that matches the corresponding number of repeats - """ - mn_factors = small_factors(mn) + # For a small number of repeats, we can take the naive approach + if mx < REPEAT_BREAK_THRESHOLD: + return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + + # For large repeat values, we break the repetition into sub-rules. + # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``. + # We then use small_factors to split up mn and diff up into values [(a, b), ...] + # This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt + # to generate a complete rule/expression that matches the corresponding number of repeats mn_target = rule - for a, b in mn_factors: + for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD): mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less - diff_factors = small_factors(diff) + diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD) diff_target = rule # Match rule 1 times diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times) for a, b in diff_factors[:-1]: - new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) - diff_target = new_diff_target + diff_target = self._add_repeat_rule(a, b, diff_target, rule) - a, b = diff_factors[-1] # We do the last on separately since we don't need to call self._add_repeat_rule + a, b = diff_factors[-1] diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) @@ -333,11 +329,9 @@ class EBNF_to_BNF(Transformer_InPlace): mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - # For small number of repeats, we don't need to build new rules. - if mx > REPEAT_BREAK_THRESHOLD: - return self._generate_repeats(rule, mn, mx) - else: - return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + + return self._generate_repeats(rule, mn, mx) + assert False, op def maybe(self, rule): diff --git a/lark/utils.py b/lark/utils.py index 610d160..2938591 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -361,14 +361,11 @@ def _serialize(value, memo): return value -# Value 5 keeps the number of states in the lalr parser somewhat minimal -# It isn't optimal, but close to it. See PR #949 -SMALL_FACTOR_THRESHOLD = 5 -def small_factors(n): +def small_factors(n, max_factor): """ - Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD. + Splits n up into smaller factors and summands <= max_factor. Returns a list of [(a, b), ...] so that the following code returns n: @@ -376,15 +373,15 @@ def small_factors(n): for a, b in values: n = n * a + b - Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change + Currently, we also keep a + b <= max_factor, but that might change """ assert n >= 0 - if n <= SMALL_FACTOR_THRESHOLD: + assert max_factor > 2 + if n <= max_factor: return [(n, 0)] - # While this does not provide an optimal solution, it produces a pretty good one. 
- # See above comment and PR #949 - for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): + + for a in range(max_factor, 1, -1): r, b = divmod(n, a) - if a + b <= SMALL_FACTOR_THRESHOLD: - return small_factors(r) + [(a, b)] + if a + b <= max_factor: + return small_factors(r, max_factor) + [(a, b)] assert False, "Failed to factorize %s" % n diff --git a/tests/test_parser.py b/tests/test_parser.py index b55f848..ffb1d8f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2226,7 +2226,7 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') - @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") + @unittest.skipIf(PARSER != 'lalr', "We only need to test rule generation, we know BNF is solid on all parsers") def test_ranged_repeat_large(self): # Large is currently arbitrarily chosen to be large than 20 g = u"""!start: "A"~60 @@ -2244,15 +2244,15 @@ def _make_parser_test(LEXER, PARSER): if 15 <= i <= 100: self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) else: - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i) + self.assertRaises(UnexpectedInput, l.parse, u'A' * i) # 8191 is a Mersenne prime g = u"""start: "A"~8191 """ l = _Lark(g) self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX From cf61f78509e52b44c4fbbaf40f42a688f754342b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 27 Jul 2021 15:08:29 +0300 Subject: [PATCH 13/48] Tests: Moved repeat operator tests to test_grammar --- tests/test_grammar.py | 49 ++++++++++++++++++++++++++++++++++++++++++- tests/test_parser.py | 49 ------------------------------------------- 2 files changed, 48 insertions(+), 50 deletions(-) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index a643117..3ae65f2 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -3,7 +3,7 @@ from __future__ import absolute_import import sys from unittest import TestCase, main -from lark import Lark, Token, Tree +from lark import Lark, Token, Tree, ParseError, UnexpectedInput from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors from lark.load_grammar import FromPackageLoader @@ -198,6 +198,53 @@ class TestGrammar(TestCase): x = find_grammar_errors(text) assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6] + def test_ranged_repeat_terms(self): + g = u"""!start: AAA + AAA: "A"~3 + """ + l = Lark(g, parser='lalr') + self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') + + g = u"""!start: AABB CC + AABB: "A"~0..2 "B"~2 + CC: "C"~1..2 + """ + l = Lark(g, parser='lalr') + self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) + self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) + self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') + 
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') + + def test_ranged_repeat_large(self): + g = u"""!start: "A"~60 + """ + l = Lark(g, parser='lalr') + self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") + self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) + self.assertRaises(ParseError, l.parse, u'A' * 59) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) + + g = u"""!start: "A"~15..100 + """ + l = Lark(g, parser='lalr') + for i in range(0, 110): + if 15 <= i <= 100: + self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) + else: + self.assertRaises(UnexpectedInput, l.parse, u'A' * i) + + # 8191 is a Mersenne prime + g = u"""start: "A"~8191 + """ + l = Lark(g, parser='lalr') + self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) if __name__ == '__main__': diff --git a/tests/test_parser.py b/tests/test_parser.py index ffb1d8f..9eb7b26 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2204,55 +2204,6 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') - def test_ranged_repeat_terms(self): - g = u"""!start: AAA - AAA: "A"~3 - """ - l = _Lark(g) - self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') - - g = u"""!start: AABB CC - AABB: "A"~0..2 "B"~2 - CC: "C"~1..2 - """ - l = _Lark(g) - self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) - self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) - self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') - - @unittest.skipIf(PARSER != 'lalr', "We only need to test rule generation, we know BNF is solid on all parsers") - def test_ranged_repeat_large(self): - # Large is currently arbitrarily chosen to be large than 20 - g = u"""!start: "A"~60 - """ - l = _Lark(g) - self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") - self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) - self.assertRaises(ParseError, l.parse, u'A' * 59) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) - - g = u"""!start: "A"~15..100 - """ - l = _Lark(g) - for i in range(0, 110): - if 15 <= i <= 100: - self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) - else: - self.assertRaises(UnexpectedInput, l.parse, u'A' * i) - - # 8191 is a Mersenne prime - g = u"""start: "A"~8191 - """ - l = _Lark(g) - self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) - self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) - self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX From e8d5e7e30db5e728cd4521308036aa55730f9957 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 7 Aug 2021 11:11:10 +0300 Subject: [PATCH 14/48] Docs: Updated IDE link --- README.md | 2 +- docs/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/README.md b/README.md index 8ec22ed..f4335d0 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h - [Documentation @readthedocs](https://lark-parser.readthedocs.io/) - [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf) -- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide/app.html) +- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide) - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - [Gitter chat](https://gitter.im/lark-parser/Lobby) diff --git a/docs/index.rst b/docs/index.rst index 39ecd5a..c4e8be6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -113,7 +113,7 @@ Resources .. _Examples: https://github.com/lark-parser/lark/tree/master/examples .. _Third-party examples: https://github.com/ligurio/lark-grammars -.. _Online IDE: https://lark-parser.github.io/lark/ide/app.html +.. _Online IDE: https://lark-parser.github.io/lark/ide .. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/ .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf From 41b2ba0d3a37757c30e3010763a516e822eaba87 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 7 Aug 2021 11:13:31 +0300 Subject: [PATCH 15/48] Docs: Updated IDE links again --- README.md | 2 +- docs/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f4335d0..82f6148 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h - [Documentation @readthedocs](https://lark-parser.readthedocs.io/) - [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf) -- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide) +- [Online IDE](https://lark-parser.github.io/ide) - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - [Gitter chat](https://gitter.im/lark-parser/Lobby) diff --git a/docs/index.rst b/docs/index.rst index c4e8be6..e8bd6b2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -113,7 +113,7 @@ Resources .. _Examples: https://github.com/lark-parser/lark/tree/master/examples .. _Third-party examples: https://github.com/ligurio/lark-grammars -.. _Online IDE: https://lark-parser.github.io/lark/ide +.. _Online IDE: https://lark-parser.github.io/ide .. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/ .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf From f3826ed3d16d98d1a6619ca8bc8a0d0b9493d596 Mon Sep 17 00:00:00 2001 From: Louis Sautier Date: Tue, 10 Aug 2021 00:02:08 +0200 Subject: [PATCH 16/48] Remove ineffective description-file key from setup.cfg Otherwise, setuptools warns that: "UserWarning: Usage of dash-separated 'description-file' will not be supported in future versions. Please use the underscore name 'description_file' instead" This key doesn't seem to do anything unless you use pbr. 
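For reference, a project that actually relies on pbr would presumably keep the
key under its underscore spelling instead of deleting it, e.g.:

    [metadata]
    description_file = README.md

Since Lark doesn't use pbr, the key is dropped entirely.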
--- setup.cfg | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 6ddead9..6d71f28 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,6 +5,4 @@ zip_safe= universal = 1 [metadata] -description-file = README.md license_file = LICENSE - From 3269605211f92942296257e34722a979801c204c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20K=C3=A4ufl?= Date: Tue, 17 Aug 2021 10:46:37 +0200 Subject: [PATCH 17/48] Remove config for Travis CI --- .travis.yml | 15 --------------- README.md | 2 +- tox.ini | 11 ----------- 3 files changed, 1 insertion(+), 27 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 6448cc8..0000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -dist: xenial -language: python -python: - - "2.7" - - "3.4" - - "3.5" - - "3.6" - - "3.7" - - "3.8" - - "3.9-dev" - - "pypy2.7-6.0" - - "pypy3.5-6.0" -install: pip install tox-travis -script: - - tox diff --git a/README.md b/README.md index 82f6148..70be4fe 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h Lark has no dependencies. -[![Build Status](https://travis-ci.org/lark-parser/lark.svg?branch=master)](https://travis-ci.org/lark-parser/lark) +[![Tests](https://github.com/lark-parser/lark/actions/workflows/tests.yml/badge.svg)](https://github.com/lark-parser/lark/actions/workflows/tests.yml) ### Syntax Highlighting diff --git a/tox.ini b/tox.ini index ef19e2c..cef423b 100644 --- a/tox.ini +++ b/tox.ini @@ -2,17 +2,6 @@ envlist = py27, py34, py35, py36, py37, py38, py39, pypy, pypy3 skip_missing_interpreters=true -[travis] -2.7 = py27 -3.4 = py34 -3.5 = py35 -3.6 = py36 -3.7 = py37 -3.8 = py38 -3.9 = py39 -pypy = pypy -pypy3 = pypy3 - [testenv] whitelist_externals = git deps = From 8f73a58a5446a2ffb078905af8acd11c358d3425 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20K=C3=A4ufl?= Date: Tue, 17 Aug 2021 10:49:20 +0200 Subject: [PATCH 18/48] Run tests against Python 3.10 --- .github/workflows/tests.yml | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1630c8b..c7b9286 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9.0-rc - 3.9, pypy2, pypy3] + python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10.0-rc - 3.10, pypy2, pypy3] steps: - uses: actions/checkout@v2 diff --git a/tox.ini b/tox.ini index ef19e2c..842ed2b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py27, py34, py35, py36, py37, py38, py39, pypy, pypy3 +envlist = py27, py34, py35, py36, py37, py38, py39, py310, pypy, pypy3 skip_missing_interpreters=true [travis] From 8f2bef29bca3aaee6844a861b9f1f2ac4f72d2b2 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 23 Aug 2021 11:51:17 +0100 Subject: [PATCH 19/48] README: Added link to Lark.js --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 70be4fe..76dcdea 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,10 @@ Lark provides syntax highlighting for its grammar files (\*.lark): ### Clones +These are implementations of Lark in other languages. They accept Lark grammars, and provide similar utilities. + - [Lerche (Julia)](https://github.com/jamesrhester/Lerche.jl) - an unofficial clone, written entirely in Julia. 
+- [Lark.js (Javascript)](https://github.com/lark-parser/lark.js) - a port of the stand-alone LALR(1) parser generator to Javascript.
 
 ### Hello World
 
From 3d3858a30cad78136ff07bdedfe27b90b3400956 Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Tue, 24 Aug 2021 14:20:10 +0100
Subject: [PATCH 20/48] README: Added Poetry to 'projects using Lark'

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 76dcdea..156a671 100644
--- a/README.md
+++ b/README.md
@@ -146,6 +146,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail
 
 ### Projects using Lark
 
+ - [Poetry](https://github.com/python-poetry/poetry-core) - A utility for dependency management and packaging
 - [tartiflette](https://github.com/dailymotion/tartiflette) - a GraphQL server by Dailymotion
 - [Hypothesis](https://github.com/HypothesisWorks/hypothesis) - Library for property-based testing
 - [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration
From c9c33423fca6b7d4b8cc3cbb794e991670d78c0a Mon Sep 17 00:00:00 2001
From: MegaIng
Date: Wed, 25 Aug 2021 03:10:11 +0200
Subject: [PATCH 21/48] Fix recursion error for many options in Terminal

---
 lark/load_grammar.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 69bd788..e1f9223 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -600,7 +600,21 @@ def _make_joined_pattern(regexp, flags_set):
     return PatternRE(regexp, flags)
 
 
-class TerminalTreeToPattern(Transformer):
+class FlattenExpansions(Transformer_InPlace):
+    @v_args(tree=True)
+    def expansions(self, tree):
+        i = 0
+        while i < len(tree.children):
+            c = tree.children[i]
+            if isinstance(c, Tree) and c.data == 'expansions':
+                tree.children[i:i+1] = c.children
+                i += len(c.children)
+            else:
+                i += 1
+        return tree
+
+
+class TerminalTreeToPattern(Transformer_NonRecursive):
     def pattern(self, ps):
         p ,= ps
         return p
@@ -670,7 +684,7 @@ class Grammar:
     def compile(self, start, terminals_to_keep):
         # We change the trees in-place (to support huge grammars)
         # So deepcopy allows calling compile more than once.
- term_defs = deepcopy(list(self.term_defs)) + term_defs = [(n,(nr_deepcopy_tree(t), p)) for n,(t,p) in self.term_defs] rule_defs = [(n,p,nr_deepcopy_tree(t),o) for n,p,t,o in self.rule_defs] # =================== @@ -686,7 +700,7 @@ class Grammar: if len(expansions) == 1 and not expansions[0].children: raise GrammarError("Terminals cannot be empty (%s)" % name) - transformer = PrepareLiterals() * TerminalTreeToPattern() + transformer = PrepareLiterals() * FlattenExpansions() * TerminalTreeToPattern() terminals = [TerminalDef(name, transformer.transform(term_tree), priority) for name, (term_tree, priority) in term_defs if term_tree] From 3a4568df246b7c413cbe2309b6411c3ac599136b Mon Sep 17 00:00:00 2001 From: MegaIng Date: Thu, 26 Aug 2021 00:09:22 +0200 Subject: [PATCH 22/48] Reworked grammar to simplify later processing + expand_kids_by_data + tests --- lark-stubs/tree.pyi | 3 +++ lark/exceptions.py | 2 +- lark/lexer.py | 2 +- lark/load_grammar.py | 40 +++++++++++--------------------------- lark/parse_tree_builder.py | 3 +-- lark/tree.py | 9 +++++++++ tests/test_grammar.py | 12 ++++++++++++ 7 files changed, 38 insertions(+), 33 deletions(-) diff --git a/lark-stubs/tree.pyi b/lark-stubs/tree.pyi index ea99ff6..0c12819 100644 --- a/lark-stubs/tree.pyi +++ b/lark-stubs/tree.pyi @@ -40,6 +40,9 @@ class Tree: def expand_kids_by_index(self, *indices: int) -> None: ... + def expand_kids_by_data(self, *data_values: str) -> bool: + ... + def scan_values(self, pred: Callable[[Union[str, Tree]], bool]) -> Iterator[str]: ... diff --git a/lark/exceptions.py b/lark/exceptions.py index fdcd52b..9f18753 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -210,7 +210,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') - self.pos_in_stream = getattr(token, 'pos_in_stream', None) + self.pos_in_stream = getattr(token, 'start_pos', None) self.state = state self.token = token diff --git a/lark/lexer.py b/lark/lexer.py index 7c2f979..a82cc18 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -150,7 +150,7 @@ class Token(Str): @property def pos_in_stream(self): - warn("Attribute Token.pos_in_stream was renamed to Token.start_pos", DeprecationWarning) + warn("Attribute Token.pos_in_stream was renamed to Token.start_pos", DeprecationWarning, 2) return self.start_pos def update(self, type_=None, value=None): diff --git a/lark/load_grammar.py b/lark/load_grammar.py index e1f9223..f2f5499 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -91,6 +91,7 @@ TERMINALS = { 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, '_NL': r'(\r?\n)+\s*', + '_NL_OR': r'(\r?\n)+\s*\|', 'WS': r'[ \t]+', 'COMMENT': r'\s*//[^\n]*', '_TO': '->', @@ -113,9 +114,10 @@ RULES = { ''], '_template_params': ['RULE', '_template_params _COMMA RULE'], - 'expansions': ['alias', - 'expansions _OR alias', - 'expansions _NL _OR alias'], + 'expansions': ['_expansions'], + '_expansions': ['alias', + '_expansions _OR alias', + '_expansions _NL_OR alias'], '?alias': ['expansion _TO RULE', 'expansion'], 'expansion': ['_expansion'], @@ -357,11 +359,8 @@ class SimplifyRule_Visitor(Visitor): @staticmethod def _flatten(tree): while True: - to_expand = [i for i, child in enumerate(tree.children) - if isinstance(child, Tree) and child.data == tree.data] - if not to_expand: + if not tree.expand_kids_by_data(tree.data): break - 
tree.expand_kids_by_index(*to_expand) def expansion(self, tree): # rules_list unpacking @@ -599,21 +598,6 @@ def _make_joined_pattern(regexp, flags_set): return PatternRE(regexp, flags) - -class FlattenExpansions(Transformer_InPlace): - @v_args(tree=True) - def expansions(self, tree): - i = 0 - while i < len(tree.children): - c = tree.children[i] - if isinstance(c, Tree) and c.data == 'expansions': - tree.children[i:i+1] = c.children - i += len(c.children) - else: - i += 1 - return tree - - class TerminalTreeToPattern(Transformer_NonRecursive): def pattern(self, ps): p ,= ps @@ -684,8 +668,8 @@ class Grammar: def compile(self, start, terminals_to_keep): # We change the trees in-place (to support huge grammars) # So deepcopy allows calling compile more than once. - term_defs = [(n,(nr_deepcopy_tree(t), p)) for n,(t,p) in self.term_defs] - rule_defs = [(n,p,nr_deepcopy_tree(t),o) for n,p,t,o in self.rule_defs] + term_defs = [(n, (nr_deepcopy_tree(t), p)) for n, (t, p) in self.term_defs] + rule_defs = [(n, p, nr_deepcopy_tree(t), o) for n, p, t, o in self.rule_defs] # =================== # Compile Terminals @@ -700,7 +684,7 @@ class Grammar: if len(expansions) == 1 and not expansions[0].children: raise GrammarError("Terminals cannot be empty (%s)" % name) - transformer = PrepareLiterals() * FlattenExpansions() * TerminalTreeToPattern() + transformer = PrepareLiterals() * TerminalTreeToPattern() terminals = [TerminalDef(name, transformer.transform(term_tree), priority) for name, (term_tree, priority) in term_defs if term_tree] @@ -933,7 +917,7 @@ def _get_parser(): parser_conf = ParserConf(rules, callback, ['start']) lexer_conf.lexer_type = 'standard' parser_conf.parser_type = 'lalr' - _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, {}) + _get_parser.cache = ParsingFrontend(lexer_conf, parser_conf, None) return _get_parser.cache GRAMMAR_ERRORS = [ @@ -1110,9 +1094,7 @@ class GrammarBuilder: # TODO: think about what to do with 'options' base = self._definitions[name][1] - while len(base.children) == 2: - assert isinstance(base.children[0], Tree) and base.children[0].data == 'expansions', base - base = base.children[0] + assert isinstance(base, Tree) and base.data == 'expansions' base.children.insert(0, exp) def _ignore(self, exp_or_name): diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 286038e..fa526b0 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -204,8 +204,7 @@ class AmbiguousExpander: if i in self.to_expand: ambiguous.append(i) - to_expand = [j for j, grandchild in enumerate(child.children) if _is_ambig_tree(grandchild)] - child.expand_kids_by_index(*to_expand) + child.expand_kids_by_data('_ambig') if not ambiguous: return self.node_builder(children) diff --git a/lark/tree.py b/lark/tree.py index bee53cf..8a29bcb 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -107,6 +107,15 @@ class Tree(object): kid = self.children[i] self.children[i:i+1] = kid.children + def expand_kids_by_data(self, *data_values): + """Expand (inline) children with any of the given data values. Returns True if anything changed""" + indices = [i for i, c in enumerate(self.children) if isinstance(c, Tree) and c.data in data_values] + if indices: + self.expand_kids_by_index(*indices) + return True + else: + return False + def scan_values(self, pred): """Return all values in the tree that evaluate pred(value) as true. 
diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 3ae65f2..47a345c 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -246,6 +246,18 @@ class TestGrammar(TestCase): self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) + def test_large_terminal(self): + # TODO: The `reversed` below is required because otherwise the regex engine is happy + # with just parsing 9 from the string 999 instead of consuming the longest + g = "start: NUMBERS\n" + g += "NUMBERS: " + '|'.join('"%s"' % i for i in reversed(range(0, 1000))) + + l = Lark(g, parser='lalr') + for i in (0, 9, 99, 999): + self.assertEqual(l.parse(str(i)), Tree('start', [str(i)])) + for i in (-1, 1000): + self.assertRaises(UnexpectedInput, l.parse, str(i)) + if __name__ == '__main__': main() From f5c7af8ce9996f303e05aa4caec32abccde615fa Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 25 Aug 2021 23:57:20 +0100 Subject: [PATCH 23/48] Proposed corrections to PR #970 --- lark/load_grammar.py | 5 ++--- lark/tree.py | 10 ++++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index f2f5499..7c64196 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -358,9 +358,8 @@ class SimplifyRule_Visitor(Visitor): @staticmethod def _flatten(tree): - while True: - if not tree.expand_kids_by_data(tree.data): - break + while tree.expand_kids_by_data(tree.data): + pass def expansion(self, tree): # rules_list unpacking diff --git a/lark/tree.py b/lark/tree.py index 8a29bcb..1d14bf3 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -110,12 +110,14 @@ class Tree(object): def expand_kids_by_data(self, *data_values): """Expand (inline) children with any of the given data values. Returns True if anything changed""" indices = [i for i, c in enumerate(self.children) if isinstance(c, Tree) and c.data in data_values] - if indices: - self.expand_kids_by_index(*indices) - return True - else: + if not indices: return False + for i in reversed(indices): # reverse so that changing tail won't affect indices + child = self.children[i] + self.children[i:i+1] = child.children + return True + def scan_values(self, pred): """Return all values in the tree that evaluate pred(value) as true. From d2e8b15c2f846a15c57f1e6ade625c46560a43de Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 26 Aug 2021 08:22:02 +0100 Subject: [PATCH 24/48] Another update for the PR --- lark/tree.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lark/tree.py b/lark/tree.py index 1d14bf3..2cd8233 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -109,14 +109,14 @@ class Tree(object): def expand_kids_by_data(self, *data_values): """Expand (inline) children with any of the given data values. Returns True if anything changed""" - indices = [i for i, c in enumerate(self.children) if isinstance(c, Tree) and c.data in data_values] - if not indices: - return False + changed = False + for i, c in reversed(list(enumerate(self.children))): + if isinstance(c, Tree) and c.data in data_values: + child = self.children[i] + self.children[i:i+1] = child.children + changed = True + return changed - for i in reversed(indices): # reverse so that changing tail won't affect indices - child = self.children[i] - self.children[i:i+1] = child.children - return True def scan_values(self, pred): """Return all values in the tree that evaluate pred(value) as true. 
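For context, a minimal sketch of how the new `Tree.expand_kids_by_data` helper behaves.
This snippet is illustrative rather than part of the patches themselves, and assumes only
the public `Tree`/`Token` API from lark:

    from lark import Tree, Token

    t = Tree('start', [Tree('_ambig', [Token('N', '1'), Token('N', '2')]), Token('N', '3')])

    # Children whose .data matches one of the given values are inlined in their place.
    assert t.expand_kids_by_data('_ambig')
    assert t.children == [Token('N', '1'), Token('N', '2'), Token('N', '3')]

    # A second call finds nothing left to expand, so it reports that nothing changed.
    assert not t.expand_kids_by_data('_ambig')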
From 4fe49c9cdb34e3a1fdd7bb6f1ff1321e87f390b7 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 26 Aug 2021 13:36:05 +0100 Subject: [PATCH 25/48] Change expand_kids_by_data to use range --- lark/tree.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lark/tree.py b/lark/tree.py index 2cd8233..0937b85 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -110,9 +110,9 @@ class Tree(object): def expand_kids_by_data(self, *data_values): """Expand (inline) children with any of the given data values. Returns True if anything changed""" changed = False - for i, c in reversed(list(enumerate(self.children))): - if isinstance(c, Tree) and c.data in data_values: - child = self.children[i] + for i in range(len(self.children)-1, -1, -1): + child = self.children[i] + if isinstance(child, Tree) and child.data in data_values: self.children[i:i+1] = child.children changed = True return changed From b3816b9a90abcf914f12891cd6a6fbb74f0ee771 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 27 Aug 2021 12:59:31 +0100 Subject: [PATCH 26/48] Added utility function: List grammar imports + test --- lark/load_grammar.py | 6 ++++++ tests/test_grammar.py | 18 ++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 7c64196..4a9360c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -1336,6 +1336,12 @@ def verify_used_files(file_hashes): return False return True +def list_grammar_imports(grammar, import_paths=[]): + "Returns a list of paths to the lark grammars imported by the given grammar (recursively)" + builder = GrammarBuilder(False, import_paths) + builder.load_grammar(grammar, '') + return list(builder.used_files.keys()) + def load_grammar(grammar, source, import_paths, global_keep_all_tokens): builder = GrammarBuilder(global_keep_all_tokens, import_paths) builder.load_grammar(grammar, source) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 47a345c..319d709 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -1,10 +1,10 @@ from __future__ import absolute_import -import sys +import os from unittest import TestCase, main from lark import Lark, Token, Tree, ParseError, UnexpectedInput -from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors +from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors, list_grammar_imports from lark.load_grammar import FromPackageLoader @@ -258,6 +258,20 @@ class TestGrammar(TestCase): for i in (-1, 1000): self.assertRaises(UnexpectedInput, l.parse, str(i)) + def test_list_grammar_imports(self): + grammar = """ + %import .test_templates_import (start, sep) + + %override sep{item, delim}: item (delim item)* delim? + %ignore " " + """ + + imports = list_grammar_imports(grammar, [os.path.dirname(__file__)]) + self.assertEqual({os.path.split(i)[-1] for i in imports}, {'test_templates_import.lark', 'templates.lark'}) + + imports = list_grammar_imports('%import common.WS', []) + assert len(imports) == 1 and imports[0].pkg_name == 'lark' + if __name__ == '__main__': main() From fc24666f37304954a7508fd77638bbb529deaba6 Mon Sep 17 00:00:00 2001 From: "Robin A. 
Dorstijn" Date: Fri, 27 Aug 2021 09:45:00 +0200 Subject: [PATCH 27/48] Add merge function --- lark/visitors.py | 56 +++++++++++++++++++++++++++++++++++++++++++++ tests/test_trees.py | 56 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 108 insertions(+), 4 deletions(-) diff --git a/lark/visitors.py b/lark/visitors.py index 23ef64a..4894cab 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -149,6 +149,62 @@ class Transformer(_Decoratable): return token +def merge_transformers(base_transformer=None, **kwargs): + """ + Add the methods of other transformer to this one. + + This method is meant to aid in the maintenance of imports. + + Example: + ```python + class T1(Transformer): + def method_on_main_grammar(self, children): + # Do something + pass + + def imported_grammar__method(self, children): + # Do something else + pass + + class TMain(Transformer): + def method_on_main_grammar(self, children): + # Do something + pass + + class TImportedGrammar(Transformer): + def method(self, children): + # Do something else + pass + + regular_transformer = T1() + composed_transformer = merge_transformers(TMain(), + imported_grammar=TImportedGrammar()) + ``` + In the above code block `regular_transformer` and `composed_transformer` + should behave identically. + """ + + if base_transformer is None: + base_transformer = Transformer() + for prefix, transformer in kwargs.items(): + prefix += "__" + + for method_name in dir(transformer): + method = getattr(transformer, method_name) + if not callable(method): + continue + if method_name.startswith("_") or method_name == "transform": + continue + new_method_name = prefix + method_name + if prefix + method_name in dir(base_transformer): + raise AttributeError( + ("Method '{new_method_name}' already present in base " + "transformer").format(new_method_name=new_method_name)) + setattr(base_transformer, new_method_name, method) + + return base_transformer + + class InlineTransformer(Transformer): # XXX Deprecated def _call_userfunc(self, tree, new_children=None): # Assumes tree is already transformed diff --git a/tests/test_trees.py b/tests/test_trees.py index c7f9787..730b80b 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -9,7 +9,7 @@ import functools from lark.tree import Tree from lark.lexer import Token from lark.visitors import Visitor, Visitor_Recursive, Transformer, Interpreter, visit_children_decor, v_args, Discard, Transformer_InPlace, \ - Transformer_InPlaceRecursive, Transformer_NonRecursive + Transformer_InPlaceRecursive, Transformer_NonRecursive, merge_transformers class TestTrees(TestCase): @@ -233,21 +233,69 @@ class TestTrees(TestCase): x = MyTransformer().transform( t ) self.assertEqual(x, t2) - + def test_transformer_variants(self): tree = Tree('start', [Tree('add', [Token('N', '1'), Token('N', '2')]), Tree('add', [Token('N', '3'), Token('N', '4')])]) for base in (Transformer, Transformer_InPlace, Transformer_NonRecursive, Transformer_InPlaceRecursive): class T(base): def add(self, children): return sum(children) - + def N(self, token): return int(token) - + copied = copy.deepcopy(tree) result = T().transform(copied) self.assertEqual(result, Tree('start', [3, 7])) + def test_merge_transformers(self): + tree = Tree('start', [ + Tree('main', [ + Token("A", '1'), Token("B", '2') + ]), + Tree("module__main", [ + Token("A", "2"), Token("B", "3") + ]) + ]) + + class T1(Transformer): + A = int + B = int + main = sum + start = list + def module__main(self, children): + prod = 1 + for child in children: + prod *= child 
+ return prod + + class T2(Transformer): + A = int + B = int + main = sum + start = list + + class T3(Transformer): + def main(self, children): + prod = 1 + for child in children: + prod *= child + return prod + + class T4(Transformer): + def other_aspect(self, children): + pass + + t1_res = T1().transform(tree) + composed_res = merge_transformers(T2(), module=T3()).transform(tree) + self.assertEqual(t1_res, composed_res) + with self.assertRaises(AttributeError): + merge_transformers(T1(), module=T3()) + + try: + composed = merge_transformers(T1(), module=T4()) + except AttributeError: + self.fail("Should be able to add classes that do not conflict") if __name__ == '__main__': unittest.main() From fb8800914bf870fdb8bbf76afd109228400db120 Mon Sep 17 00:00:00 2001 From: "Robin A. Dorstijn" Date: Fri, 27 Aug 2021 17:07:55 +0200 Subject: [PATCH 28/48] Added advanced example for merge_transformers --- examples/advanced/advanced_transformers.py | 67 ++++++++++++++++++++++ examples/advanced/csv.lark | 14 +++++ examples/advanced/json.lark | 19 ++++++ examples/advanced/storage.lark | 8 +++ lark/visitors.py | 14 ++++- 5 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 examples/advanced/advanced_transformers.py create mode 100644 examples/advanced/csv.lark create mode 100644 examples/advanced/json.lark create mode 100644 examples/advanced/storage.lark diff --git a/examples/advanced/advanced_transformers.py b/examples/advanced/advanced_transformers.py new file mode 100644 index 0000000..9810f44 --- /dev/null +++ b/examples/advanced/advanced_transformers.py @@ -0,0 +1,67 @@ +""" +Transformer merging +================== + +This example is intended to show how transformers can be merged in order to +keep the individual steps clean and simple. + +.. note:: + The imported rules will have to be aliased according to the file it is in. + (See `storage.lark` for an implementation of this idea.) +""" +from lark import Lark, Tree +from json import dumps +from lark.visitors import Transformer, merge_transformers, v_args + +class JsonTreeToJson(Transformer): + @v_args(inline=True) + def string(self, s): + return s[1:-1].replace('\\"', '"') + + array = list + pair = tuple + object = dict + number = v_args(inline=True)(float) + + null = lambda self, _: None + true = lambda self, _: True + false = lambda self, _: False + +class CsvTreeToPandasDict(Transformer): + INT = int + FLOAT = float + SIGNED_FLOAT = float + WORD = str + NON_SEPARATOR_STRING = str + + def row(self, children): + return children + + def start(self, children): + data = {} + + header = children[0].children + for heading in header: + data[heading] = [] + + for row in children[1:]: + for i, element in enumerate(row): + data[header[i]].append(element) + + return data + +class Base(Transformer): + def start(self, children): + return children[0] + +if __name__ == "__main__": + merged = merge_transformers(Base(), csv=CsvTreeToPandasDict(), json=JsonTreeToJson()) + parser = Lark.open("storage.lark") + csv_tree = parser.parse("""# file lines author +data.json 12 Robin +data.csv 30 erezsh +compiler.py 123123 Megalng +""") + print("CSV data in pandas form:", merged.transform(csv_tree)) + json_tree = parser.parse(dumps({"test": "a", "dict": { "list": [1, 1.2] }})) + print("JSON data transformed: ", merged.transform(json_tree)) diff --git a/examples/advanced/csv.lark b/examples/advanced/csv.lark new file mode 100644 index 0000000..cc2b675 --- /dev/null +++ b/examples/advanced/csv.lark @@ -0,0 +1,14 @@ +start: header _NL row+ +header: "#" " "? 
(WORD _SEPARATOR?)+ +row: (_anything _SEPARATOR?)+ _NL +_anything: INT | WORD | NON_SEPARATOR_STRING | FLOAT | SIGNED_FLOAT +NON_SEPARATOR_STRING: /[a-zA-z.;\\\/]+/ +_SEPARATOR: /[ ]+/ + | "\t" + | "," + +%import common.NEWLINE -> _NL +%import common.WORD +%import common.INT +%import common.FLOAT +%import common.SIGNED_FLOAT diff --git a/examples/advanced/json.lark b/examples/advanced/json.lark new file mode 100644 index 0000000..bb77c35 --- /dev/null +++ b/examples/advanced/json.lark @@ -0,0 +1,19 @@ +?start: value + +?value: object + | array + | string + | SIGNED_NUMBER -> number + | "true" -> true + | "false" -> false + | "null" -> null + +array : "[" _WS? [value ("," _WS? value)*] "]" +object : "{" _WS? [pair ("," _WS? pair)*] "}" +pair : string ":" _WS value + +string : ESCAPED_STRING + +%import common.ESCAPED_STRING +%import common.SIGNED_NUMBER +%import common.WS -> _WS diff --git a/examples/advanced/storage.lark b/examples/advanced/storage.lark new file mode 100644 index 0000000..64718ed --- /dev/null +++ b/examples/advanced/storage.lark @@ -0,0 +1,8 @@ +start: csv__start + | json__start + +// Renaming of the import variables is required, as they +// receive the namespace of this file. +// See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565 +%import .csv.start -> csv__start +%import .json.start -> json__start diff --git a/lark/visitors.py b/lark/visitors.py index 4894cab..498a676 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -151,9 +151,16 @@ class Transformer(_Decoratable): def merge_transformers(base_transformer=None, **kwargs): """ - Add the methods of other transformer to this one. + Paramaters: + :param base_transformer: Transformer that all other transformers will be added to. + :param \**kwargs: Key-value arguments providing the prefix for the methods of the transformer and the Transformers themselves. - This method is meant to aid in the maintenance of imports. + Compose a new transformer from a base and the in the `**kwargs` provided Transformer instances. + + The key should match the grammar file that the Transformer is supposed to manipulate. + + This method is meant to aid the composing of large transformers that + manipulate grammars that cross multiple lark files. Example: ```python @@ -183,11 +190,12 @@ def merge_transformers(base_transformer=None, **kwargs): In the above code block `regular_transformer` and `composed_transformer` should behave identically. """ + infix = "__" if base_transformer is None: base_transformer = Transformer() for prefix, transformer in kwargs.items(): - prefix += "__" + prefix += infix for method_name in dir(transformer): method = getattr(transformer, method_name) From afbdb6a4a9087613cf870aa56cb15b2e3fd2b5a7 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 28 Aug 2021 10:14:10 +0100 Subject: [PATCH 29/48] Edits to merge_transformers --- lark/visitors.py | 80 ++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/lark/visitors.py b/lark/visitors.py index 498a676..3d65a13 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -149,66 +149,54 @@ class Transformer(_Decoratable): return token -def merge_transformers(base_transformer=None, **kwargs): - """ - Paramaters: - :param base_transformer: Transformer that all other transformers will be added to. - :param \**kwargs: Key-value arguments providing the prefix for the methods of the transformer and the Transformers themselves. 
+def merge_transformers(base_transformer=None, **transformers_to_merge): + """Merge a collection of transformers into the base_transformer, each into its own 'namespace'. + + When called, it will collect the methods from each transformer, and assign them to base_transformer, + with their name prefixed with the given keyword, as ``prefix__methodname`. - Compose a new transformer from a base and the in the `**kwargs` provided Transformer instances. + This function is especially useful for processing grammars that import other grammars, + thereby creating some of their rules in a 'namespace'. (i.e with a consitent name prefix) + In this case, the key for the transformer should match the name of the imported grammar. - The key should match the grammar file that the Transformer is supposed to manipulate. + Paramaters: + base_transformer (Transformer, optional): The transformer that all other transformers will be added to. + **transformers_to_merge: Keyword arguments, in the form of ``name_prefix = transformer``. - This method is meant to aid the composing of large transformers that - manipulate grammars that cross multiple lark files. + Raises: + AttributeError: In case of a name collision in the merged methods Example: - ```python - class T1(Transformer): - def method_on_main_grammar(self, children): - # Do something - pass - - def imported_grammar__method(self, children): - # Do something else - pass - - class TMain(Transformer): - def method_on_main_grammar(self, children): - # Do something - pass - - class TImportedGrammar(Transformer): - def method(self, children): - # Do something else - pass - - regular_transformer = T1() - composed_transformer = merge_transformers(TMain(), - imported_grammar=TImportedGrammar()) - ``` - In the above code block `regular_transformer` and `composed_transformer` - should behave identically. 
- """ - infix = "__" + ```python + class TBase(Transformer): + def start(self, children): + return children[0] + 'bar' + + class TImportedGrammar(Transformer): + def foo(self, children): + return "foo" + composed_transformer = merge_transformers(TBase(), imported=TImportedGrammar()) + + t = Tree('start', [ Tree('imported__foo', []) ]) + + assert composed_transformer.transform(t) == 'foobar' + ``` + """ if base_transformer is None: base_transformer = Transformer() - for prefix, transformer in kwargs.items(): - prefix += infix - + for prefix, transformer in transformers_to_merge.items(): for method_name in dir(transformer): method = getattr(transformer, method_name) if not callable(method): continue if method_name.startswith("_") or method_name == "transform": continue - new_method_name = prefix + method_name - if prefix + method_name in dir(base_transformer): - raise AttributeError( - ("Method '{new_method_name}' already present in base " - "transformer").format(new_method_name=new_method_name)) - setattr(base_transformer, new_method_name, method) + prefixed_method = prefix + "__" + method_name + if hasattr(base_transformer, prefixed_method): + raise AttributeError("Cannot merge: method '%s' appears more than once" % prefixed_method) + + setattr(base_transformer, prefixed_method, method) return base_transformer From d44951383971f5e49a71ca7b712f146786730b38 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 28 Aug 2021 10:28:38 +0100 Subject: [PATCH 30/48] Tests: Updated test_merge_transformers --- tests/test_trees.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/tests/test_trees.py b/tests/test_trees.py index 730b80b..82bf6c9 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -264,10 +264,7 @@ class TestTrees(TestCase): main = sum start = list def module__main(self, children): - prod = 1 - for child in children: - prod *= child - return prod + return sum(children) class T2(Transformer): A = int @@ -277,25 +274,21 @@ class TestTrees(TestCase): class T3(Transformer): def main(self, children): - prod = 1 - for child in children: - prod *= child - return prod + return sum(children) class T4(Transformer): - def other_aspect(self, children): - pass + main = sum + t1_res = T1().transform(tree) composed_res = merge_transformers(T2(), module=T3()).transform(tree) self.assertEqual(t1_res, composed_res) + + composed_res2 = merge_transformers(T2(), module=T4()).transform(tree) + self.assertEqual(t1_res, composed_res2) + with self.assertRaises(AttributeError): merge_transformers(T1(), module=T3()) - try: - composed = merge_transformers(T1(), module=T4()) - except AttributeError: - self.fail("Should be able to add classes that do not conflict") - if __name__ == '__main__': unittest.main() From ca1131a3a19ce2817dc95131333e81d4d20b168c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 28 Aug 2021 11:13:01 +0100 Subject: [PATCH 31/48] Examples: Moved example into 'composition' folder, and improved it --- examples/advanced/advanced_transformers.py | 67 ------------------- examples/composition/README.md | 10 +++ .../composition/combined_csv_and_json.txt | 6 ++ examples/{advanced => composition}/csv.lark | 0 examples/composition/eval_csv.py | 24 +++++++ examples/composition/eval_json.py | 15 +++++ examples/{advanced => composition}/json.lark | 0 examples/composition/main.py | 51 ++++++++++++++ .../{advanced => composition}/storage.lark | 4 +- 9 files changed, 108 insertions(+), 69 deletions(-) delete mode 100644 examples/advanced/advanced_transformers.py 
create mode 100644 examples/composition/README.md create mode 100644 examples/composition/combined_csv_and_json.txt rename examples/{advanced => composition}/csv.lark (100%) create mode 100644 examples/composition/eval_csv.py create mode 100644 examples/composition/eval_json.py rename examples/{advanced => composition}/json.lark (100%) create mode 100644 examples/composition/main.py rename examples/{advanced => composition}/storage.lark (79%) diff --git a/examples/advanced/advanced_transformers.py b/examples/advanced/advanced_transformers.py deleted file mode 100644 index 9810f44..0000000 --- a/examples/advanced/advanced_transformers.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Transformer merging -================== - -This example is intended to show how transformers can be merged in order to -keep the individual steps clean and simple. - -.. note:: - The imported rules will have to be aliased according to the file it is in. - (See `storage.lark` for an implementation of this idea.) -""" -from lark import Lark, Tree -from json import dumps -from lark.visitors import Transformer, merge_transformers, v_args - -class JsonTreeToJson(Transformer): - @v_args(inline=True) - def string(self, s): - return s[1:-1].replace('\\"', '"') - - array = list - pair = tuple - object = dict - number = v_args(inline=True)(float) - - null = lambda self, _: None - true = lambda self, _: True - false = lambda self, _: False - -class CsvTreeToPandasDict(Transformer): - INT = int - FLOAT = float - SIGNED_FLOAT = float - WORD = str - NON_SEPARATOR_STRING = str - - def row(self, children): - return children - - def start(self, children): - data = {} - - header = children[0].children - for heading in header: - data[heading] = [] - - for row in children[1:]: - for i, element in enumerate(row): - data[header[i]].append(element) - - return data - -class Base(Transformer): - def start(self, children): - return children[0] - -if __name__ == "__main__": - merged = merge_transformers(Base(), csv=CsvTreeToPandasDict(), json=JsonTreeToJson()) - parser = Lark.open("storage.lark") - csv_tree = parser.parse("""# file lines author -data.json 12 Robin -data.csv 30 erezsh -compiler.py 123123 Megalng -""") - print("CSV data in pandas form:", merged.transform(csv_tree)) - json_tree = parser.parse(dumps({"test": "a", "dict": { "list": [1, 1.2] }})) - print("JSON data transformed: ", merged.transform(json_tree)) diff --git a/examples/composition/README.md b/examples/composition/README.md new file mode 100644 index 0000000..259a66a --- /dev/null +++ b/examples/composition/README.md @@ -0,0 +1,10 @@ +Grammar Composition +=================== + +This example shows how to do grammar composition in Lark, by creating a new +file format that allows both CSV and JSON to co-exist. + +We show how, by using namespaces, Lark grammars and their transformers can be fully reused - +they don't need to care if their grammar is used directly, or being imported, or who is doing the importing. + +See [``main.py``](main.py) for more details. 
\ No newline at end of file
diff --git a/examples/composition/combined_csv_and_json.txt b/examples/composition/combined_csv_and_json.txt
new file mode 100644
index 0000000..5b8df82
--- /dev/null
+++ b/examples/composition/combined_csv_and_json.txt
@@ -0,0 +1,6 @@
+{"header": ["this", "is", "json", 1111]}
+# file lines author
+data.json 12 Robin
+data.csv 30 erezsh
+compiler.py 123123 Megalng
+{"footer": "done"}
diff --git a/examples/advanced/csv.lark b/examples/composition/csv.lark
similarity index 100%
rename from examples/advanced/csv.lark
rename to examples/composition/csv.lark
diff --git a/examples/composition/eval_csv.py b/examples/composition/eval_csv.py
new file mode 100644
index 0000000..3323936
--- /dev/null
+++ b/examples/composition/eval_csv.py
@@ -0,0 +1,24 @@
+from lark import Transformer
+
+class CsvTreeToPandasDict(Transformer):
+    INT = int
+    FLOAT = float
+    SIGNED_FLOAT = float
+    WORD = str
+    NON_SEPARATOR_STRING = str
+
+    def row(self, children):
+        return children
+
+    def start(self, children):
+        data = {}
+
+        header = children[0].children
+        for heading in header:
+            data[heading] = []
+
+        for row in children[1:]:
+            for i, element in enumerate(row):
+                data[header[i]].append(element)
+
+        return data
diff --git a/examples/composition/eval_json.py b/examples/composition/eval_json.py
new file mode 100644
index 0000000..26bf501
--- /dev/null
+++ b/examples/composition/eval_json.py
@@ -0,0 +1,15 @@
+from lark import Transformer, v_args
+
+class JsonTreeToJson(Transformer):
+    @v_args(inline=True)
+    def string(self, s):
+        return s[1:-1].replace('\\"', '"')
+
+    array = list
+    pair = tuple
+    object = dict
+    number = v_args(inline=True)(float)
+
+    null = lambda self, _: None
+    true = lambda self, _: True
+    false = lambda self, _: False
diff --git a/examples/advanced/json.lark b/examples/composition/json.lark
similarity index 100%
rename from examples/advanced/json.lark
rename to examples/composition/json.lark
diff --git a/examples/composition/main.py b/examples/composition/main.py
new file mode 100644
index 0000000..a549abe
--- /dev/null
+++ b/examples/composition/main.py
@@ -0,0 +1,51 @@
+"""
+Grammar Composition
+===================
+
+This example shows how to do grammar composition in Lark, by creating a new
+file format that allows both CSV and JSON to co-exist.
+
+1) We define ``storage.lark``, which imports both ``csv.lark`` and ``json.lark``,
+   and allows them to be used one after the other.
+
+   In the generated tree, each imported rule/terminal is automatically prefixed (with ``json__`` or ``csv__``),
+   which creates an implicit namespace and allows them to coexist without collisions.
+
+2) We merge their respective transformers (unaware of each other) into a new base transformer.
+   The resulting transformer can evaluate both JSON and CSV in the parse tree.
+
+   The methods of each transformer are renamed into their appropriate namespace, using the given prefix.
+   This appraoch allows full re-use: the transformers don't need to care if their grammar is used directly,
+   or being imported, or who is doing the importing.
+ +""" +from pathlib import Path +from lark import Lark +from json import dumps +from lark.visitors import Transformer, merge_transformers + +from eval_csv import CsvTreeToPandasDict +from eval_json import JsonTreeToJson + +__dir__ = Path(__file__).parent + +class Storage(Transformer): + def start(self, children): + return children + +storage_transformer = merge_transformers(Storage(), csv=CsvTreeToPandasDict(), json=JsonTreeToJson()) + +parser = Lark.open("storage.lark", rel_to=__file__) + +def main(): + json_tree = parser.parse(dumps({"test": "a", "dict": { "list": [1, 1.2] }})) + res = storage_transformer.transform(json_tree) + print("Just JSON: ", res) + + csv_json_tree = parser.parse(open(__dir__ / 'combined_csv_and_json.txt').read()) + res = storage_transformer.transform(csv_json_tree) + print("JSON + CSV: ", dumps(res, indent=2)) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/advanced/storage.lark b/examples/composition/storage.lark similarity index 79% rename from examples/advanced/storage.lark rename to examples/composition/storage.lark index 64718ed..8e2bacc 100644 --- a/examples/advanced/storage.lark +++ b/examples/composition/storage.lark @@ -1,8 +1,8 @@ -start: csv__start - | json__start +start: (csv__start | json__start _NL?)+ // Renaming of the import variables is required, as they // receive the namespace of this file. // See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565 %import .csv.start -> csv__start +%import .csv._NL -> _NL %import .json.start -> json__start From d433e306592de534cbe121e9c98c3992a7f28b51 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 28 Aug 2021 11:25:21 +0100 Subject: [PATCH 32/48] Docs: Add merge_transformer; fix docstrings for sphinx --- docs/visitors.rst | 7 ++++++- examples/composition/eval_csv.py | 2 ++ examples/composition/eval_json.py | 2 ++ lark/visitors.py | 25 +++++++++++++------------ 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/docs/visitors.rst b/docs/visitors.rst index f263712..43d0513 100644 --- a/docs/visitors.rst +++ b/docs/visitors.rst @@ -103,12 +103,17 @@ v_args .. autofunction:: lark.visitors.v_args +merge_transformers +------------------ + +.. autofunction:: lark.visitors.merge_transformers + Discard ------- .. autoclass:: lark.visitors.Discard VisitError -------- +---------- .. autoclass:: lark.exceptions.VisitError \ No newline at end of file diff --git a/examples/composition/eval_csv.py b/examples/composition/eval_csv.py index 3323936..8b83f08 100644 --- a/examples/composition/eval_csv.py +++ b/examples/composition/eval_csv.py @@ -1,3 +1,5 @@ +"Transformer for evaluating csv.lark" + from lark import Transformer class CsvTreeToPandasDict(Transformer): diff --git a/examples/composition/eval_json.py b/examples/composition/eval_json.py index 26bf501..c665a19 100644 --- a/examples/composition/eval_json.py +++ b/examples/composition/eval_json.py @@ -1,3 +1,5 @@ +"Transformer for evaluating json.lark" + from lark import Transformer, v_args class JsonTreeToJson(Transformer): diff --git a/lark/visitors.py b/lark/visitors.py index 3d65a13..e2f8b53 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -159,7 +159,7 @@ def merge_transformers(base_transformer=None, **transformers_to_merge): thereby creating some of their rules in a 'namespace'. (i.e with a consitent name prefix) In this case, the key for the transformer should match the name of the imported grammar. 
- Paramaters: + Parameters: base_transformer (Transformer, optional): The transformer that all other transformers will be added to. **transformers_to_merge: Keyword arguments, in the form of ``name_prefix = transformer``. @@ -167,21 +167,22 @@ def merge_transformers(base_transformer=None, **transformers_to_merge): AttributeError: In case of a name collision in the merged methods Example: - ```python - class TBase(Transformer): - def start(self, children): - return children[0] + 'bar' + :: + + class TBase(Transformer): + def start(self, children): + return children[0] + 'bar' + + class TImportedGrammar(Transformer): + def foo(self, children): + return "foo" - class TImportedGrammar(Transformer): - def foo(self, children): - return "foo" + composed_transformer = merge_transformers(TBase(), imported=TImportedGrammar()) - composed_transformer = merge_transformers(TBase(), imported=TImportedGrammar()) + t = Tree('start', [ Tree('imported__foo', []) ]) - t = Tree('start', [ Tree('imported__foo', []) ]) + assert composed_transformer.transform(t) == 'foobar' - assert composed_transformer.transform(t) == 'foobar' - ``` """ if base_transformer is None: base_transformer = Transformer() From 155f4a8c0f9fbafcb0eafe65a4d6f1816a3058f2 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 28 Aug 2021 19:49:31 +0100 Subject: [PATCH 33/48] Examples: Tiny fix --- examples/composition/main.py | 2 +- examples/composition/storage.lark | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/composition/main.py b/examples/composition/main.py index a549abe..c6f150f 100644 --- a/examples/composition/main.py +++ b/examples/composition/main.py @@ -15,7 +15,7 @@ file format that allows both CSV and JSON to co-exist. The resulting transformer can evaluate both JSON and CSV in the parse tree. The methods of each transformer are renamed into their appropriate namespace, using the given prefix. - This appraoch allows full re-use: the transformers don't need to care if their grammar is used directly, + This approach allows full re-use: the transformers don't need to care if their grammar is used directly, or being imported, or who is doing the importing. """ diff --git a/examples/composition/storage.lark b/examples/composition/storage.lark index 8e2bacc..09bb0ae 100644 --- a/examples/composition/storage.lark +++ b/examples/composition/storage.lark @@ -1,8 +1,9 @@ start: (csv__start | json__start _NL?)+ -// Renaming of the import variables is required, as they -// receive the namespace of this file. +// Renaming of the import variables is required, as they receive the namespace of this file. // See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565 %import .csv.start -> csv__start -%import .csv._NL -> _NL %import .json.start -> json__start + +%import .csv._NL -> _NL + From 786adec5370014b8e1b9fcb6b7c68a26fe53b234 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 28 Aug 2021 19:54:01 +0100 Subject: [PATCH 34/48] Docs: Tiny fix in docstring in visitors.py --- lark/visitors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lark/visitors.py b/lark/visitors.py index e2f8b53..d45bb19 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -153,10 +153,10 @@ def merge_transformers(base_transformer=None, **transformers_to_merge): """Merge a collection of transformers into the base_transformer, each into its own 'namespace'. 
When called, it will collect the methods from each transformer, and assign them to base_transformer, - with their name prefixed with the given keyword, as ``prefix__methodname`. + with their name prefixed with the given keyword, as ``prefix__methodname``. This function is especially useful for processing grammars that import other grammars, - thereby creating some of their rules in a 'namespace'. (i.e with a consitent name prefix) + thereby creating some of their rules in a 'namespace'. (i.e with a consistent name prefix). In this case, the key for the transformer should match the name of the imported grammar. Parameters: From d28eba4f9a62ca96bbfe5069f43864a6a71bea71 Mon Sep 17 00:00:00 2001 From: Peter Wienemann Date: Sun, 29 Aug 2021 08:56:15 +0200 Subject: [PATCH 35/48] Fix typos in comment of example code --- examples/advanced/extend_python.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/advanced/extend_python.py b/examples/advanced/extend_python.py index bdc7149..ba5fa21 100644 --- a/examples/advanced/extend_python.py +++ b/examples/advanced/extend_python.py @@ -39,7 +39,7 @@ def name(n): """, start='file_input') -# Remove the 'python3__' prefix that was add to the implicitely imported rules. +# Remove the 'python3__' prefix that was added to the implicitly imported rules. for t in tree.iter_subtrees(): t.data = t.data.rsplit('__', 1)[-1] From 9c63734705bd23eae6257b23621cf49fc5649ae9 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 29 Aug 2021 10:19:59 +0100 Subject: [PATCH 36/48] Fix for issue #977 --- lark/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/utils.py b/lark/utils.py index 2938591..051adfa 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -287,7 +287,7 @@ except ImportError: atomicwrites = None class FS: - exists = os.path.exists + exists = staticmethod(os.path.exists) @staticmethod def open(name, mode="r", **kwargs): From e0889a3cf3cd4de596647eee6859eb6667cfc422 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Sun, 29 Aug 2021 23:06:01 +0200 Subject: [PATCH 37/48] Sort Options inside a TerminalTree --- lark/load_grammar.py | 4 ++++ tests/test_grammar.py | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 4a9360c..abcfce1 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -614,6 +614,10 @@ class TerminalTreeToPattern(Transformer_NonRecursive): if len(exps) == 1: return exps[0] + # Do a bit of sorting to make sure that the longest option is returned + # (Python's re module otherwise prefers just 'l' when given (l|ll) and both could match) + exps.sort(key=lambda x: (-x.max_width, -x.min_width, -len(x.value))) + pattern = '(?:%s)' % ('|'.join(i.to_regexp() for i in exps)) return _make_joined_pattern(pattern, {i.flags for i in exps}) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 319d709..c771f2b 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -247,10 +247,8 @@ class TestGrammar(TestCase): self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) def test_large_terminal(self): - # TODO: The `reversed` below is required because otherwise the regex engine is happy - # with just parsing 9 from the string 999 instead of consuming the longest g = "start: NUMBERS\n" - g += "NUMBERS: " + '|'.join('"%s"' % i for i in reversed(range(0, 1000))) + g += "NUMBERS: " + '|'.join('"%s"' % i for i in range(0, 1000)) l = Lark(g, parser='lalr') for i in (0, 9, 99, 999): From 20f57b9e40d6e982e39ff475aaa43221955a59df Mon 
From 20f57b9e40d6e982e39ff475aaa43221955a59df Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Mon, 30 Aug 2021 09:25:49 +0100
Subject: [PATCH 38/48] Updated docs, to accord with PR #980

---
 docs/grammar.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/grammar.md b/docs/grammar.md
index 0d77420..4ac5c77 100644
--- a/docs/grammar.md
+++ b/docs/grammar.md
@@ -159,14 +159,15 @@ start : (A | B)+
 A : "a" | "ab"
 B : "b"
 ```
-We get this behavior:
+We get only one possible derivation, instead of two:
 
 ```bash
+>>> p = Lark(g, ambiguity="explicit")
 >>> p.parse("ab")
-Tree(start, [Token(A, 'a'), Token(B, 'b')])
+Tree('start', [Token('A', 'ab')])
 ```
 
-This is happening because Python's regex engine always returns the first matching option.
+This is happening because Python's regex engine always returns the best matching option.
 There is no way to access the alternatives.
 
 If you find yourself in this situation, the recommended solution is to use rules instead.
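The documentation change reflects that alternatives inside a single terminal are compiled into one regex, so the lexer can only ever report one match. Moving the alternatives into rules hands the choice to the parser, which can keep every derivation. A small sketch of that recommendation (assumes the default Earley parser; the exact tree output may vary by version):

    from lark import Lark

    g = '''
    start : (a | b)+
    a : "a" | "ab"
    b : "b"
    '''
    parser = Lark(g, ambiguity="explicit")

    # "ab" can be derived as a("ab") or as a("a") b("b"); with rules instead
    # of a terminal, both derivations survive under an _ambig node.
    print(parser.parse("ab").pretty())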
From d26a1c5bc6e06021eaca0eaef2ad38a96e8ca7ef Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Mon, 30 Aug 2021 09:52:29 +0100
Subject: [PATCH 39/48] BUGFIX: Regression in lexer-only mode (parser=None)

---
 examples/advanced/python_bytecode.py | 81 ----------------------------
 lark/lark.py                         |  4 +-
 lark/load_grammar.py                 | 13 ++---
 3 files changed, 10 insertions(+), 88 deletions(-)
 delete mode 100644 examples/advanced/python_bytecode.py

diff --git a/examples/advanced/python_bytecode.py b/examples/advanced/python_bytecode.py
deleted file mode 100644
index 6165e82..0000000
--- a/examples/advanced/python_bytecode.py
+++ /dev/null
@@ -1,81 +0,0 @@
-"""
-Compile Python to Bytecode
-==========================
-
-A toy example that compiles Python directly to bytecode, without generating an AST.
-It currently only works for very very simple Python code.
-
-It requires the 'bytecode' library. You can get it using
-::
-
-    $ pip install bytecode
-
-"""
-from lark import Lark, Transformer, v_args
-from lark.indenter import Indenter
-
-from bytecode import Instr, Bytecode
-
-class PythonIndenter(Indenter):
-    NL_type = '_NEWLINE'
-    OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE']
-    CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE']
-    INDENT_type = '_INDENT'
-    DEDENT_type = '_DEDENT'
-    tab_len = 8
-
-
-@v_args(inline=True)
-class Compile(Transformer):
-    def number(self, n):
-        return [Instr('LOAD_CONST', int(n))]
-    def string(self, s):
-        return [Instr('LOAD_CONST', s[1:-1])]
-    def var(self, n):
-        return [Instr('LOAD_NAME', n)]
-
-    def arith_expr(self, a, op, b):
-        # TODO support chain arithmetic
-        assert op == '+'
-        return a + b + [Instr('BINARY_ADD')]
-
-    def arguments(self, args):
-        return args
-
-    def funccall(self, name, args):
-        return name + args + [Instr('CALL_FUNCTION', 1)]
-
-    @v_args(inline=False)
-    def file_input(self, stmts):
-        return sum(stmts, []) + [Instr("RETURN_VALUE")]
-
-    def expr_stmt(self, lval, rval):
-        # TODO more complicated than that
-        name ,= lval
-        assert name.name == 'LOAD_NAME'    # XXX avoid with another layer of abstraction
-        return rval + [Instr("STORE_NAME", name.arg)]
-
-    def __default__(self, *args):
-        assert False, args
-
-
-python_parser3 = Lark.open('python3.lark', rel_to=__file__, start='file_input',
-                           parser='lalr', postlex=PythonIndenter(),
-                           transformer=Compile(), propagate_positions=False)
-
-def compile_python(s):
-    insts = python_parser3.parse(s+"\n")
-    return Bytecode(insts).to_code()
-
-code = compile_python("""
-a = 3
-b = 5
-print("Hello World!")
-print(a+(b+2))
-print((a+b)+2)
-""")
-exec(code)
-# -- Output --
-# Hello World!
-# 10
-# 10
diff --git a/lark/lark.py b/lark/lark.py
index 45dec4d..744cf4b 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -340,7 +340,9 @@ class Lark(Serialize):
         if self.options.ambiguity not in _VALID_AMBIGUITY_OPTIONS:
             raise ConfigurationError("invalid ambiguity option: %r. Must be one of %r" % (self.options.ambiguity, _VALID_AMBIGUITY_OPTIONS))
 
-        if self.options.postlex is not None:
+        if self.options.parser is None:
+            terminals_to_keep = '*'
+        elif self.options.postlex is not None:
             terminals_to_keep = set(self.options.postlex.always_accept)
         else:
             terminals_to_keep = set()
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index abcfce1..309826b 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -782,12 +782,13 @@ class Grammar:
                 break
 
         # Filter out unused terminals
-        used_terms = {t.name for r in compiled_rules
-                      for t in r.expansion
-                      if isinstance(t, Terminal)}
-        terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep)
-        if unused:
-            logger.debug("Unused terminals: %s", [t.name for t in unused])
+        if terminals_to_keep != '*':
+            used_terms = {t.name for r in compiled_rules
+                          for t in r.expansion
+                          if isinstance(t, Terminal)}
+            terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep)
+            if unused:
+                logger.debug("Unused terminals: %s", [t.name for t in unused])
 
         return terminals, compiled_rules, self.ignore
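For context on the fix above: with `parser=None`, Lark acts as a standalone lexer driven through `.lex()`, so terminals that no rule happens to reference must not be pruned as "unused", hence `terminals_to_keep = '*'`. A rough sketch of that mode (the grammar is invented for illustration, and the lexer option is spelled `lexer='standard'` in the 0.12 era; later releases renamed it `'basic'`):

    from lark import Lark

    g = '''
    start: WORD+
    WORD   : /[a-z]+/
    NUMBER : /[0-9]+/    // referenced by no rule, but still wanted when lexing
    %ignore " "
    '''
    lexer = Lark(g, parser=None, lexer='standard')

    # Before the fix, NUMBER was filtered out as unused and "123" failed to lex.
    print([t.type for t in lexer.lex("abc 123")])   # ['WORD', 'NUMBER']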
From df269f6121ac1ba71d8cf7ba4f21cd163c522ef7 Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Mon, 30 Aug 2021 10:06:17 +0100
Subject: [PATCH 40/48] Examples: Fixed python3.lark for reconstruction (but maybe a bug in reconstructor?)

---
 examples/advanced/python3.lark | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/advanced/python3.lark b/examples/advanced/python3.lark
index 7fb5ae5..b3cb2c5 100644
--- a/examples/advanced/python3.lark
+++ b/examples/advanced/python3.lark
@@ -106,8 +106,8 @@ suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT
 ?test_nocond: or_test | lambdef_nocond
 ?or_test: and_test ("or" and_test)*
-?and_test: not_test ("and" not_test)*
-?not_test: "not" not_test -> not_test
+?and_test: not_test_ ("and" not_test_)*
+?not_test_: "not" not_test_ -> not_test
          | comparison
 ?comparison: expr (comp_op expr)*
 star_expr: "*" expr

From 8f7e50e520ce60e9e0ea7e594b8fcba495b17225 Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Mon, 30 Aug 2021 10:07:43 +0100
Subject: [PATCH 41/48] Examples: Removed warnings from python3.lark

---
 examples/advanced/python3.lark | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/advanced/python3.lark b/examples/advanced/python3.lark
index b3cb2c5..cb3b077 100644
--- a/examples/advanced/python3.lark
+++ b/examples/advanced/python3.lark
@@ -1,8 +1,6 @@
 // Python 3 grammar for Lark
 
-// NOTE: Work in progress!!! (XXX TODO)
-// This grammar should parse all python 3.x code successfully,
-// but the resulting parse-tree is still not well-organized.
+// This grammar should parse all python 3.x code successfully.
 
 // Adapted from: https://docs.python.org/3/reference/grammar.html
 // Adapted by: Erez Shinan

From 293bf07c516d4bce7a7e2e37072ab9e784d6d74c Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Mon, 30 Aug 2021 10:13:58 +0100
Subject: [PATCH 42/48] Version bump

---
 lark/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lark/__init__.py b/lark/__init__.py
index f056182..909d410 100644
--- a/lark/__init__.py
+++ b/lark/__init__.py
@@ -7,4 +7,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken,
 from .lexer import Token
 from .lark import Lark
 
-__version__ = "0.11.4"
+__version__ = "0.12.0"

From b1103bdcf1a011356c1230e3dcc46dc118db2c8d Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Mon, 30 Aug 2021 11:28:44 +0100
Subject: [PATCH 43/48] Tests: Fixed skipping nearley when it isn't included

---
 tests/test_nearley/test_nearley.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_nearley/test_nearley.py b/tests/test_nearley/test_nearley.py
index 1ad6449..a205446 100644
--- a/tests/test_nearley/test_nearley.py
+++ b/tests/test_nearley/test_nearley.py
@@ -15,8 +15,8 @@ TEST_PATH = os.path.abspath(os.path.dirname(__file__))
 NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley')
 BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin')
 
-if not os.path.exists(NEARLEY_PATH):
-    logger.warn("Nearley not installed. Skipping Nearley tests!")
+if not os.path.exists(BUILTIN_PATH):
+    logger.warn("Nearley not included. Skipping Nearley tests! (use git submodule to add)")
     raise ImportError("Skipping Nearley tests!")
 
 import js2py    # Ensures that js2py exists, to avoid failing tests

From f5115464ec29b5558473a14ad214dec8897dafb2 Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Mon, 30 Aug 2021 13:56:27 +0100
Subject: [PATCH 44/48] ast_utils: Added support for WithMeta (Initial)

---
 lark/ast_utils.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/lark/ast_utils.py b/lark/ast_utils.py
index b5463a2..f1ab317 100644
--- a/lark/ast_utils.py
+++ b/lark/ast_utils.py
@@ -19,13 +19,22 @@ class AsList(object):
     Subclasses will be instanciated with the parse results as a single list, instead of as arguments.
     """
 
+class WithMeta(object):
+    pass
+
 def camel_to_snake(name):
     return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()
Date: Mon, 30 Aug 2021 13:59:05 +0100
Subject: [PATCH 45/48] Oops

---
 lark/ast_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lark/ast_utils.py b/lark/ast_utils.py
index f1ab317..a92dcbe 100644
--- a/lark/ast_utils.py
+++ b/lark/ast_utils.py
@@ -54,6 +54,7 @@ def create_transformer(ast_module, transformer=None):
         if not name.startswith('_') and inspect.isclass(obj):
             if issubclass(obj, Ast):
                 if not issubclass(obj, AsList):
+                    if issubclass(obj, WithMeta):
                         obj = with_meta_inline(obj).__get__(t)
                     else:
                         obj = inline(obj).__get__(t)

From 9638ad8edb62b6fa932b6020a934440d2a1c02b0 Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Tue, 31 Aug 2021 11:37:09 +0100
Subject: [PATCH 46/48] ast_utils: Refactor and improve interface

---
 examples/advanced/create_ast.py |  3 ++-
 lark/ast_utils.py               | 29 +++++++++--------------------
 2 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/examples/advanced/create_ast.py b/examples/advanced/create_ast.py
index 537e8a8..c855d84 100644
--- a/examples/advanced/create_ast.py
+++ b/examples/advanced/create_ast.py
@@ -31,7 +31,8 @@ class _Statement(_Ast):
     pass
 
 @dataclass
-class Value(_Ast):
+class Value(_Ast, ast_utils.WithMeta):
+    meta: object
     value: object
 
 @dataclass
diff --git a/lark/ast_utils.py b/lark/ast_utils.py
index a92dcbe..3b8a12a 100644
--- a/lark/ast_utils.py
+++ b/lark/ast_utils.py
@@ -20,23 +20,16 @@ class AsList(object):
     """
 
 class WithMeta(object):
+    """Abstract class
+
+    Subclasses will be instanciated the Meta instance of the tree. (see ``v_args`` for more detail)
+    """
     pass
 
 def camel_to_snake(name):
     return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()

From 2b5f98478578a65fe0be4c4de8b72f90178133db Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Tue, 31 Aug 2021 11:42:27 +0100
Subject: [PATCH 47/48] Examples: Added correct type-info to the create_ast example

---
 examples/advanced/create_ast.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/advanced/create_ast.py b/examples/advanced/create_ast.py
index c855d84..95ce520 100644
--- a/examples/advanced/create_ast.py
+++ b/examples/advanced/create_ast.py
@@ -15,6 +15,7 @@ from typing import List
 from dataclasses import dataclass
 
 from lark import Lark, ast_utils, Transformer, v_args
+from lark.tree import Meta
 
 this_module = sys.modules[__name__]
 
@@ -32,7 +33,8 @@ class _Statement(_Ast):
 
 @dataclass
 class Value(_Ast, ast_utils.WithMeta):
-    meta: object
+    "Uses WithMeta to include line-number metadata in the meta attribute"
+    meta: Meta
     value: object
 
 @dataclass

From 2b5f98478578a65fe0be4c4de8b72f90178133db Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Tue, 31 Aug 2021 12:16:39 +0100
Subject: [PATCH 48/48] ast_utils: Fix docstring

---
 lark/ast_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lark/ast_utils.py b/lark/ast_utils.py
index 3b8a12a..0c03d45 100644
--- a/lark/ast_utils.py
+++ b/lark/ast_utils.py
@@ -22,7 +22,7 @@ class AsList(object):
 class WithMeta(object):
     """Abstract class
 
-    Subclasses will be instanciated the Meta instance of the tree. (see ``v_args`` for more detail)
+    Subclasses will be instanciated with the Meta instance of the tree. (see ``v_args`` for more detail)
     """
     pass
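Taken together, patches 44 to 48 leave `ast_utils` with the interface shown in the create_ast example: mark an AST class with `WithMeta` and its first field receives the tree's `Meta` (line and column information). A condensed, self-contained sketch modeled on examples/advanced/create_ast.py (the one-rule grammar here is invented for illustration):

    import sys
    from dataclasses import dataclass

    from lark import Lark, ast_utils, Transformer
    from lark.tree import Meta

    this_module = sys.modules[__name__]

    class _Ast(ast_utils.Ast):
        pass

    @dataclass
    class Value(_Ast, ast_utils.WithMeta):
        # WithMeta makes create_transformer pass the tree's Meta object first
        meta: Meta
        value: object

    # Collects every Ast subclass in this module into Transformer callbacks,
    # with class names converted by camel_to_snake (Value -> value).
    transformer = ast_utils.create_transformer(this_module, Transformer())

    parser = Lark('''
    start: value
    value: INT
    %import common.INT
    ''', propagate_positions=True)

    tree = transformer.transform(parser.parse("42"))
    node = tree.children[0]              # a Value instance
    print(node.meta.line, node.value)    # 1 42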