From 3112259b2388fa9fc3cdc2c77c85c416e3a80c58 Mon Sep 17 00:00:00 2001
From: MegaIng1
Date: Thu, 1 Oct 2020 16:34:52 +0200
Subject: [PATCH 1/3] Fix for maybe_placeholders when keep_all_tokens=True

---
 lark/lark.py               |  4 ++--
 lark/load_grammar.py       | 18 +++++++++++-------
 lark/parse_tree_builder.py |  5 ++---
 tests/test_parser.py       |  4 ++++
 4 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/lark/lark.py b/lark/lark.py
index 8799610..72c8ad0 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -267,7 +267,7 @@ class Lark(Serialize):
             assert self.options.ambiguity in ('resolve', 'explicit', 'forest', 'auto', )
 
         # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source, re_module)
+        self.grammar = load_grammar(grammar, self.source, re_module, self.options.keep_all_tokens)
 
         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -322,7 +322,7 @@ class Lark(Serialize):
         self._callbacks = None
         # we don't need these callbacks if we aren't building a tree
         if self.options.ambiguity != 'forest':
-            self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders)
+            self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders)
             self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer)
 
     def _build_parser(self):
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index cd36e4b..156a1be 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -650,7 +650,7 @@ class Grammar:
 
 
 _imported_grammars = {}
-def import_grammar(grammar_path, re_, base_paths=[]):
+def import_grammar(grammar_path, loader, base_paths=[]):
     if grammar_path not in _imported_grammars:
         import_paths = base_paths + IMPORT_PATHS
         for import_path in import_paths:
@@ -658,7 +658,7 @@ def import_grammar(grammar_path, re_, base_paths=[]):
             joined_path = os.path.join(import_path, grammar_path)
             with open(joined_path, encoding='utf8') as f:
                 text = f.read()
-            grammar = load_grammar(text, joined_path, re_)
+            grammar = loader.load_grammar(text, joined_path)
             _imported_grammars[grammar_path] = grammar
             break
     else:
@@ -803,7 +803,7 @@ class GrammarLoader:
         ('%ignore expects a value', ['%ignore %import\n']),
     ]
 
-    def __init__(self, re_module):
+    def __init__(self, re_module, always_keep_all_tokens):
        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
 
         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
@@ -816,6 +816,7 @@ class GrammarLoader:
 
         self.canonize_tree = CanonizeTree()
         self.re_module = re_module
+        self.always_keep_all_tokens = always_keep_all_tokens
 
     def load_grammar(self, grammar_text, grammar_name='<?>'):
         "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
@@ -901,7 +902,7 @@ class GrammarLoader:
         # import grammars
         for dotted_path, (base_paths, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = import_grammar(grammar_path, self.re_module, base_paths=base_paths)
+            g = import_grammar(grammar_path, self, base_paths=base_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)
 
             term_defs += new_td
@@ -946,7 +947,10 @@ class GrammarLoader:
         rules = rule_defs
 
         rule_names = {}
-        for name, params, _x, _o in rules:
+        for name, params, _x, option in rules:
+            if self.always_keep_all_tokens: # We need to do this somewhere. Might as well prevent an additional loop
+                option.keep_all_tokens = True
+
             if name.startswith('__'):
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
             if name in rule_names:
@@ -981,5 +985,5 @@ class GrammarLoader:
 
 
 
-def load_grammar(grammar, source, re_):
-    return GrammarLoader(re_).load_grammar(grammar, source)
+def load_grammar(grammar, source, re_, always_keep_all_tokens):
+    return GrammarLoader(re_, always_keep_all_tokens).load_grammar(grammar, source)
diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py
index 8b81d29..a4c4330 100644
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -299,10 +299,9 @@ def apply_visit_wrapper(func, name, wrapper):
 
 
 class ParseTreeBuilder:
-    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False):
+    def __init__(self, rules, tree_class, propagate_positions=False, ambiguous=False, maybe_placeholders=False):
         self.tree_class = tree_class
         self.propagate_positions = propagate_positions
-        self.always_keep_all_tokens = keep_all_tokens
         self.ambiguous = ambiguous
         self.maybe_placeholders = maybe_placeholders
 
@@ -311,7 +310,7 @@ class ParseTreeBuilder:
     def _init_builders(self, rules):
         for rule in rules:
             options = rule.options
-            keep_all_tokens = self.always_keep_all_tokens or options.keep_all_tokens
+            keep_all_tokens = options.keep_all_tokens
             expand_single_child = options.expand1
 
             wrapper_chain = list(filter(None, [
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 83336c5..d97c810 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -2041,6 +2041,10 @@ def _make_parser_test(LEXER, PARSER):
         # Anonymous tokens shouldn't count
         p = _Lark("""start: ["a"] ["b"] ["c"] """, maybe_placeholders=True)
         self.assertEqual(p.parse("").children, [])
+
+        # Unless keep_all_tokens=True
+        p = _Lark("""start: ["a"] ["b"] ["c"] """, maybe_placeholders=True, keep_all_tokens=True)
+        self.assertEqual(p.parse("").children, [None, None, None])
 
         # All invisible constructs shouldn't count
         p = _Lark("""start: [A] ["b"] [_c] ["e" "f" _c]

From ab5bc3f47e1a83f2c420fd8bfb4108719baa8d14 Mon Sep 17 00:00:00 2001
From: MegaIng1
Date: Tue, 6 Oct 2020 02:32:25 +0200
Subject: [PATCH 2/3] Improved comment

---
 lark/load_grammar.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 156a1be..08990f2 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -948,7 +948,8 @@ class GrammarLoader:
 
         rule_names = {}
         for name, params, _x, option in rules:
-            if self.always_keep_all_tokens: # We need to do this somewhere. Might as well prevent an additional loop
+            # We can't just not throw away the tokens later; we need option.keep_all_tokens to correctly generate maybe_placeholders
+            if self.always_keep_all_tokens:
                 option.keep_all_tokens = True
 
             if name.startswith('__'):

From e4d73526d4eec4f049fd7737e6f3f813c82aa357 Mon Sep 17 00:00:00 2001
From: Erez Sh
Date: Tue, 6 Oct 2020 18:19:52 +0300
Subject: [PATCH 3/3] Refactor

---
 lark/lark.py         |  8 +++++++-
 lark/load_grammar.py | 47 ++++++++++++++++++++++-----------------------
 2 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/lark/lark.py b/lark/lark.py
index 835a6eb..770b821 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -335,7 +335,13 @@ class Lark(Serialize):
         self._callbacks = None
         # we don't need these callbacks if we aren't building a tree
         if self.options.ambiguity != 'forest':
-            self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders)
+            self._parse_tree_builder = ParseTreeBuilder(
+                self.rules,
+                self.options.tree_class or Tree,
+                self.options.propagate_positions,
+                self.options.parser!='lalr' and self.options.ambiguity=='explicit',
+                self.options.maybe_placeholders
+            )
             self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer)
 
     def _build_parser(self):
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 9fb49f6..d039638 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -650,22 +650,6 @@ class Grammar:
 
 
 _imported_grammars = {}
-def import_grammar(grammar_path, loader, base_paths=[]):
-    if grammar_path not in _imported_grammars:
-        import_paths = base_paths + IMPORT_PATHS
-        for import_path in import_paths:
-            with suppress(IOError):
-                joined_path = os.path.join(import_path, grammar_path)
-                with open(joined_path, encoding='utf8') as f:
-                    text = f.read()
-                grammar = loader.load_grammar(text, joined_path)
-                _imported_grammars[grammar_path] = grammar
-                break
-        else:
-            open(grammar_path, encoding='utf8')
-            assert False
-
-    return _imported_grammars[grammar_path]
 
 def import_from_grammar_into_namespace(grammar, namespace, aliases):
     """Returns all rules and terminals of grammar, prepended
@@ -803,7 +787,7 @@ class GrammarLoader:
         ('%ignore expects a value', ['%ignore %import\n']),
     ]
 
-    def __init__(self, re_module, always_keep_all_tokens):
+    def __init__(self, re_module, global_keep_all_tokens):
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
 
         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
@@ -816,7 +800,24 @@ class GrammarLoader:
 
         self.canonize_tree = CanonizeTree()
         self.re_module = re_module
-        self.always_keep_all_tokens = always_keep_all_tokens
+        self.global_keep_all_tokens = global_keep_all_tokens
+
+    def import_grammar(self, grammar_path, base_paths=[]):
+        if grammar_path not in _imported_grammars:
+            import_paths = base_paths + IMPORT_PATHS
+            for import_path in import_paths:
+                with suppress(IOError):
+                    joined_path = os.path.join(import_path, grammar_path)
+                    with open(joined_path, encoding='utf8') as f:
+                        text = f.read()
+                    grammar = self.load_grammar(text, joined_path)
+                    _imported_grammars[grammar_path] = grammar
+                    break
+            else:
+                open(grammar_path, encoding='utf8')  # Force a file not found error
+                assert False
+
+        return _imported_grammars[grammar_path]
 
     def load_grammar(self, grammar_text, grammar_name='<?>'):
         "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
@@ -902,7 +903,7 @@ class GrammarLoader:
         # import grammars
         for dotted_path, (base_paths, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = import_grammar(grammar_path, self, base_paths=base_paths)
+            g = self.import_grammar(grammar_path, base_paths=base_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)
 
             term_defs += new_td
@@ -949,9 +950,9 @@ class GrammarLoader:
         rule_names = {}
         for name, params, _x, option in rules:
             # We can't just not throw away the tokens later; we need option.keep_all_tokens to correctly generate maybe_placeholders
-            if self.always_keep_all_tokens:
+            if self.global_keep_all_tokens:
                 option.keep_all_tokens = True
-
+
             if name.startswith('__'):
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
             if name in rule_names:
@@ -986,5 +987,5 @@ class GrammarLoader:
 
 
 
-def load_grammar(grammar, source, re_, always_keep_all_tokens):
-    return GrammarLoader(re_, always_keep_all_tokens).load_grammar(grammar, source)
+def load_grammar(grammar, source, re_, global_keep_all_tokens):
+    return GrammarLoader(re_, global_keep_all_tokens).load_grammar(grammar, source)