diff --git a/lark/reconstruct.py b/lark/reconstruct.py
index 35e5994..e7cff31 100644
--- a/lark/reconstruct.py
+++ b/lark/reconstruct.py
@@ -1,18 +1,13 @@
-import unicodedata
+"""Reconstruct text from a tree, based on Lark grammar"""
 
-from collections import defaultdict
+import unicodedata
 
 from .tree import Tree
 from .visitors import Transformer_InPlace
-from .common import ParserConf
 from .lexer import Token, PatternStr
-from .parsers import earley
-from .grammar import Rule, Terminal, NonTerminal
-
-
+from .grammar import Terminal, NonTerminal
 
-def is_discarded_terminal(t):
-    return t.is_term and t.filter_out
+from .tree_matcher import TreeMatcher, is_discarded_terminal
 
 def is_iter_empty(i):
     try:
@@ -61,138 +56,35 @@ class WriteTokensTransformer(Transformer_InPlace):
         return to_write
 
 
-class MatchTree(Tree):
-    pass
-
-class MakeMatchTree:
-    def __init__(self, name, expansion):
-        self.name = name
-        self.expansion = expansion
-
-    def __call__(self, args):
-        t = MatchTree(self.name, args)
-        t.meta.match_tree = True
-        t.meta.orig_expansion = self.expansion
-        return t
-
-def best_from_group(seq, group_key, cmp_key):
-    d = {}
-    for item in seq:
-        key = group_key(item)
-        if key in d:
-            v1 = cmp_key(item)
-            v2 = cmp_key(d[key])
-            if v2 > v1:
-                d[key] = item
-        else:
-            d[key] = item
-    return list(d.values())
-
-
-def make_recons_rule(origin, expansion, old_expansion):
-    return Rule(origin, expansion, alias=MakeMatchTree(origin.name, old_expansion))
-
-def make_recons_rule_to_term(origin, term):
-    return make_recons_rule(origin, [Terminal(term.name)], [term])
-
 def _isalnum(x):
     # Categories defined here: https://www.python.org/dev/peps/pep-3131/
     return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']
 
 
-class Reconstructor:
+class Reconstructor(TreeMatcher):
     """
     A Reconstructor that will, given a full parse Tree, generate source code.
 
-    Pass `term_subs`, a dictionary of [Terminal name as str] to [output text as str]
-    to say what discarded Terminals should be written as.
-    """
-    def __init__(self, parser, term_subs=None):
-        # XXX TODO calling compile twice returns different results!
-        assert parser.options.maybe_placeholders == False
-        if term_subs is None:
-            term_subs = {}
-        tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start)
-
-        self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}, term_subs)
-        self.rules_for_root = defaultdict(list)
-        self.rules = list(self._build_recons_rules(rules))
-        self.rules.reverse()
+    Note:
+        The reconstructor cannot generate values from regexps. To produce discarded terminals
+        that are defined by regexps, such as newlines, use `term_subs` to provide default values.
 
-        # Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
-        self.rules = best_from_group(self.rules, lambda r: r, lambda r: -len(r.expansion))
-
-        self.rules.sort(key=lambda r: len(r.expansion))
-        self.parser = parser
-        self._parser_cache = {}
-
-    def _build_recons_rules(self, rules):
-        expand1s = {r.origin for r in rules if r.options.expand1}
-
-        aliases = defaultdict(list)
-        for r in rules:
-            if r.alias:
-                aliases[r.origin].append( r.alias )
-
-        rule_names = {r.origin for r in rules}
-        nonterminals = {sym for sym in rule_names
-                        if sym.name.startswith('_') or sym in expand1s or sym in aliases }
-
-        seen = set()
-        for r in rules:
-            recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
-                          for sym in r.expansion if not is_discarded_terminal(sym)]
-
-            # Skip self-recursive constructs
-            if recons_exp == [r.origin] and r.alias is None:
-                continue
-
-            sym = NonTerminal(r.alias) if r.alias else r.origin
-            rule = make_recons_rule(sym, recons_exp, r.expansion)
-
-            if sym in expand1s and len(recons_exp) != 1:
-                self.rules_for_root[sym.name].append(rule)
-
-                if sym.name not in seen:
-                    yield make_recons_rule_to_term(sym, sym)
-                    seen.add(sym.name)
-            else:
-                if sym.name.startswith('_') or sym in expand1s:
-                    yield rule
-                else:
-                    self.rules_for_root[sym.name].append(rule)
+    Parameters:
+        parser: a Lark instance
+        term_subs: a dictionary of [Terminal name as str] to [output text as str]
+    """
 
-        for origin, rule_aliases in aliases.items():
-            for alias in rule_aliases:
-                yield make_recons_rule_to_term(origin, NonTerminal(alias))
-            yield make_recons_rule_to_term(origin, origin)
+    def __init__(self, parser, term_subs=None):
+        TreeMatcher.__init__(self, parser)
 
-    def _match(self, term, token):
-        if isinstance(token, Tree):
-            return Terminal(token.data) == term
-        elif isinstance(token, Token):
-            return term == Terminal(token.type)
-        assert False
+        self.write_tokens = WriteTokensTransformer({t.name:t for t in self.tokens}, term_subs or {})
 
     def _reconstruct(self, tree):
-        # TODO: ambiguity?
-        try:
-            parser = self._parser_cache[tree.data]
-        except KeyError:
-            rules = self.rules + best_from_group(
-                self.rules_for_root[tree.data], lambda r: r, lambda r: -len(r.expansion)
-            )
-
-            rules.sort(key=lambda r: len(r.expansion))
-
-            callbacks = {rule: rule.alias for rule in rules}  # TODO pass callbacks through dict, instead of alias?
-            parser = earley.Parser(ParserConf(rules, callbacks, [tree.data]), self._match, resolve_ambiguity=True)
-            self._parser_cache[tree.data] = parser
-
-        unreduced_tree = parser.parse(tree.children, tree.data)   # find a full derivation
-        assert unreduced_tree.data == tree.data
+        unreduced_tree = self.match_tree(tree, tree.data)
+
         res = self.write_tokens.transform(unreduced_tree)
         for item in res:
            if isinstance(item, Tree):
+                # TODO use orig_expansion.rulename to support templates
                for x in self._reconstruct(item):
                    yield x
            else:
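A minimal usage sketch of the refactored `Reconstructor` (not part of the diff; the grammar and input below are invented for illustration, while `Lark`, `Reconstructor.reconstruct` and the `maybe_placeholders=False` requirement come from the code above):

    from lark import Lark
    from lark.reconstruct import Reconstructor

    # maybe_placeholders=False is required (asserted in TreeMatcher.__init__)
    parser = Lark(r"""
        start: pair ("," pair)*
        pair: WORD ":" WORD
        WORD: /[a-z]+/
        %ignore " "
    """, maybe_placeholders=False)

    tree = parser.parse("a: b, c: d")

    # The anonymous ":" and "," terminals are filtered out of the tree, but can
    # be written back because their patterns are plain strings. Discarded
    # terminals defined by regexps would need `term_subs` instead.
    text = Reconstructor(parser).reconstruct(tree)
    assert parser.parse(text) == tree

Note that the ignored whitespace is not reproduced, so the output round-trips as a tree, not byte-for-byte.
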
diff --git a/lark/tree_matcher.py b/lark/tree_matcher.py
new file mode 100644
index 0000000..38ac87f
--- /dev/null
+++ b/lark/tree_matcher.py
@@ -0,0 +1,177 @@
+"""Tree matcher based on Lark grammar"""
+
+import re
+from collections import defaultdict
+
+from . import Tree, Token
+from .common import ParserConf
+from .parsers import earley
+from .grammar import Rule, Terminal, NonTerminal
+
+
+def is_discarded_terminal(t):
+    return t.is_term and t.filter_out
+
+
+class _MakeTreeMatch:
+    def __init__(self, name, expansion):
+        self.name = name
+        self.expansion = expansion
+
+    def __call__(self, args):
+        t = Tree(self.name, args)
+        t.meta.match_tree = True
+        t.meta.orig_expansion = self.expansion
+        return t
+
+
+def _best_from_group(seq, group_key, cmp_key):
+    d = {}
+    for item in seq:
+        key = group_key(item)
+        if key in d:
+            v1 = cmp_key(item)
+            v2 = cmp_key(d[key])
+            if v2 > v1:
+                d[key] = item
+        else:
+            d[key] = item
+    return list(d.values())
+
+
+def _best_rules_from_group(rules):
+    rules = _best_from_group(rules, lambda r: r, lambda r: -len(r.expansion))
+    rules.sort(key=lambda r: len(r.expansion))
+    return rules
+
+
+def _match(term, token):
+    if isinstance(token, Tree):
+        name, _args = parse_rulename(term.name)
+        return token.data == name
+    elif isinstance(token, Token):
+        return term == Terminal(token.type)
+    assert False
+
+
+def make_recons_rule(origin, expansion, old_expansion):
+    return Rule(origin, expansion, alias=_MakeTreeMatch(origin.name, old_expansion))
+
+
+def make_recons_rule_to_term(origin, term):
+    return make_recons_rule(origin, [Terminal(term.name)], [term])
+
+
+def parse_rulename(s):
+    "Parse rule names that may contain a template syntax (like rule{a, b, ...})"
+    name, args_str = re.match(r'(\w+)(?:{(.+)})?', s).groups()
+    args = args_str and [a.strip() for a in args_str.split(',')]
+    return name, args
+
+
+class TreeMatcher:
+    """Match the elements of a tree node, based on an ontology
+    provided by a Lark grammar.
+
+    Supports templates and inlined rules (`rule{a, b, ...}` and `_rule`)
+
+    Initialize with an instance of Lark.
+    """
+
+    def __init__(self, parser):
+        # XXX TODO calling compile twice returns different results!
+        assert parser.options.maybe_placeholders == False
+        self.tokens, rules, _extra = parser.grammar.compile(parser.options.start)
+
+        self.rules_for_root = defaultdict(list)
+
+        self.rules = list(self._build_recons_rules(rules))
+        self.rules.reverse()
+
+        # Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
+        self.rules = _best_rules_from_group(self.rules)
+
+        self.parser = parser
+        self._parser_cache = {}
+
+    def _build_recons_rules(self, rules):
+        "Convert tree-parsing/construction rules to tree-matching rules"
+        expand1s = {r.origin for r in rules if r.options.expand1}
+
+        aliases = defaultdict(list)
+        for r in rules:
+            if r.alias:
+                aliases[r.origin].append(r.alias)
+
+        rule_names = {r.origin for r in rules}
+        nonterminals = {sym for sym in rule_names
+                        if sym.name.startswith('_') or sym in expand1s or sym in aliases}
+
+        seen = set()
+        for r in rules:
+            recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
+                          for sym in r.expansion if not is_discarded_terminal(sym)]
+
+            # Skip self-recursive constructs
+            if recons_exp == [r.origin] and r.alias is None:
+                continue
+
+            sym = NonTerminal(r.alias) if r.alias else r.origin
+            rule = make_recons_rule(sym, recons_exp, r.expansion)
+
+            if sym in expand1s and len(recons_exp) != 1:
+                self.rules_for_root[sym.name].append(rule)
+
+                if sym.name not in seen:
+                    yield make_recons_rule_to_term(sym, sym)
+                    seen.add(sym.name)
+            else:
+                if sym.name.startswith('_') or sym in expand1s:
+                    yield rule
+                else:
+                    self.rules_for_root[sym.name].append(rule)
+
+        for origin, rule_aliases in aliases.items():
+            for alias in rule_aliases:
+                yield make_recons_rule_to_term(origin, NonTerminal(alias))
+            yield make_recons_rule_to_term(origin, origin)
+
+    def match_tree(self, tree, rulename):
+        """Match the elements of `tree` to the symbols of rule `rulename`.
+
+        Args:
+            tree (Tree): the tree node to match
+            rulename (str): the expected rule name, which may include template args ("rule{a, b}")
+
+        Returns:
+            Tree: an unreduced tree that matches `rulename`
+
+        Raises:
+            UnexpectedToken: If no match was found.
+
+        Note:
+            It's the caller's responsibility to match the tree recursively.
+        """
+        if rulename:
+            # validate
+            name, _args = parse_rulename(rulename)
+            assert tree.data == name
+        else:
+            rulename = tree.data
+
+        # TODO: ambiguity?
+        try:
+            parser = self._parser_cache[rulename]
+        except KeyError:
+            rules = self.rules + _best_rules_from_group(self.rules_for_root[rulename])
+
+            # TODO pass callbacks through dict, instead of alias?
+            callbacks = {rule: rule.alias for rule in rules}
+            conf = ParserConf(rules, callbacks, [rulename])
+            parser = earley.Parser(conf, _match, resolve_ambiguity=True)
+            self._parser_cache[rulename] = parser
+
+        # find a full derivation
+        unreduced_tree = parser.parse(tree.children, rulename)
+        assert unreduced_tree.data == rulename
+        return unreduced_tree
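And a sketch of using the new `TreeMatcher` base class directly (again with an invented grammar; `parse_rulename` and `match_tree` are the functions defined in the new module above):

    from lark import Lark
    from lark.tree_matcher import TreeMatcher, parse_rulename

    # parse_rulename splits the template syntax from the base rule name
    assert parse_rulename("rule{a, b}") == ("rule", ["a", "b"])
    assert parse_rulename("rule") == ("rule", None)

    parser = Lark(r"""
        start: greeting
        greeting: "hello" NAME
        NAME: /\w+/
        %ignore " "
    """, maybe_placeholders=False)

    matcher = TreeMatcher(parser)
    tree = parser.parse("hello world")

    # Match the node's children against the 'start' rule. The result is an
    # "unreduced" tree whose meta carries orig_expansion, which is what
    # Reconstructor's WriteTokensTransformer consumes.
    unreduced = matcher.match_tree(tree, "start")
    assert unreduced.data == "start"
    assert unreduced.meta.orig_expansion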