
Refactored reconstructor out into tree_matcher. Functionality should stay the same.

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.10.0
Erez Sh 5 years ago
commit afde561ac3
2 changed files with 195 additions and 126 deletions:
  1. lark/reconstruct.py   +18  -126
  2. lark/tree_matcher.py  +177 -0

lark/reconstruct.py  +18 -126

@@ -1,18 +1,13 @@
-import unicodedata
 """Reconstruct text from a tree, based on Lark grammar"""

 from collections import defaultdict
+import unicodedata

 from .tree import Tree
 from .visitors import Transformer_InPlace
-from .common import ParserConf
 from .lexer import Token, PatternStr
-from .parsers import earley
-from .grammar import Rule, Terminal, NonTerminal
-
-
+from .grammar import Terminal, NonTerminal

-def is_discarded_terminal(t):
-    return t.is_term and t.filter_out
+from .tree_matcher import TreeMatcher, is_discarded_terminal

 def is_iter_empty(i):
     try:
@@ -61,138 +56,35 @@ class WriteTokensTransformer(Transformer_InPlace):
         return to_write


-class MatchTree(Tree):
-    pass
-
-class MakeMatchTree:
-    def __init__(self, name, expansion):
-        self.name = name
-        self.expansion = expansion
-
-    def __call__(self, args):
-        t = MatchTree(self.name, args)
-        t.meta.match_tree = True
-        t.meta.orig_expansion = self.expansion
-        return t
-
-def best_from_group(seq, group_key, cmp_key):
-    d = {}
-    for item in seq:
-        key = group_key(item)
-        if key in d:
-            v1 = cmp_key(item)
-            v2 = cmp_key(d[key])
-            if v2 > v1:
-                d[key] = item
-        else:
-            d[key] = item
-    return list(d.values())
-
-
-def make_recons_rule(origin, expansion, old_expansion):
-    return Rule(origin, expansion, alias=MakeMatchTree(origin.name, old_expansion))
-
-def make_recons_rule_to_term(origin, term):
-    return make_recons_rule(origin, [Terminal(term.name)], [term])
-
-def _isalnum(x):
-    # Categories defined here: https://www.python.org/dev/peps/pep-3131/
-    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']
-
-class Reconstructor:
+class Reconstructor(TreeMatcher):
     """
     A Reconstructor that will, given a full parse Tree, generate source code.
-    Pass `term_subs`, a dictionary of [Terminal name as str] to [output text as str]
-    to say what discarded Terminals should be written as.
-    """
-    def __init__(self, parser, term_subs=None):
-        # XXX TODO calling compile twice returns different results!
-        assert parser.options.maybe_placeholders == False
-        if term_subs is None:
-            term_subs = {}
-        tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start)
-
-        self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}, term_subs)
-        self.rules_for_root = defaultdict(list)
-
-        self.rules = list(self._build_recons_rules(rules))
-        self.rules.reverse()
+    Note:
+        The reconstructor cannot generate values from regexps. If you need to produce discarded
+        regexes, such as newlines, use `term_subs` and provide default values for them.
-
-        # Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
-        self.rules = best_from_group(self.rules, lambda r: r, lambda r: -len(r.expansion))
-
-        self.rules.sort(key=lambda r: len(r.expansion))
-        self.parser = parser
-        self._parser_cache = {}
-
-    def _build_recons_rules(self, rules):
-        expand1s = {r.origin for r in rules if r.options.expand1}
-
-        aliases = defaultdict(list)
-        for r in rules:
-            if r.alias:
-                aliases[r.origin].append( r.alias )
-
-        rule_names = {r.origin for r in rules}
-        nonterminals = {sym for sym in rule_names
-                        if sym.name.startswith('_') or sym in expand1s or sym in aliases }
-
-        seen = set()
-        for r in rules:
-            recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
-                          for sym in r.expansion if not is_discarded_terminal(sym)]
-
-            # Skip self-recursive constructs
-            if recons_exp == [r.origin] and r.alias is None:
-                continue
-
-            sym = NonTerminal(r.alias) if r.alias else r.origin
-            rule = make_recons_rule(sym, recons_exp, r.expansion)
-
-            if sym in expand1s and len(recons_exp) != 1:
-                self.rules_for_root[sym.name].append(rule)
-
-            if sym.name not in seen:
-                yield make_recons_rule_to_term(sym, sym)
-                seen.add(sym.name)
-            else:
-                if sym.name.startswith('_') or sym in expand1s:
-                    yield rule
-                else:
-                    self.rules_for_root[sym.name].append(rule)
+    Parameters:
+        parser: a Lark instance
+        term_subs: a dictionary of [Terminal name as str] to [output text as str]
+    """

-        for origin, rule_aliases in aliases.items():
-            for alias in rule_aliases:
-                yield make_recons_rule_to_term(origin, NonTerminal(alias))
-            yield make_recons_rule_to_term(origin, origin)
+    def __init__(self, parser, term_subs=None):
+        TreeMatcher.__init__(self, parser)

-    def _match(self, term, token):
-        if isinstance(token, Tree):
-            return Terminal(token.data) == term
-        elif isinstance(token, Token):
-            return term == Terminal(token.type)
-        assert False
+        self.write_tokens = WriteTokensTransformer({t.name:t for t in self.tokens}, term_subs or {})

     def _reconstruct(self, tree):
-        # TODO: ambiguity?
-        try:
-            parser = self._parser_cache[tree.data]
-        except KeyError:
-            rules = self.rules + best_from_group(
-                self.rules_for_root[tree.data], lambda r: r, lambda r: -len(r.expansion)
-            )
-
-            rules.sort(key=lambda r: len(r.expansion))
-
-            callbacks = {rule: rule.alias for rule in rules}  # TODO pass callbacks through dict, instead of alias?
-            parser = earley.Parser(ParserConf(rules, callbacks, [tree.data]), self._match, resolve_ambiguity=True)
-            self._parser_cache[tree.data] = parser
-
-        unreduced_tree = parser.parse(tree.children, tree.data)   # find a full derivation
-        assert unreduced_tree.data == tree.data
+        unreduced_tree = self.match_tree(tree, tree.data)

         res = self.write_tokens.transform(unreduced_tree)
         for item in res:
             if isinstance(item, Tree):
                 # TODO use orig_expansion.rulename to support templates
                 for x in self._reconstruct(item):
                     yield x
             else:

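For context, a minimal usage sketch (not part of this commit): the Reconstructor's public interface is unchanged by the refactor, only its internals now come from TreeMatcher. The grammar and input below are invented for illustration.

# Usage sketch, assuming only lark's existing Lark/Reconstructor entry points.
from lark import Lark
from lark.reconstruct import Reconstructor

grammar = """
    start: pair+
    pair: NAME "=" NAME ";"
    NAME: /[a-z]+/
    %ignore " "
"""

parser = Lark(grammar, maybe_placeholders=False)   # Reconstructor asserts this option
tree = parser.parse("a = b; c = d;")

# Produces text that parses back into the same tree (whitespace may differ).
print(Reconstructor(parser).reconstruct(tree))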
lark/tree_matcher.py  +177 -0 (new file)

@@ -0,0 +1,177 @@
"""Tree matcher based on Lark grammar"""

import re
from collections import defaultdict

from lark import Tree, Token
from lark.common import ParserConf
from lark.parsers import earley
from lark.grammar import Rule, Terminal, NonTerminal


def is_discarded_terminal(t):
    return t.is_term and t.filter_out


class _MakeTreeMatch:
    def __init__(self, name, expansion):
        self.name = name
        self.expansion = expansion

    def __call__(self, args):
        t = Tree(self.name, args)
        t.meta.match_tree = True
        t.meta.orig_expansion = self.expansion
        return t


def _best_from_group(seq, group_key, cmp_key):
    d = {}
    for item in seq:
        key = group_key(item)
        if key in d:
            v1 = cmp_key(item)
            v2 = cmp_key(d[key])
            if v2 > v1:
                d[key] = item
        else:
            d[key] = item
    return list(d.values())


def _best_rules_from_group(rules):
    rules = _best_from_group(rules, lambda r: r, lambda r: -len(r.expansion))
    rules.sort(key=lambda r: len(r.expansion))
    return rules


def _match(term, token):
    if isinstance(token, Tree):
        name, _args = parse_rulename(term.name)
        return token.data == name
    elif isinstance(token, Token):
        return term == Terminal(token.type)
    assert False


def make_recons_rule(origin, expansion, old_expansion):
    return Rule(origin, expansion, alias=_MakeTreeMatch(origin.name, old_expansion))


def make_recons_rule_to_term(origin, term):
    return make_recons_rule(origin, [Terminal(term.name)], [term])


def parse_rulename(s):
    "Parse rule names that may contain a template syntax (like rule{a, b, ...})"
    name, args_str = re.match(r'(\w+)(?:{(.+)})?', s).groups()
    args = args_str and [a.strip() for a in args_str.split(',')]
    return name, args


class TreeMatcher:
    """Match the elements of a tree node, based on an ontology
    provided by a Lark grammar.

    Supports templates and inlined rules (`rule{a, b,..}` and `_rule`)

    Initialize with an instance of Lark.
    """

    def __init__(self, parser):
        # XXX TODO calling compile twice returns different results!
        assert parser.options.maybe_placeholders == False
        self.tokens, rules, _extra = parser.grammar.compile(parser.options.start)

        self.rules_for_root = defaultdict(list)

        self.rules = list(self._build_recons_rules(rules))
        self.rules.reverse()

        # Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
        self.rules = _best_rules_from_group(self.rules)

        self.parser = parser
        self._parser_cache = {}

    def _build_recons_rules(self, rules):
        "Convert tree-parsing/construction rules to tree-matching rules"
        expand1s = {r.origin for r in rules if r.options.expand1}

        aliases = defaultdict(list)
        for r in rules:
            if r.alias:
                aliases[r.origin].append(r.alias)

        rule_names = {r.origin for r in rules}
        nonterminals = {sym for sym in rule_names
                        if sym.name.startswith('_') or sym in expand1s or sym in aliases}

        seen = set()
        for r in rules:
            recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
                          for sym in r.expansion if not is_discarded_terminal(sym)]

            # Skip self-recursive constructs
            if recons_exp == [r.origin] and r.alias is None:
                continue

            sym = NonTerminal(r.alias) if r.alias else r.origin
            rule = make_recons_rule(sym, recons_exp, r.expansion)

            if sym in expand1s and len(recons_exp) != 1:
                self.rules_for_root[sym.name].append(rule)

            if sym.name not in seen:
                yield make_recons_rule_to_term(sym, sym)
                seen.add(sym.name)
            else:
                if sym.name.startswith('_') or sym in expand1s:
                    yield rule
                else:
                    self.rules_for_root[sym.name].append(rule)

        for origin, rule_aliases in aliases.items():
            for alias in rule_aliases:
                yield make_recons_rule_to_term(origin, NonTerminal(alias))
            yield make_recons_rule_to_term(origin, origin)

    def match_tree(self, tree, rulename):
        """Match the elements of `tree` to the symbols of rule `rulename`.

        Args:
            tree (Tree): the tree node to match
            rulename (str): the expected rule name, possibly with template syntax (e.g. "rule{a, b}")

        Returns:
            Tree: an unreduced tree that matches `rulename`

        Raises:
            UnexpectedToken: If no match was found.

        Note:
            It's the caller's responsibility to match the tree recursively.
        """
        if rulename:
            # validate
            name, _args = parse_rulename(rulename)
            assert tree.data == name
        else:
            rulename = tree.data

        # TODO: ambiguity?
        try:
            parser = self._parser_cache[rulename]
        except KeyError:
            rules = self.rules + _best_rules_from_group(self.rules_for_root[rulename])

            # TODO pass callbacks through dict, instead of alias?
            callbacks = {rule: rule.alias for rule in rules}
            conf = ParserConf(rules, callbacks, [rulename])
            parser = earley.Parser(conf, _match, resolve_ambiguity=True)
            self._parser_cache[rulename] = parser

        # find a full derivation
        unreduced_tree = parser.parse(tree.children, rulename)
        assert unreduced_tree.data == rulename
        return unreduced_tree

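For context, a rough sketch (not part of this commit) of driving the new TreeMatcher directly, the same way Reconstructor._reconstruct now does. The grammar and input are invented for illustration.

from lark import Lark
from lark.tree_matcher import TreeMatcher

parser = Lark("""
    start: pair ";"
    pair: NAME "=" NAME
    NAME: /[a-z]+/
    %ignore " "
""", maybe_placeholders=False)   # TreeMatcher asserts this option

tree = parser.parse("a = b;")
matcher = TreeMatcher(parser)

# Match the root's children against the 'start' rule. The result is an "unreduced"
# tree whose meta.orig_expansion records the rule's full expansion, including
# discarded terminals -- which is what lets WriteTokensTransformer re-insert them.
unreduced = matcher.match_tree(tree, 'start')
assert unreduced.data == 'start'
print(unreduced.meta.orig_expansion)

# parse_rulename splits the template syntax mentioned in the docstrings:
# parse_rulename("rule{a, b}") -> ("rule", ["a", "b"])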