
Merge pull request #667 from lark-parser/tree_matcher

Refactored reconstructor out into tree_matcher
Erez Shinan 5 years ago
committed by GitHub
commit a88d3b0ee2
2 changed files with 195 additions and 126 deletions
  1. lark/reconstruct.py (+18 -126)
  2. lark/tree_matcher.py (+177 -0)

lark/reconstruct.py (+18 -126)

@@ -1,18 +1,13 @@
import unicodedata
"""Reconstruct text from a tree, based on Lark grammar"""

from collections import defaultdict
import unicodedata

from .tree import Tree
from .visitors import Transformer_InPlace
from .common import ParserConf
from .lexer import Token, PatternStr
from .parsers import earley
from .grammar import Rule, Terminal, NonTerminal


from .grammar import Terminal, NonTerminal

def is_discarded_terminal(t):
    return t.is_term and t.filter_out
from .tree_matcher import TreeMatcher, is_discarded_terminal

def is_iter_empty(i):
    try:
@@ -61,138 +56,35 @@ class WriteTokensTransformer(Transformer_InPlace):
        return to_write


class MatchTree(Tree):
    pass

class MakeMatchTree:
    def __init__(self, name, expansion):
        self.name = name
        self.expansion = expansion

    def __call__(self, args):
        t = MatchTree(self.name, args)
        t.meta.match_tree = True
        t.meta.orig_expansion = self.expansion
        return t

def best_from_group(seq, group_key, cmp_key):
    d = {}
    for item in seq:
        key = group_key(item)
        if key in d:
            v1 = cmp_key(item)
            v2 = cmp_key(d[key])
            if v2 > v1:
                d[key] = item
        else:
            d[key] = item
    return list(d.values())


def make_recons_rule(origin, expansion, old_expansion):
    return Rule(origin, expansion, alias=MakeMatchTree(origin.name, old_expansion))

def make_recons_rule_to_term(origin, term):
    return make_recons_rule(origin, [Terminal(term.name)], [term])

def _isalnum(x):
    # Categories defined here: https://www.python.org/dev/peps/pep-3131/
    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']

class Reconstructor:
class Reconstructor(TreeMatcher):
"""
A Reconstructor that will, given a full parse Tree, generate source code.
Pass `term_subs`, a dictionary of [Terminal name as str] to [output text as str]
to say what discarded Terminals should be written as.
"""
def __init__(self, parser, term_subs=None):
# XXX TODO calling compile twice returns different results!
assert parser.options.maybe_placeholders == False
if term_subs is None:
term_subs = {}
tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start)

self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}, term_subs)
self.rules_for_root = defaultdict(list)

self.rules = list(self._build_recons_rules(rules))
self.rules.reverse()
Note:
The reconstructor cannot generate values from regexps. If you need to produce discarded
regexes, such as newlines, use `term_subs` and provide default values for them.

# Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
self.rules = best_from_group(self.rules, lambda r: r, lambda r: -len(r.expansion))

self.rules.sort(key=lambda r: len(r.expansion))
self.parser = parser
self._parser_cache = {}

def _build_recons_rules(self, rules):
expand1s = {r.origin for r in rules if r.options.expand1}

aliases = defaultdict(list)
for r in rules:
if r.alias:
aliases[r.origin].append( r.alias )

rule_names = {r.origin for r in rules}
nonterminals = {sym for sym in rule_names
if sym.name.startswith('_') or sym in expand1s or sym in aliases }

seen = set()
for r in rules:
recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
for sym in r.expansion if not is_discarded_terminal(sym)]

# Skip self-recursive constructs
if recons_exp == [r.origin] and r.alias is None:
continue

sym = NonTerminal(r.alias) if r.alias else r.origin
rule = make_recons_rule(sym, recons_exp, r.expansion)

if sym in expand1s and len(recons_exp) != 1:
self.rules_for_root[sym.name].append(rule)

if sym.name not in seen:
yield make_recons_rule_to_term(sym, sym)
seen.add(sym.name)
else:
if sym.name.startswith('_') or sym in expand1s:
yield rule
else:
self.rules_for_root[sym.name].append(rule)
    Parameters:
        parser: a Lark instance
        term_subs: a dictionary of [Terminal name as str] to [output text as str]
    """

        for origin, rule_aliases in aliases.items():
            for alias in rule_aliases:
                yield make_recons_rule_to_term(origin, NonTerminal(alias))
            yield make_recons_rule_to_term(origin, origin)
    def __init__(self, parser, term_subs=None):
        TreeMatcher.__init__(self, parser)

    def _match(self, term, token):
        if isinstance(token, Tree):
            return Terminal(token.data) == term
        elif isinstance(token, Token):
            return term == Terminal(token.type)
        assert False
        self.write_tokens = WriteTokensTransformer({t.name:t for t in self.tokens}, term_subs or {})

    def _reconstruct(self, tree):
        # TODO: ambiguity?
        try:
            parser = self._parser_cache[tree.data]
        except KeyError:
            rules = self.rules + best_from_group(
                self.rules_for_root[tree.data], lambda r: r, lambda r: -len(r.expansion)
            )

            rules.sort(key=lambda r: len(r.expansion))

            callbacks = {rule: rule.alias for rule in rules}  # TODO pass callbacks through dict, instead of alias?
            parser = earley.Parser(ParserConf(rules, callbacks, [tree.data]), self._match, resolve_ambiguity=True)
            self._parser_cache[tree.data] = parser

        unreduced_tree = parser.parse(tree.children, tree.data)   # find a full derivation
        assert unreduced_tree.data == tree.data
        unreduced_tree = self.match_tree(tree, tree.data)

        res = self.write_tokens.transform(unreduced_tree)
        for item in res:
            if isinstance(item, Tree):
                # TODO use orig_expansion.rulename to support templates
                for x in self._reconstruct(item):
                    yield x
            else:
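
For context, a minimal usage sketch (Python) of the refactored Reconstructor. The grammar and input below are illustrative assumptions and not part of this PR; only the import path, the maybe_placeholders requirement, and the public reconstruct() call come from the existing lark API and the diff above.

    from lark import Lark
    from lark.reconstruct import Reconstructor

    # Illustrative grammar: punctuation is written as plain string terminals,
    # so the reconstructor can emit it again without needing term_subs.
    json_grammar = r"""
        ?start: value
        ?value: object | array | STRING | NUMBER
        object: "{" [pair ("," pair)*] "}"
        pair: STRING ":" value
        array: "[" [value ("," value)*] "]"
        STRING: /"[^"]*"/
        NUMBER: /-?\d+(\.\d+)?/
        WS: /\s+/
        %ignore WS
    """

    parser = Lark(json_grammar, maybe_placeholders=False)   # Reconstructor asserts this option
    tree = parser.parse('{"answer": 42, "list": [1, 2, 3]}')

    new_text = Reconstructor(parser).reconstruct(tree)
    print(new_text)   # same structure, though ignored whitespace is not restored

    # The regenerated text should parse back into an equal tree.
    assert parser.parse(new_text) == tree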


lark/tree_matcher.py (+177 -0)

@@ -0,0 +1,177 @@
"""Tree matcher based on Lark grammar"""

import re
from collections import defaultdict

from . import Tree, Token
from .common import ParserConf
from .parsers import earley
from .grammar import Rule, Terminal, NonTerminal


def is_discarded_terminal(t):
return t.is_term and t.filter_out


class _MakeTreeMatch:
def __init__(self, name, expansion):
self.name = name
self.expansion = expansion

def __call__(self, args):
t = Tree(self.name, args)
t.meta.match_tree = True
t.meta.orig_expansion = self.expansion
return t


def _best_from_group(seq, group_key, cmp_key):
d = {}
for item in seq:
key = group_key(item)
if key in d:
v1 = cmp_key(item)
v2 = cmp_key(d[key])
if v2 > v1:
d[key] = item
else:
d[key] = item
return list(d.values())


def _best_rules_from_group(rules):
rules = _best_from_group(rules, lambda r: r, lambda r: -len(r.expansion))
rules.sort(key=lambda r: len(r.expansion))
return rules


def _match(term, token):
if isinstance(token, Tree):
name, _args = parse_rulename(term.name)
return token.data == name
elif isinstance(token, Token):
return term == Terminal(token.type)
assert False


def make_recons_rule(origin, expansion, old_expansion):
return Rule(origin, expansion, alias=_MakeTreeMatch(origin.name, old_expansion))


def make_recons_rule_to_term(origin, term):
return make_recons_rule(origin, [Terminal(term.name)], [term])


def parse_rulename(s):
"Parse rule names that may contain a template syntax (like rule{a, b, ...})"
name, args_str = re.match(r'(\w+)(?:{(.+)})?', s).groups()
args = args_str and [a.strip() for a in args_str.split(',')]
return name, args


class TreeMatcher:
    """Match the elements of a tree node, based on an ontology
    provided by a Lark grammar.

    Supports templates and inlined rules (`rule{a, b, ...}` and `_rule`)

    Initialize with an instance of Lark.
    """

    def __init__(self, parser):
        # XXX TODO calling compile twice returns different results!
        assert parser.options.maybe_placeholders == False
        self.tokens, rules, _extra = parser.grammar.compile(parser.options.start)

        self.rules_for_root = defaultdict(list)

        self.rules = list(self._build_recons_rules(rules))
        self.rules.reverse()

        # Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
        self.rules = _best_rules_from_group(self.rules)

        self.parser = parser
        self._parser_cache = {}

    def _build_recons_rules(self, rules):
        "Convert tree-parsing/construction rules to tree-matching rules"
        expand1s = {r.origin for r in rules if r.options.expand1}

        aliases = defaultdict(list)
        for r in rules:
            if r.alias:
                aliases[r.origin].append(r.alias)

        rule_names = {r.origin for r in rules}
        nonterminals = {sym for sym in rule_names
                        if sym.name.startswith('_') or sym in expand1s or sym in aliases}

        seen = set()
        for r in rules:
            recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
                          for sym in r.expansion if not is_discarded_terminal(sym)]

            # Skip self-recursive constructs
            if recons_exp == [r.origin] and r.alias is None:
                continue

            sym = NonTerminal(r.alias) if r.alias else r.origin
            rule = make_recons_rule(sym, recons_exp, r.expansion)

            if sym in expand1s and len(recons_exp) != 1:
                self.rules_for_root[sym.name].append(rule)

                if sym.name not in seen:
                    yield make_recons_rule_to_term(sym, sym)
                    seen.add(sym.name)
            else:
                if sym.name.startswith('_') or sym in expand1s:
                    yield rule
                else:
                    self.rules_for_root[sym.name].append(rule)

        for origin, rule_aliases in aliases.items():
            for alias in rule_aliases:
                yield make_recons_rule_to_term(origin, NonTerminal(alias))
            yield make_recons_rule_to_term(origin, origin)

    def match_tree(self, tree, rulename):
        """Match the elements of `tree` to the symbols of rule `rulename`.

        Args:
            tree (Tree): the tree node to match
            rulename (str): the expected rule name (may include template arguments, e.g. ``rule{a, b}``)

        Returns:
            Tree: an unreduced tree that matches `rulename`

        Raises:
            UnexpectedToken: If no match was found.

        Note:
            It's the caller's responsibility to match the tree recursively.
        """
        if rulename:
            # validate
            name, _args = parse_rulename(rulename)
            assert tree.data == name
        else:
            rulename = tree.data

        # TODO: ambiguity?
        try:
            parser = self._parser_cache[rulename]
        except KeyError:
            rules = self.rules + _best_rules_from_group(self.rules_for_root[rulename])

            # TODO pass callbacks through dict, instead of alias?
            callbacks = {rule: rule.alias for rule in rules}
            conf = ParserConf(rules, callbacks, [rulename])
            parser = earley.Parser(conf, _match, resolve_ambiguity=True)
            self._parser_cache[rulename] = parser

        # find a full derivation
        unreduced_tree = parser.parse(tree.children, rulename)
        assert unreduced_tree.data == rulename
        return unreduced_tree
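
And a minimal sketch (Python) of using the new TreeMatcher on its own. The grammar and input are illustrative assumptions, not part of this PR; the point is that match_tree returns an "unreduced" tree whose meta carries match_tree and orig_expansion, which is what WriteTokensTransformer in reconstruct.py consumes.

    from lark import Lark
    from lark.tree_matcher import TreeMatcher

    # Illustrative grammar, not part of this PR.
    parser = Lark(r"""
        start: pair pair
        pair: NAME "=" NAME
        NAME: /\w+/
        %ignore " "
    """, maybe_placeholders=False)   # TreeMatcher asserts this option

    tree = parser.parse("a = b  c = d")
    matcher = TreeMatcher(parser)

    # Match the root node against its own rule name.
    unreduced = matcher.match_tree(tree, 'start')
    assert unreduced.data == 'start'

    # The matched node records the original grammar expansion (before discarded
    # terminals were filtered out), which is what reconstruction relies on.
    print(unreduced.meta.orig_expansion)   # the two `pair` symbols of the start rule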
