import unicodedata
from collections import defaultdict

from .tree import Tree
from .visitors import Transformer_InPlace
from .common import ParserConf
from .lexer import Token, PatternStr
from .parsers import earley
from .grammar import Rule, Terminal, NonTerminal


def is_discarded_terminal(t):
    return t.is_term and t.filter_out

def is_iter_empty(i):
    try:
        _ = next(i)
        return False
    except StopIteration:
        return True


class WriteTokensTransformer(Transformer_InPlace):
    "Inserts discarded tokens into their correct place, according to the rules of grammar"

    def __init__(self, tokens, term_subs):
        self.tokens = tokens
        self.term_subs = term_subs

    def __default__(self, data, children, meta):
        if not getattr(meta, 'match_tree', False):
            return Tree(data, children)

        iter_args = iter(children)
        to_write = []
        for sym in meta.orig_expansion:
            if is_discarded_terminal(sym):
                # Discarded terminals don't appear in the tree, so their text is
                # recovered from term_subs or from the terminal's string pattern.
                try:
                    v = self.term_subs[sym.name](sym)
                except KeyError:
                    t = self.tokens[sym.name]
                    if not isinstance(t.pattern, PatternStr):
                        raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t)

                    v = t.pattern.value

                to_write.append(v)
            else:
                x = next(iter_args)
                if isinstance(x, list):
                    to_write += x
                else:
                    if isinstance(x, Token):
                        assert Terminal(x.type) == sym, x
                    else:
                        assert NonTerminal(x.data) == sym, (sym, x)
                    to_write.append(x)

        assert is_iter_empty(iter_args)
        return to_write
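
# Illustrative note on a hypothetical rule `pair: NAME "=" NAME`: the parse
# tree keeps only the two NAME tokens, since the anonymous "=" terminal is
# filtered out at parse time. Given the matching original expansion,
# __default__ re-inserts the literal "=", producing ['a', '=', 'b'] for "a=b".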

class MatchTree(Tree):
    pass

class MakeMatchTree:
    def __init__(self, name, expansion):
        self.name = name
        self.expansion = expansion

    def __call__(self, args):
        t = MatchTree(self.name, args)
        t.meta.match_tree = True
        t.meta.orig_expansion = self.expansion
        return t

def best_from_group(seq, group_key, cmp_key):
    d = {}
    for item in seq:
        key = group_key(item)
        if key in d:
            v1 = cmp_key(item)
            v2 = cmp_key(d[key])
            if v2 > v1:
                d[key] = item
        else:
            d[key] = item
    return list(d.values())
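
# For example (hypothetical values), grouping integers by parity with
# cmp_key=lambda x: -x keeps the item with the *smallest* cmp_key per group,
# i.e. the largest integer of each parity class:
#   best_from_group([3, 1, 2], lambda x: x % 2, lambda x: -x)  ->  [3, 2]
# Reconstructor passes cmp_key=lambda r: -len(r.expansion), so it keeps the
# rule with the longest expansion in each group.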

def make_recons_rule(origin, expansion, old_expansion):
    return Rule(origin, expansion, alias=MakeMatchTree(origin.name, old_expansion))

def make_recons_rule_to_term(origin, term):
    return make_recons_rule(origin, [Terminal(term.name)], [term])

def _isalnum(x):
    # Categories defined here: https://www.python.org/dev/peps/pep-3131/
    return unicodedata.category(x) in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc']


class Reconstructor:
    """
    A Reconstructor that will, given a full parse Tree, generate source code.

    Pass `term_subs`, a dictionary of [Terminal name as str] to [output text as str]
    to say what discarded Terminals should be written as.
    """
    def __init__(self, parser, term_subs=None):
        # XXX TODO calling compile twice returns different results!
        assert parser.options.maybe_placeholders == False
        if term_subs is None:
            term_subs = {}

        tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start)

        self.write_tokens = WriteTokensTransformer({t.name: t for t in tokens}, term_subs)
        self.rules_for_root = defaultdict(list)

        self.rules = list(self._build_recons_rules(rules))
        self.rules.reverse()

        # Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation.
        self.rules = best_from_group(self.rules, lambda r: r, lambda r: -len(r.expansion))

        self.rules.sort(key=lambda r: len(r.expansion))
        self.parser = parser
        self._parser_cache = {}
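
    # _build_recons_rules inverts the grammar: each rule's expansion is turned
    # into a pattern matched against tree nodes rather than tokens. Rules that
    # get inlined in the parse tree (names starting with '_', expand1 rules,
    # and aliases) are kept as nonterminals; every other symbol is matched as
    # a single Terminal standing for a whole subtree.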
    def _build_recons_rules(self, rules):
        expand1s = {r.origin for r in rules if r.options.expand1}

        aliases = defaultdict(list)
        for r in rules:
            if r.alias:
                aliases[r.origin].append(r.alias)

        rule_names = {r.origin for r in rules}
        nonterminals = {sym for sym in rule_names
                        if sym.name.startswith('_') or sym in expand1s or sym in aliases}

        seen = set()
        for r in rules:
            recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
                          for sym in r.expansion if not is_discarded_terminal(sym)]

            # Skip self-recursive constructs
            if recons_exp == [r.origin] and r.alias is None:
                continue

            sym = NonTerminal(r.alias) if r.alias else r.origin
            rule = make_recons_rule(sym, recons_exp, r.expansion)

            if sym in expand1s and len(recons_exp) != 1:
                self.rules_for_root[sym.name].append(rule)

                if sym.name not in seen:
                    yield make_recons_rule_to_term(sym, sym)
                    seen.add(sym.name)
            else:
                if sym.name.startswith('_') or sym in expand1s:
                    yield rule
                else:
                    self.rules_for_root[sym.name].append(rule)

        for origin, rule_aliases in aliases.items():
            for alias in rule_aliases:
                yield make_recons_rule_to_term(origin, NonTerminal(alias))
            yield make_recons_rule_to_term(origin, origin)

    def _match(self, term, token):
        if isinstance(token, Tree):
            return Terminal(token.data) == term
        elif isinstance(token, Token):
            return term == Terminal(token.type)

        assert False
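
    # _reconstruct parses a node's children with the inverted grammar, using a
    # parser cached per rule name, then flattens the resulting match tree into
    # a stream of tokens and strings, recursing into any nested subtrees.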
    def _reconstruct(self, tree):
        # TODO: ambiguity?
        try:
            parser = self._parser_cache[tree.data]
        except KeyError:
            rules = self.rules + best_from_group(
                self.rules_for_root[tree.data], lambda r: r, lambda r: -len(r.expansion)
            )

            rules.sort(key=lambda r: len(r.expansion))

            callbacks = {rule: rule.alias for rule in rules}  # TODO pass callbacks through dict, instead of alias?
            parser = earley.Parser(ParserConf(rules, callbacks, [tree.data]), self._match, resolve_ambiguity=True)
            self._parser_cache[tree.data] = parser

        unreduced_tree = parser.parse(tree.children, tree.data)  # find a full derivation
        assert unreduced_tree.data == tree.data

        res = self.write_tokens.transform(unreduced_tree)
        for item in res:
            if isinstance(item, Tree):
                for x in self._reconstruct(item):
                    yield x
            else:
                yield item

    def reconstruct(self, tree, postproc=None):
        x = self._reconstruct(tree)
        if postproc:
            x = postproc(x)

        y = []
        prev_item = ''
        for item in x:
            # Insert a space wherever two adjacent identifier characters would
            # otherwise merge into one token (e.g. a keyword followed by a name).
            if prev_item and item and _isalnum(prev_item[-1]) and _isalnum(item[0]):
                y.append(' ')
            y.append(item)
            prev_item = item

        return ''.join(y)
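

# --- Usage sketch (illustrative, not part of the original module) ---
# The grammar and input below are made up for demonstration. Because this
# module uses relative imports, run it as a package module to try it, e.g.
# `python -m lark.reconstruct` (assuming this file is lark/reconstruct.py).
if __name__ == '__main__':
    from lark import Lark

    demo_parser = Lark(r"""
        start: item*
        item: NAME "=" NAME ";"
        NAME: /\w+/
        %ignore / +/
    """, maybe_placeholders=False)

    demo_tree = demo_parser.parse("a = b; c = d;")

    # Discarded anonymous terminals ("=", ";") are re-inserted by
    # WriteTokensTransformer; ignored whitespace is not recoverable, so the
    # output is equivalent but not byte-identical to the input.
    print(Reconstructor(demo_parser).reconstruct(demo_tree))  # -> a=b;c=d;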