This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

168 lines
5.6 KiB

  1. from collections import defaultdict
  2. from .tree import Tree
  3. from .visitors import Transformer_InPlace
  4. from .common import ParserConf
  5. from .lexer import Token, PatternStr
  6. from .parsers import earley
  7. from .grammar import Rule, Terminal, NonTerminal
  8. def is_discarded_terminal(t):
  9. return t.is_term and t.filter_out
  10. def is_iter_empty(i):
  11. try:
  12. _ = next(i)
  13. return False
  14. except StopIteration:
  15. return True
  16. class WriteTokensTransformer(Transformer_InPlace):
  17. "Inserts discarded tokens into their correct place, according to the rules of grammar"
  18. def __init__(self, tokens, term_subs):
  19. self.tokens = tokens
  20. self.term_subs = term_subs
  21. def __default__(self, data, children, meta):
  22. # if not isinstance(t, MatchTree):
  23. # return t
  24. if not getattr(meta, 'match_tree', False):
  25. return Tree(data, children)
  26. iter_args = iter(children)
  27. to_write = []
  28. for sym in meta.orig_expansion:
  29. if is_discarded_terminal(sym):
  30. try:
  31. v = self.term_subs[sym.name](sym)
  32. except KeyError:
  33. t = self.tokens[sym.name]
  34. if not isinstance(t.pattern, PatternStr):
  35. raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t)
  36. v = t.pattern.value
  37. to_write.append(v)
  38. else:
  39. x = next(iter_args)
  40. if isinstance(x, list):
  41. to_write += x
  42. else:
  43. if isinstance(x, Token):
  44. assert Terminal(x.type) == sym, x
  45. else:
  46. assert NonTerminal(x.data) == sym, (sym, x)
  47. to_write.append(x)
  48. assert is_iter_empty(iter_args)
  49. return to_write
  50. class MatchTree(Tree):
  51. pass
  52. class MakeMatchTree:
  53. def __init__(self, name, expansion):
  54. self.name = name
  55. self.expansion = expansion
  56. def __call__(self, args):
  57. t = MatchTree(self.name, args)
  58. t.meta.match_tree = True
  59. t.meta.orig_expansion = self.expansion
  60. return t
  61. def best_from_group(seq, group_key, cmp_key):
  62. d = {}
  63. for item in seq:
  64. key = group_key(item)
  65. if key in d:
  66. v1 = cmp_key(item)
  67. v2 = cmp_key(d[key])
  68. if v2 > v1:
  69. d[key] = item
  70. else:
  71. d[key] = item
  72. return list(d.values())
  73. class Reconstructor:
  74. def __init__(self, parser, term_subs={}):
  75. # XXX TODO calling compile twice returns different results!
  76. assert parser.options.maybe_placeholders == False
  77. tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start)
  78. self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}, term_subs)
  79. self.rules = list(self._build_recons_rules(rules))
  80. self.rules.reverse()
  81. # print(len(self.rules))
  82. self.rules = best_from_group(self.rules, lambda r: r, lambda r: -len(r.expansion))
  83. # print(len(self.rules))
  84. # self.rules = list(set(list(self._build_recons_rules(rules))))
  85. self.rules.sort(key=lambda r: len(r.expansion))
  86. callbacks = {rule: rule.alias for rule in self.rules} # TODO pass callbacks through dict, instead of alias?
  87. self.parser = earley.Parser(ParserConf(self.rules, callbacks, parser.options.start),
  88. self._match, resolve_ambiguity=True)
  89. def _build_recons_rules(self, rules):
  90. expand1s = {r.origin for r in rules if r.options.expand1}
  91. aliases = defaultdict(list)
  92. for r in rules:
  93. if r.alias:
  94. aliases[r.origin].append( r.alias )
  95. rule_names = {r.origin for r in rules}
  96. nonterminals = {sym for sym in rule_names
  97. if sym.name.startswith('_') or sym in expand1s or sym in aliases }
  98. for r in rules:
  99. recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
  100. for sym in r.expansion if not is_discarded_terminal(sym)]
  101. # Skip self-recursive constructs
  102. if recons_exp == [r.origin]:
  103. continue
  104. sym = NonTerminal(r.alias) if r.alias else r.origin
  105. yield Rule(sym, recons_exp, alias=MakeMatchTree(sym.name, r.expansion))
  106. for origin, rule_aliases in aliases.items():
  107. for alias in rule_aliases:
  108. yield Rule(origin, [Terminal(alias)], alias=MakeMatchTree(origin.name, [NonTerminal(alias)]))
  109. yield Rule(origin, [Terminal(origin.name)], alias=MakeMatchTree(origin.name, [origin]))
  110. def _match(self, term, token):
  111. if isinstance(token, Tree):
  112. return Terminal(token.data) == term
  113. elif isinstance(token, Token):
  114. return term == Terminal(token.type)
  115. assert False
  116. def _reconstruct(self, tree):
  117. # TODO: ambiguity?
  118. unreduced_tree = self.parser.parse(tree.children, tree.data) # find a full derivation
  119. assert unreduced_tree.data == tree.data
  120. res = self.write_tokens.transform(unreduced_tree)
  121. for item in res:
  122. if isinstance(item, Tree):
  123. for x in self._reconstruct(item):
  124. yield x
  125. else:
  126. yield item
  127. def reconstruct(self, tree):
  128. x = self._reconstruct(tree)
  129. y = []
  130. prev_item = ''
  131. for item in x:
  132. if prev_item and item and prev_item[-1].isalnum() and item[0].isalnum():
  133. y.append(' ')
  134. y.append(item)
  135. prev_item = item
  136. return ''.join(y)