This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

274 linhas
8.7 KiB

  1. "This module implements an Earley Parser"
  2. # The parser uses a parse-forest to keep track of derivations and ambiguities.
  3. # When the parse ends successfully, a disambiguation stage resolves all ambiguity
  4. # (right now ambiguity resolution is not developed beyond the needs of lark)
  5. # Afterwards the parse tree is reduced (transformed) according to user callbacks.
  6. # I use the no-recursion version of Transformer and Visitor, because the tree might be
  7. # deeper than Python's recursion limit (a bit absurd, but that's life)
  8. #
  9. # The algorithm keeps track of each state set, using a corresponding Column instance.
  10. # Column keeps track of new items using NewsList instances.
  11. #
  12. # Author: Erez Shinan (2017)
  13. # Email : erezshin@gmail.com
  14. from functools import cmp_to_key
  15. from ..utils import compare
  16. from ..common import ParseError, UnexpectedToken, Terminal
  17. from .grammar_analysis import GrammarAnalyzer
  18. from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse
  19. class EndToken:
  20. type = '$end'
  21. class Derivation(Tree):
  22. def __init__(self, rule, items=None):
  23. Tree.__init__(self, 'drv', items or [])
  24. self.rule = rule
# Shared end-of-input marker: the parser feeds it to the final column and
# recognises it by identity (`token is not END_TOKEN`).
END_TOKEN = EndToken()
  26. class Item(object):
  27. def __init__(self, rule, ptr, start, tree):
  28. self.rule = rule
  29. self.ptr = ptr
  30. self.start = start
  31. self.tree = tree if tree is not None else Derivation(self.rule)
  32. @property
  33. def expect(self):
  34. return self.rule.expansion[self.ptr]
  35. @property
  36. def is_complete(self):
  37. return self.ptr == len(self.rule.expansion)
  38. def advance(self, tree):
  39. assert self.tree.data == 'drv'
  40. new_tree = Derivation(self.rule, self.tree.children + [tree])
  41. return Item(self.rule, self.ptr+1, self.start, new_tree)
  42. def __eq__(self, other):
  43. return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
  44. def __hash__(self):
  45. return hash((self.rule, self.ptr, id(self.start)))
  46. def __repr__(self):
  47. before = list(map(str, self.rule.expansion[:self.ptr]))
  48. after = list(map(str, self.rule.expansion[self.ptr:]))
  49. return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
  50. class NewsList(list):
  51. "Keeps track of newly added items (append-only)"
  52. def __init__(self, initial=None):
  53. list.__init__(self, initial or [])
  54. self.last_iter = 0
  55. def get_news(self):
  56. i = self.last_iter
  57. self.last_iter = len(self)
  58. return self[i:]
  59. class Column:
  60. "An entry in the table, aka Earley Chart"
  61. def __init__(self, i):
  62. self.i = i
  63. self.to_reduce = NewsList()
  64. self.to_predict = NewsList()
  65. self.to_scan = NewsList()
  66. self.item_count = 0
  67. self.added = set()
  68. self.completed = {}
  69. def add(self, items):
  70. """Sort items into scan/predict/reduce newslists
  71. Makes sure only unique items are added.
  72. """
  73. added = self.added
  74. for item in items:
  75. if item.is_complete:
  76. # XXX Potential bug: What happens if there's ambiguity in an empty rule?
  77. if item.rule.expansion and item in self.completed:
  78. old_tree = self.completed[item].tree
  79. if old_tree.data != '_ambig':
  80. new_tree = old_tree.copy()
  81. new_tree.rule = old_tree.rule
  82. old_tree.set('_ambig', [new_tree])
  83. if item.tree.children[0] is old_tree: # XXX a little hacky!
  84. raise ParseError("Infinite recursion in grammar!")
  85. old_tree.children.append(item.tree)
  86. else:
  87. self.completed[item] = item
  88. self.to_reduce.append(item)
  89. else:
  90. if item not in added:
  91. added.add(item)
  92. if isinstance(item.expect, Terminal):
  93. self.to_scan.append(item)
  94. else:
  95. self.to_predict.append(item)
  96. self.item_count += 1 # Only count if actually added
  97. def __nonzero__(self):
  98. return bool(self.item_count)
class Parser:
    """Earley parser over the rules produced by a GrammarAnalyzer.

    Attributes:
        analysis:          GrammarAnalyzer built from the given rules and start.
        start:             default start symbol for parse().
        resolve_ambiguity: when true, parse() runs ResolveAmbig over the
                           resulting tree before applying callbacks.
        postprocess:       rule -> callback (or a falsy alias when there is none).
        predictions:       rule origin -> list of rules, taken from
                           analysis.expand_rule(origin).
    """
    def __init__(self, rules, start, callback, resolve_ambiguity=True):
        """Analyse the grammar and precompute callbacks and predictions.

        rules, start: handed to GrammarAnalyzer as-is.
        callback:     object whose attributes are looked up by rule alias
                      (only for aliases that are truthy and not callable).
        """
        self.analysis = GrammarAnalyzer(rules, start)
        self.start = start
        self.resolve_ambiguity = resolve_ambiguity

        self.postprocess = {}
        self.predictions = {}
        for rule in self.analysis.rules:
            if rule.origin != '$root':  # XXX kinda ugly
                a = rule.alias
                # Callable alias: use it directly. Falsy alias: store it as-is
                # (meaning "no callback"). Otherwise resolve it on `callback`.
                self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]

    def parse(self, stream, start=None):
        """Parse an iterable of tokens and return the transformed result.

        stream: iterable of tokens; each is tested with `item.expect.match(token)`.
        start:  start symbol; falls back to self.start when falsy.

        Returns whatever ApplyCallbacks(self.postprocess).transform() yields
        for the final tree.

        Raises:
            UnexpectedToken: when a token leaves the next column empty.
            ParseError: when no completed item for `start` spans the whole
                input.
        """
        # Define parser functions
        start = start or self.start

        def predict(nonterm, i):
            # One fresh item (ptr 0, starting at column `i`) per rule listed
            # in the precomputed predictions for `nonterm`.
            assert not isinstance(nonterm, Terminal), nonterm
            return [Item(rule, 0, i, None) for rule in self.predictions[nonterm]]

        def complete(item):
            # Advance every item in the start column that was waiting for the
            # nonterminal `item` has just finished.
            name = item.rule.origin
            return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]

        def process_column(i, token, cur_set):
            # Runs predict/complete on `cur_set` until nothing new shows up,
            # then scans `token` into a new column. Returns (cur_set, next_set).
            next_set = Column(i)

            while True:
                # get_news() only returns items appended since its last call,
                # so each round works on the previous round's additions.
                to_predict = {x.expect for x in cur_set.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = cur_set.to_reduce.get_news()
                if not (to_predict or to_reduce):
                    break

                for nonterm in to_predict:
                    cur_set.add( predict(nonterm, cur_set) )
                for item in to_reduce:
                    cur_set.add( complete(item) )

            if token is not END_TOKEN:
                to_scan = cur_set.to_scan.get_news()
                for item in to_scan:
                    if item.expect.match(token):
                        next_set.add([item.advance(token)])

            # Relies on Column's truth value: an empty next column means no
            # pending item accepted this token.
            if not next_set and token is not END_TOKEN:
                expect = {i.expect for i in cur_set.to_scan}
                raise UnexpectedToken(token, expect, stream, i)

            return cur_set, next_set

        # Main loop starts
        column0 = Column(0)
        column0.add(predict(start, column0))

        cur_set = column0
        i = 0
        for token in stream:
            _, cur_set = process_column(i, token, cur_set)
            i += 1

        # Final pass with the end marker: predict/complete only, no scanning.
        last_set, _ = process_column(i, END_TOKEN, cur_set)

        # Parse ended. Now build a parse tree
        # A solution is a completed `start` rule that began in the first column.
        solutions = [n.tree for n in last_set.to_reduce
                     if n.rule.origin==start and n.start is column0]

        if not solutions:
            raise ParseError('Incomplete parse: Could not find a solution to input')
        elif len(solutions) == 1:
            tree = solutions[0]
        else:
            # Several top-level derivations: wrap them in an ambiguity node.
            tree = Tree('_ambig', solutions)

        if self.resolve_ambiguity:
            ResolveAmbig().visit(tree)

        return ApplyCallbacks(self.postprocess).transform(tree)
  162. class ApplyCallbacks(Transformer_NoRecurse):
  163. def __init__(self, postprocess):
  164. self.postprocess = postprocess
  165. def drv(self, tree):
  166. children = tree.children
  167. callback = self.postprocess[tree.rule]
  168. if callback:
  169. return callback(children)
  170. else:
  171. return Tree(rule.origin, children)
  172. def _compare_rules(rule1, rule2):
  173. assert rule1.origin == rule2.origin
  174. c = compare( len(rule1.expansion), len(rule2.expansion))
  175. if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here
  176. c = -c
  177. return c
  178. def _compare_drv(tree1, tree2):
  179. if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
  180. return compare(tree1, tree2)
  181. c = _compare_rules(tree1.rule, tree2.rule)
  182. if c:
  183. return c
  184. # rules are "equal", so compare trees
  185. for t1, t2 in zip(tree1.children, tree2.children):
  186. c = _compare_drv(t1, t2)
  187. if c:
  188. return c
  189. return compare(len(tree1.children), len(tree2.children))
  190. class ResolveAmbig(Visitor_NoRecurse):
  191. def _ambig(self, tree):
  192. best = min(tree.children, key=cmp_to_key(_compare_drv))
  193. assert best.data == 'drv'
  194. tree.set('drv', best.children)
  195. tree.rule = best.rule # needed for applying callbacks
  196. # RULES = [
  197. # ('a', ['d']),
  198. # ('d', ['b']),
  199. # ('b', ['C']),
  200. # ('b', ['b', 'C']),
  201. # ('b', ['C', 'b']),
  202. # ]
  203. # p = Parser(RULES, 'a')
  204. # for x in p.parse('CC'):
  205. # print x.pretty()
  206. #---------------
  207. # RULES = [
  208. # ('s', ['a', 'a']),
  209. # ('a', ['b', 'b']),
  210. # ('b', ['C'], lambda (x,): x),
  211. # ('b', ['b', 'C']),
  212. # ]
  213. # p = Parser(RULES, 's', {})
  214. # print p.parse('CCCCC').pretty()