This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

299 wiersze
9.6 KiB

  1. "This module implements an Earley Parser"
  2. # The parser uses a parse-forest to keep track of derivations and ambiguities.
  3. # When the parse ends successfully, a disambiguation stage resolves all ambiguity
  4. # (right now ambiguity resolution is not developed beyond the needs of lark)
  5. # Afterwards the parse tree is reduced (transformed) according to user callbacks.
  6. # I use the no-recursion version of Transformer and Visitor, because the tree might be
  7. # deeper than Python's recursion limit (a bit absurd, but that's life)
  8. #
  9. # The algorithm keeps track of each state set, using a corresponding Column instance.
  10. # Column keeps track of new items using NewsList instances.
  11. #
  12. # Author: Erez Shinan (2017)
  13. # Email : erezshin@gmail.com
  14. from functools import cmp_to_key
  15. from ..utils import compare
  16. from ..common import ParseError, UnexpectedToken, Terminal
  17. from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse
  18. from .grammar_analysis import GrammarAnalyzer
  19. class EndToken:
  20. type = '$end'
  21. class Derivation(Tree):
  22. def __init__(self, rule, items=None):
  23. Tree.__init__(self, 'drv', items or [])
  24. self.rule = rule
  25. def _pretty_label(self): # Nicer pretty for debugging the parser
  26. return self.rule.origin if self.rule else self.data
  27. END_TOKEN = EndToken()
  28. class Item(object):
  29. "An Earley Item, the atom of the algorithm."
  30. def __init__(self, rule, ptr, start, tree):
  31. self.rule = rule
  32. self.ptr = ptr
  33. self.start = start
  34. self.tree = tree if tree is not None else Derivation(self.rule)
  35. @property
  36. def expect(self):
  37. return self.rule.expansion[self.ptr]
  38. @property
  39. def is_complete(self):
  40. return self.ptr == len(self.rule.expansion)
  41. def advance(self, tree):
  42. assert self.tree.data == 'drv'
  43. new_tree = Derivation(self.rule, self.tree.children + [tree])
  44. return Item(self.rule, self.ptr+1, self.start, new_tree)
  45. def __eq__(self, other):
  46. return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
  47. def __hash__(self):
  48. return hash((self.rule, self.ptr, id(self.start)))
  49. def __repr__(self):
  50. before = list(map(str, self.rule.expansion[:self.ptr]))
  51. after = list(map(str, self.rule.expansion[self.ptr:]))
  52. return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
  53. class NewsList(list):
  54. "Keeps track of newly added items (append-only)"
  55. def __init__(self, initial=None):
  56. list.__init__(self, initial or [])
  57. self.last_iter = 0
  58. def get_news(self):
  59. i = self.last_iter
  60. self.last_iter = len(self)
  61. return self[i:]
  62. class Column:
  63. "An entry in the table, aka Earley Chart. Contains lists of items."
  64. def __init__(self, i):
  65. self.i = i
  66. self.to_reduce = NewsList()
  67. self.to_predict = NewsList()
  68. self.to_scan = NewsList()
  69. self.item_count = 0
  70. self.added = set()
  71. self.completed = {}
  72. def add(self, items):
  73. """Sort items into scan/predict/reduce newslists
  74. Makes sure only unique items are added.
  75. """
  76. for item in items:
  77. if item.is_complete:
  78. # XXX Potential bug: What happens if there's ambiguity in an empty rule?
  79. if item.rule.expansion and item in self.completed:
  80. old_tree = self.completed[item].tree
  81. if old_tree.data != '_ambig':
  82. new_tree = old_tree.copy()
  83. new_tree.rule = old_tree.rule
  84. old_tree.set('_ambig', [new_tree])
  85. old_tree.rule = None # No longer a 'drv' node
  86. if item.tree.children[0] is old_tree: # XXX a little hacky!
  87. raise ParseError("Infinite recursion in grammar! (Rule %s)" % item.rule)
  88. old_tree.children.append(item.tree)
  89. else:
  90. self.completed[item] = item
  91. self.to_reduce.append(item)
  92. else:
  93. if item not in self.added:
  94. self.added.add(item)
  95. if isinstance(item.expect, Terminal):
  96. self.to_scan.append(item)
  97. else:
  98. self.to_predict.append(item)
  99. self.item_count += 1 # Only count if actually added
  100. def __nonzero__(self):
  101. return bool(self.item_count)
  102. class Parser:
  103. def __init__(self, rules, start_symbol, callback, resolve_ambiguity=True):
  104. self.analysis = GrammarAnalyzer(rules, start_symbol)
  105. self.start_symbol = start_symbol
  106. self.resolve_ambiguity = resolve_ambiguity
  107. self.postprocess = {}
  108. self.predictions = {}
  109. for rule in self.analysis.rules:
  110. if rule.origin != '$root': # XXX kinda ugly
  111. a = rule.alias
  112. self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
  113. self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
  114. def parse(self, stream, start_symbol=None):
  115. # Define parser functions
  116. start_symbol = start_symbol or self.start_symbol
  117. def predict(nonterm, column):
  118. assert not isinstance(nonterm, Terminal), nonterm
  119. return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
  120. def complete(item):
  121. name = item.rule.origin
  122. return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
  123. def predict_and_complete(column):
  124. while True:
  125. to_predict = {x.expect for x in column.to_predict.get_news()
  126. if x.ptr} # if not part of an already predicted batch
  127. to_reduce = column.to_reduce.get_news()
  128. if not (to_predict or to_reduce):
  129. break
  130. for nonterm in to_predict:
  131. column.add( predict(nonterm, column) )
  132. for item in to_reduce:
  133. column.add( complete(item) )
  134. def scan(i, token, column):
  135. to_scan = column.to_scan.get_news()
  136. next_set = Column(i)
  137. next_set.add(item.advance(token) for item in to_scan if item.expect.match(token))
  138. if not next_set:
  139. expect = {i.expect for i in column.to_scan}
  140. raise UnexpectedToken(token, expect, stream, i)
  141. return next_set
  142. # Main loop starts
  143. column0 = Column(0)
  144. column0.add(predict(start_symbol, column0))
  145. column = column0
  146. for i, token in enumerate(stream):
  147. predict_and_complete(column)
  148. column = scan(i, token, column)
  149. predict_and_complete(column)
  150. # Parse ended. Now build a parse tree
  151. solutions = [n.tree for n in column.to_reduce
  152. if n.rule.origin==start_symbol and n.start is column0]
  153. if not solutions:
  154. raise ParseError('Incomplete parse: Could not find a solution to input')
  155. elif len(solutions) == 1:
  156. tree = solutions[0]
  157. else:
  158. tree = Tree('_ambig', solutions)
  159. if self.resolve_ambiguity:
  160. ResolveAmbig().visit(tree)
  161. return ApplyCallbacks(self.postprocess).transform(tree)
  162. class ApplyCallbacks(Transformer_NoRecurse):
  163. def __init__(self, postprocess):
  164. self.postprocess = postprocess
  165. def drv(self, tree):
  166. children = tree.children
  167. callback = self.postprocess[tree.rule]
  168. if callback:
  169. return callback(children)
  170. else:
  171. return Tree(rule.origin, children)
  172. def _compare_rules(rule1, rule2):
  173. assert rule1.origin == rule2.origin
  174. c = compare( len(rule1.expansion), len(rule2.expansion))
  175. if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here
  176. c = -c
  177. return c
  178. def _compare_drv(tree1, tree2):
  179. if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
  180. return compare(tree1, tree2)
  181. try:
  182. rule1, rule2 = tree1.rule, tree2.rule
  183. except AttributeError:
  184. # Probably trees that don't take part in this parse (better way to distinguish?)
  185. return compare(tree1, tree2)
  186. # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
  187. # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
  188. # computationally inefficient. So we handle it here.
  189. if tree1.data == '_ambig':
  190. _resolve_ambig(tree1)
  191. if tree2.data == '_ambig':
  192. _resolve_ambig(tree2)
  193. c = _compare_rules(tree1.rule, tree2.rule)
  194. if c:
  195. return c
  196. # rules are "equal", so compare trees
  197. for t1, t2 in zip(tree1.children, tree2.children):
  198. c = _compare_drv(t1, t2)
  199. if c:
  200. return c
  201. return compare(len(tree1.children), len(tree2.children))
  202. def _resolve_ambig(tree):
  203. assert tree.data == '_ambig'
  204. best = min(tree.children, key=cmp_to_key(_compare_drv))
  205. assert best.data == 'drv'
  206. tree.set('drv', best.children)
  207. tree.rule = best.rule # needed for applying callbacks
  208. assert tree.data != '_ambig'
  209. class ResolveAmbig(Visitor_NoRecurse):
  210. def _ambig(self, tree):
  211. _resolve_ambig(tree)
  212. # RULES = [
  213. # ('a', ['d']),
  214. # ('d', ['b']),
  215. # ('b', ['C']),
  216. # ('b', ['b', 'C']),
  217. # ('b', ['C', 'b']),
  218. # ]
  219. # p = Parser(RULES, 'a')
  220. # for x in p.parse('CC'):
  221. # print x.pretty()
  222. #---------------
  223. # RULES = [
  224. # ('s', ['a', 'a']),
  225. # ('a', ['b', 'b']),
  226. # ('b', ['C'], lambda (x,): x),
  227. # ('b', ['b', 'C']),
  228. # ]
  229. # p = Parser(RULES, 's', {})
  230. # print p.parse('CCCCC').pretty()