This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

542 lines
21 KiB

  1. """This module implements a Shared Packed Parse Forest (SPPF).
  2. It is used as the primary output mechanism for the Earley parser
  3. in order to store complex ambiguities.
  4. Full reference and more details are here:
  5. http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
  6. """
  7. from random import randint
  8. from math import isinf
  9. from collections import deque
  10. from operator import attrgetter
  11. from importlib import import_module
  12. from ..lexer import Token
  13. from ..utils import logger
  14. from ..tree import Tree
  15. class ForestNode(object):
  16. pass
  17. class SymbolNode(ForestNode):
  18. """
  19. A Symbol Node represents a symbol (or Intermediate LR0).
  20. Symbol nodes are keyed by the symbol (s). For intermediate nodes
  21. s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol
  22. nodes, s will be a string representing the non-terminal origin (i.e.
  23. the left hand side of the rule).
  24. The children of a Symbol or Intermediate Node will always be Packed Nodes;
  25. with each Packed Node child representing a single derivation of a production.
  26. Hence a Symbol Node with a single child is unambiguous.
  27. """
  28. __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash')
  29. def __init__(self, s, start, end):
  30. self.s = s
  31. self.start = start
  32. self.end = end
  33. self._children = set()
  34. self.paths = set()
  35. self.paths_loaded = False
  36. ### We use inf here as it can be safely negated without resorting to conditionals,
  37. # unlike None or float('NaN'), and sorts appropriately.
  38. self.priority = float('-inf')
  39. self.is_intermediate = isinstance(s, tuple)
  40. self._hash = hash((self.s, self.start, self.end))
  41. def add_family(self, lr0, rule, start, left, right):
  42. self._children.add(PackedNode(self, lr0, rule, start, left, right))
  43. def add_path(self, transitive, node):
  44. self.paths.add((transitive, node))
  45. def load_paths(self):
  46. for transitive, node in self.paths:
  47. if transitive.next_titem is not None:
  48. vn = SymbolNode(transitive.next_titem.s, transitive.next_titem.start, self.end)
  49. vn.add_path(transitive.next_titem, node)
  50. self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn)
  51. else:
  52. self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node)
  53. self.paths_loaded = True
  54. @property
  55. def is_ambiguous(self):
  56. return len(self.children) > 1
  57. @property
  58. def children(self):
  59. if not self.paths_loaded: self.load_paths()
  60. return sorted(self._children, key=attrgetter('sort_key'))
  61. def __iter__(self):
  62. return iter(self._children)
  63. def __eq__(self, other):
  64. if not isinstance(other, SymbolNode):
  65. return False
  66. return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.end is other.end)
  67. def __hash__(self):
  68. return self._hash
  69. def __repr__(self):
  70. if self.is_intermediate:
  71. rule = self.s[0]
  72. ptr = self.s[1]
  73. before = ( expansion.name for expansion in rule.expansion[:ptr] )
  74. after = ( expansion.name for expansion in rule.expansion[ptr:] )
  75. symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
  76. else:
  77. symbol = self.s.name
  78. return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority)
  79. class PackedNode(ForestNode):
  80. """
  81. A Packed Node represents a single derivation in a symbol node.
  82. """
  83. __slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash')
  84. def __init__(self, parent, s, rule, start, left, right):
  85. self.parent = parent
  86. self.s = s
  87. self.start = start
  88. self.rule = rule
  89. self.left = left
  90. self.right = right
  91. self.priority = float('-inf')
  92. self._hash = hash((self.left, self.right))
  93. @property
  94. def is_empty(self):
  95. return self.left is None and self.right is None
  96. @property
  97. def sort_key(self):
  98. """
  99. Used to sort PackedNode children of SymbolNodes.
  100. A SymbolNode has multiple PackedNodes if it matched
  101. ambiguously. Hence, we use the sort order to identify
  102. the order in which ambiguous children should be considered.
  103. """
  104. return self.is_empty, -self.priority, self.rule.order
  105. @property
  106. def children(self):
  107. return [x for x in [self.left, self.right] if x is not None]
  108. def __iter__(self):
  109. return iter([self.left, self.right])
  110. def __eq__(self, other):
  111. if not isinstance(other, PackedNode):
  112. return False
  113. return self is other or (self.left == other.left and self.right == other.right)
  114. def __hash__(self):
  115. return self._hash
  116. def __repr__(self):
  117. if isinstance(self.s, tuple):
  118. rule = self.s[0]
  119. ptr = self.s[1]
  120. before = ( expansion.name for expansion in rule.expansion[:ptr] )
  121. after = ( expansion.name for expansion in rule.expansion[ptr:] )
  122. symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
  123. else:
  124. symbol = self.s.name
  125. return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order)
  126. class ForestVisitor(object):
  127. """
  128. An abstract base class for building forest visitors.
  129. Use this as a base when you need to walk the forest.
  130. """
  131. def visit_token_node(self, node): pass
  132. def visit_symbol_node_in(self, node): pass
  133. def visit_symbol_node_out(self, node): pass
  134. def visit_packed_node_in(self, node): pass
  135. def visit_packed_node_out(self, node): pass
  136. def on_cycle(self, node, get_path): pass
  137. def visit(self, root):
  138. def make_get_path(node):
  139. """Create a function that will return a path from `node` to
  140. the last visited node. Used for the `on_cycle` callback."""
  141. def get_path():
  142. index = len(path) - 1
  143. while id(path[index]) != id(node):
  144. index -= 1
  145. return path[index:]
  146. return get_path
  147. # Visiting is a list of IDs of all symbol/intermediate nodes currently in
  148. # the stack. It serves two purposes: to detect when we 'recurse' in and out
  149. # of a symbol/intermediate so that we can process both up and down. Also,
  150. # since the SPPF can have cycles it allows us to detect if we're trying
  151. # to recurse into a node that's already on the stack (infinite recursion).
  152. visiting = set()
  153. # a list of nodes that are currently being visited
  154. # used for the `on_cycle` callback
  155. path = list()
  156. # We do not use recursion here to walk the Forest due to the limited
  157. # stack size in python. Therefore input_stack is essentially our stack.
  158. input_stack = deque([root])
  159. # It is much faster to cache these as locals since they are called
  160. # many times in large parses.
  161. vpno = getattr(self, 'visit_packed_node_out')
  162. vpni = getattr(self, 'visit_packed_node_in')
  163. vsno = getattr(self, 'visit_symbol_node_out')
  164. vsni = getattr(self, 'visit_symbol_node_in')
  165. vino = getattr(self, 'visit_intermediate_node_out', vsno)
  166. vini = getattr(self, 'visit_intermediate_node_in', vsni)
  167. vtn = getattr(self, 'visit_token_node')
  168. oc = getattr(self, 'on_cycle')
  169. while input_stack:
  170. current = next(reversed(input_stack))
  171. try:
  172. next_node = next(current)
  173. except StopIteration:
  174. input_stack.pop()
  175. continue
  176. except TypeError:
  177. ### If the current object is not an iterator, pass through to Token/SymbolNode
  178. pass
  179. else:
  180. if next_node is None:
  181. continue
  182. if id(next_node) in visiting:
  183. oc(next_node, make_get_path(next_node))
  184. continue
  185. input_stack.append(next_node)
  186. continue
  187. if not isinstance(current, ForestNode):
  188. vtn(current)
  189. input_stack.pop()
  190. continue
  191. current_id = id(current)
  192. if current_id in visiting:
  193. if isinstance(current, PackedNode):
  194. vpno(current)
  195. elif current.is_intermediate:
  196. vino(current)
  197. else:
  198. vsno(current)
  199. input_stack.pop()
  200. path.pop()
  201. visiting.remove(current_id)
  202. continue
  203. else:
  204. visiting.add(current_id)
  205. path.append(current)
  206. if isinstance(current, PackedNode):
  207. next_node = vpni(current)
  208. elif current.is_intermediate:
  209. next_node = vini(current)
  210. else:
  211. next_node = vsni(current)
  212. if next_node is None:
  213. continue
  214. if not isinstance(next_node, ForestNode) and \
  215. not isinstance(next_node, Token):
  216. next_node = iter(next_node)
  217. elif id(next_node) in visiting:
  218. oc(next_node, make_get_path(next_node))
  219. continue
  220. input_stack.append(next_node)
  221. continue
  222. class ForestSumVisitor(ForestVisitor):
  223. """
  224. A visitor for prioritizing ambiguous parts of the Forest.
  225. This visitor is used when support for explicit priorities on
  226. rules is requested (whether normal, or invert). It walks the
  227. forest (or subsets thereof) and cascades properties upwards
  228. from the leaves.
  229. It would be ideal to do this during parsing, however this would
  230. require processing each Earley item multiple times. That's
  231. a big performance drawback; so running a forest walk is the
  232. lesser of two evils: there can be significantly more Earley
  233. items created during parsing than there are SPPF nodes in the
  234. final tree.
  235. """
  236. def visit_packed_node_in(self, node):
  237. return iter([node.left, node.right])
  238. def visit_symbol_node_in(self, node):
  239. return iter(node.children)
  240. def visit_packed_node_out(self, node):
  241. priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options.priority else 0
  242. priority += getattr(node.right, 'priority', 0)
  243. priority += getattr(node.left, 'priority', 0)
  244. node.priority = priority
  245. def visit_symbol_node_out(self, node):
  246. node.priority = max(child.priority for child in node.children)
  247. class ForestToTreeVisitor(ForestVisitor):
  248. """
  249. A Forest visitor which converts an SPPF forest to an unambiguous AST.
  250. The implementation in this visitor walks only the first ambiguous child
  251. of each symbol node. When it finds an ambiguous symbol node it first
  252. calls the forest_sum_visitor implementation to sort the children
  253. into preference order using the algorithms defined there; so the first
  254. child should always be the highest preference. The forest_sum_visitor
  255. implementation should be another ForestVisitor which sorts the children
  256. according to some priority mechanism.
  257. """
  258. __slots__ = ['forest_sum_visitor', 'callbacks', 'output_stack']
  259. def __init__(self, callbacks, forest_sum_visitor = None):
  260. assert callbacks
  261. self.forest_sum_visitor = forest_sum_visitor
  262. self.callbacks = callbacks
  263. def visit(self, root):
  264. self.output_stack = deque()
  265. return super(ForestToTreeVisitor, self).visit(root)
  266. def visit_token_node(self, node):
  267. self.output_stack[-1].append(node)
  268. def visit_symbol_node_in(self, node):
  269. if self.forest_sum_visitor and node.is_ambiguous and isinf(node.priority):
  270. self.forest_sum_visitor.visit(node)
  271. return next(iter(node.children))
  272. def visit_packed_node_in(self, node):
  273. if not node.parent.is_intermediate:
  274. self.output_stack.append([])
  275. return iter([node.left, node.right])
  276. def visit_packed_node_out(self, node):
  277. if not node.parent.is_intermediate:
  278. result = self.callbacks[node.rule](self.output_stack.pop())
  279. if self.output_stack:
  280. self.output_stack[-1].append(result)
  281. else:
  282. self.result = result
  283. class ForestToAmbiguousTreeVisitor(ForestToTreeVisitor):
  284. """
  285. A Forest visitor which converts an SPPF forest to an ambiguous AST.
  286. Because of the fundamental disparity between what can be stored in
  287. an SPPF and what can be stored in a Tree; this implementation is not
  288. complete. It correctly deals with ambiguities that occur on symbol nodes only,
  289. and cannot deal with ambiguities that occur on intermediate nodes.
  290. Usually, most parsers can be rewritten to avoid intermediate node
  291. ambiguities. Also, this implementation could be fixed, however
  292. the code to handle intermediate node ambiguities is messy and
  293. would not be performant. It is much better not to use this and
  294. instead to correctly disambiguate the forest and only store unambiguous
  295. parses in Trees. It is here just to provide some parity with the
  296. old ambiguity='explicit'.
  297. This is mainly used by the test framework, to make it simpler to write
  298. tests ensuring the SPPF contains the right results.
  299. """
  300. def __init__(self, callbacks, forest_sum_visitor = ForestSumVisitor):
  301. super(ForestToAmbiguousTreeVisitor, self).__init__(callbacks, forest_sum_visitor)
  302. def visit_token_node(self, node):
  303. self.output_stack[-1].children.append(node)
  304. def visit_symbol_node_in(self, node):
  305. if node.is_ambiguous:
  306. if self.forest_sum_visitor and isinf(node.priority):
  307. self.forest_sum_visitor.visit(node)
  308. if node.is_intermediate:
  309. # TODO Support ambiguous intermediate nodes!
  310. logger.warning("Ambiguous intermediate node in the SPPF: %s. "
  311. "Lark does not currently process these ambiguities; resolving with the first derivation.", node)
  312. return next(iter(node.children))
  313. else:
  314. self.output_stack.append(Tree('_ambig', []))
  315. return iter(node.children)
  316. def visit_symbol_node_out(self, node):
  317. if not node.is_intermediate and node.is_ambiguous:
  318. result = self.output_stack.pop()
  319. if self.output_stack:
  320. self.output_stack[-1].children.append(result)
  321. else:
  322. self.result = result
  323. def visit_packed_node_in(self, node):
  324. if not node.parent.is_intermediate:
  325. self.output_stack.append(Tree('drv', []))
  326. return iter([node.left, node.right])
  327. def visit_packed_node_out(self, node):
  328. if not node.parent.is_intermediate:
  329. result = self.callbacks[node.rule](self.output_stack.pop().children)
  330. if self.output_stack:
  331. self.output_stack[-1].children.append(result)
  332. else:
  333. self.result = result
  334. class CompleteForestToAmbiguousTreeVisitor(ForestToTreeVisitor):
  335. """
  336. An augmented version of ForestToAmbiguousTreeVisitor that is designed to
  337. handle ambiguous intermediate nodes as well as ambiguous symbol nodes.
  338. On the way down:
  339. - When an ambiguous intermediate node is encountered, an '_iambig' node
  340. is inserted into the tree.
  341. - Each possible derivation of an ambiguous intermediate node is represented
  342. by an '_inter' node added as a child of the corresponding '_iambig' node.
  343. On the way up, these nodes are propagated up the tree and collapsed
  344. into a single '_ambig' node for the nearest symbol node ancestor.
  345. This is achieved by the AmbiguousIntermediateExpander contained in
  346. the callbacks.
  347. """
  348. def _collapse_ambig(self, children):
  349. new_children = []
  350. for child in children:
  351. if child.data == '_ambig':
  352. new_children += child.children
  353. else:
  354. new_children.append(child)
  355. return new_children
  356. def visit_token_node(self, node):
  357. self.output_stack[-1].children.append(node)
  358. def visit_symbol_node_in(self, node):
  359. if node.is_ambiguous:
  360. if self.forest_sum_visitor and isinf(node.priority):
  361. self.forest_sum_visitor.visit(node)
  362. if node.is_intermediate:
  363. self.output_stack.append(Tree('_iambig', []))
  364. else:
  365. self.output_stack.append(Tree('_ambig', []))
  366. return iter(node.children)
  367. def visit_symbol_node_out(self, node):
  368. if node.is_ambiguous:
  369. result = self.output_stack.pop()
  370. if not node.is_intermediate:
  371. result = Tree('_ambig', self._collapse_ambig(result.children))
  372. if self.output_stack:
  373. self.output_stack[-1].children.append(result)
  374. else:
  375. self.result = result
  376. def visit_packed_node_in(self, node):
  377. if not node.parent.is_intermediate:
  378. self.output_stack.append(Tree('drv', []))
  379. elif node.parent.is_ambiguous:
  380. self.output_stack.append(Tree('_inter', []))
  381. return iter([node.left, node.right])
  382. def visit_packed_node_out(self, node):
  383. if not node.parent.is_intermediate:
  384. result = self.callbacks[node.rule](self.output_stack.pop().children)
  385. elif node.parent.is_ambiguous:
  386. result = self.output_stack.pop()
  387. else:
  388. return
  389. if self.output_stack:
  390. self.output_stack[-1].children.append(result)
  391. else:
  392. self.result = result
  393. class ForestToPyDotVisitor(ForestVisitor):
  394. """
  395. A Forest visitor which writes the SPPF to a PNG.
  396. The SPPF can get really large, really quickly because
  397. of the amount of meta-data it stores, so this is probably
  398. only useful for trivial trees and learning how the SPPF
  399. is structured.
  400. """
  401. def __init__(self, rankdir="TB"):
  402. self.pydot = import_module('pydot')
  403. self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir)
  404. def visit(self, root, filename):
  405. super(ForestToPyDotVisitor, self).visit(root)
  406. self.graph.write_png(filename)
  407. def visit_token_node(self, node):
  408. graph_node_id = str(id(node))
  409. graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"'))
  410. graph_node_color = 0x808080
  411. graph_node_style = "\"filled,rounded\""
  412. graph_node_shape = "diamond"
  413. graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
  414. self.graph.add_node(graph_node)
  415. def visit_packed_node_in(self, node):
  416. graph_node_id = str(id(node))
  417. graph_node_label = repr(node)
  418. graph_node_color = 0x808080
  419. graph_node_style = "filled"
  420. graph_node_shape = "diamond"
  421. graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
  422. self.graph.add_node(graph_node)
  423. return iter([node.left, node.right])
  424. def visit_packed_node_out(self, node):
  425. graph_node_id = str(id(node))
  426. graph_node = self.graph.get_node(graph_node_id)[0]
  427. for child in [node.left, node.right]:
  428. if child is not None:
  429. child_graph_node_id = str(id(child))
  430. child_graph_node = self.graph.get_node(child_graph_node_id)[0]
  431. self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
  432. else:
  433. #### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay.
  434. child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890))
  435. child_graph_node_style = "invis"
  436. child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None")
  437. child_edge_style = "invis"
  438. self.graph.add_node(child_graph_node)
  439. self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style))
  440. def visit_symbol_node_in(self, node):
  441. graph_node_id = str(id(node))
  442. graph_node_label = repr(node)
  443. graph_node_color = 0x808080
  444. graph_node_style = "\"filled\""
  445. if node.is_intermediate:
  446. graph_node_shape = "ellipse"
  447. else:
  448. graph_node_shape = "rectangle"
  449. graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
  450. self.graph.add_node(graph_node)
  451. return iter(node.children)
  452. def visit_symbol_node_out(self, node):
  453. graph_node_id = str(id(node))
  454. graph_node = self.graph.get_node(graph_node_id)[0]
  455. for child in node.children:
  456. child_graph_node_id = str(id(child))
  457. child_graph_node = self.graph.get_node(child_graph_node_id)[0]
  458. self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))