This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

573 lines
21 KiB

"""This module implements a Shared Packed Parse Forest (SPPF).

The SPPF is used as the primary output mechanism for the Earley parser
in order to store complex ambiguities.

A full reference and more details are available here:
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
"""
  7. from random import randint
  8. from math import isinf
  9. from collections import deque
  10. from operator import attrgetter
  11. from importlib import import_module
  12. from .. visitors import Discard
  13. from ..lexer import Token
  14. from ..utils import logger
  15. from ..tree import Tree
  16. class ForestNode(object):
  17. pass
  18. class SymbolNode(ForestNode):
  19. """
  20. A Symbol Node represents a symbol (or Intermediate LR0).
  21. Symbol nodes are keyed by the symbol (s). For intermediate nodes
  22. s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol
  23. nodes, s will be a string representing the non-terminal origin (i.e.
  24. the left hand side of the rule).
  25. The children of a Symbol or Intermediate Node will always be Packed Nodes;
  26. with each Packed Node child representing a single derivation of a production.
  27. Hence a Symbol Node with a single child is unambiguous.
  28. """
  29. __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash')
  30. def __init__(self, s, start, end):
  31. self.s = s
  32. self.start = start
  33. self.end = end
  34. self._children = set()
  35. self.paths = set()
  36. self.paths_loaded = False
  37. ### We use inf here as it can be safely negated without resorting to conditionals,
  38. # unlike None or float('NaN'), and sorts appropriately.
  39. self.priority = float('-inf')
  40. self.is_intermediate = isinstance(s, tuple)
  41. self._hash = hash((self.s, self.start, self.end))
  42. def add_family(self, lr0, rule, start, left, right):
  43. self._children.add(PackedNode(self, lr0, rule, start, left, right))
  44. def add_path(self, transitive, node):
  45. self.paths.add((transitive, node))
  46. def load_paths(self):
  47. for transitive, node in self.paths:
  48. if transitive.next_titem is not None:
  49. vn = SymbolNode(transitive.next_titem.s, transitive.next_titem.start, self.end)
  50. vn.add_path(transitive.next_titem, node)
  51. self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn)
  52. else:
  53. self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node)
  54. self.paths_loaded = True
  55. @property
  56. def is_ambiguous(self):
  57. return len(self.children) > 1
  58. @property
  59. def children(self):
  60. if not self.paths_loaded: self.load_paths()
  61. return sorted(self._children, key=attrgetter('sort_key'))
  62. def __iter__(self):
  63. return iter(self._children)
  64. def __eq__(self, other):
  65. if not isinstance(other, SymbolNode):
  66. return False
  67. return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.end is other.end)
  68. def __hash__(self):
  69. return self._hash
  70. def __repr__(self):
  71. if self.is_intermediate:
  72. rule = self.s[0]
  73. ptr = self.s[1]
  74. before = ( expansion.name for expansion in rule.expansion[:ptr] )
  75. after = ( expansion.name for expansion in rule.expansion[ptr:] )
  76. symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
  77. else:
  78. symbol = self.s.name
  79. return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority)
  80. class PackedNode(ForestNode):
  81. """
  82. A Packed Node represents a single derivation in a symbol node.
  83. """
  84. __slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash')
  85. def __init__(self, parent, s, rule, start, left, right):
  86. self.parent = parent
  87. self.s = s
  88. self.start = start
  89. self.rule = rule
  90. self.left = left
  91. self.right = right
  92. self.priority = float('-inf')
  93. self._hash = hash((self.left, self.right))
  94. @property
  95. def is_empty(self):
  96. return self.left is None and self.right is None
  97. @property
  98. def sort_key(self):
  99. """
  100. Used to sort PackedNode children of SymbolNodes.
  101. A SymbolNode has multiple PackedNodes if it matched
  102. ambiguously. Hence, we use the sort order to identify
  103. the order in which ambiguous children should be considered.
  104. """
  105. return self.is_empty, -self.priority, self.rule.order
  106. @property
  107. def children(self):
  108. return [x for x in [self.left, self.right] if x is not None]
  109. def __iter__(self):
  110. return iter([self.left, self.right])
  111. def __eq__(self, other):
  112. if not isinstance(other, PackedNode):
  113. return False
  114. return self is other or (self.left == other.left and self.right == other.right)
  115. def __hash__(self):
  116. return self._hash
  117. def __repr__(self):
  118. if isinstance(self.s, tuple):
  119. rule = self.s[0]
  120. ptr = self.s[1]
  121. before = ( expansion.name for expansion in rule.expansion[:ptr] )
  122. after = ( expansion.name for expansion in rule.expansion[ptr:] )
  123. symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
  124. else:
  125. symbol = self.s.name
  126. return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order)
  127. class ForestVisitor(object):
  128. """
  129. An abstract base class for building forest visitors.
  130. Use this as a base when you need to walk the forest.
  131. """
  132. def visit_token_node(self, node): pass
  133. def visit_symbol_node_in(self, node): pass
  134. def visit_symbol_node_out(self, node): pass
  135. def visit_packed_node_in(self, node): pass
  136. def visit_packed_node_out(self, node): pass
  137. def on_cycle(self, node, get_path): pass
  138. def visit(self, root):
  139. def make_get_path(node):
  140. """Create a function that will return a path from `node` to
  141. the last visited node. Used for the `on_cycle` callback."""
  142. def get_path():
  143. index = len(path) - 1
  144. while id(path[index]) != id(node):
  145. index -= 1
  146. return path[index:]
  147. return get_path
  148. # Visiting is a list of IDs of all symbol/intermediate nodes currently in
  149. # the stack. It serves two purposes: to detect when we 'recurse' in and out
  150. # of a symbol/intermediate so that we can process both up and down. Also,
  151. # since the SPPF can have cycles it allows us to detect if we're trying
  152. # to recurse into a node that's already on the stack (infinite recursion).
  153. visiting = set()
  154. # a list of nodes that are currently being visited
  155. # used for the `on_cycle` callback
  156. path = list()
  157. # We do not use recursion here to walk the Forest due to the limited
  158. # stack size in python. Therefore input_stack is essentially our stack.
  159. input_stack = deque([root])
  160. # It is much faster to cache these as locals since they are called
  161. # many times in large parses.
  162. vpno = getattr(self, 'visit_packed_node_out')
  163. vpni = getattr(self, 'visit_packed_node_in')
  164. vsno = getattr(self, 'visit_symbol_node_out')
  165. vsni = getattr(self, 'visit_symbol_node_in')
  166. vino = getattr(self, 'visit_intermediate_node_out', vsno)
  167. vini = getattr(self, 'visit_intermediate_node_in', vsni)
  168. vtn = getattr(self, 'visit_token_node')
  169. oc = getattr(self, 'on_cycle')
  170. while input_stack:
  171. current = next(reversed(input_stack))
  172. try:
  173. next_node = next(current)
  174. except StopIteration:
  175. input_stack.pop()
  176. continue
  177. except TypeError:
  178. ### If the current object is not an iterator, pass through to Token/SymbolNode
  179. pass
  180. else:
  181. if next_node is None:
  182. continue
  183. if id(next_node) in visiting:
  184. oc(next_node, make_get_path(next_node))
  185. continue
  186. input_stack.append(next_node)
  187. continue
  188. if not isinstance(current, ForestNode):
  189. vtn(current)
  190. input_stack.pop()
  191. continue
  192. current_id = id(current)
  193. if current_id in visiting:
  194. if isinstance(current, PackedNode):
  195. vpno(current)
  196. elif current.is_intermediate:
  197. vino(current)
  198. else:
  199. vsno(current)
  200. input_stack.pop()
  201. path.pop()
  202. visiting.remove(current_id)
  203. continue
  204. else:
  205. visiting.add(current_id)
  206. path.append(current)
  207. if isinstance(current, PackedNode):
  208. next_node = vpni(current)
  209. elif current.is_intermediate:
  210. next_node = vini(current)
  211. else:
  212. next_node = vsni(current)
  213. if next_node is None:
  214. continue
  215. if not isinstance(next_node, ForestNode) and \
  216. not isinstance(next_node, Token):
  217. next_node = iter(next_node)
  218. elif id(next_node) in visiting:
  219. oc(next_node, make_get_path(next_node))
  220. continue
  221. input_stack.append(next_node)
  222. continue
  223. class ForestTransformer(ForestVisitor):
  224. """The base class for a bottom-up forest transformation.
  225. Transformations are applied via inheritance and overriding of the
  226. following methods:
  227. transform_symbol_node
  228. transform_intermediate_node
  229. transform_packed_node
  230. transform_token_node
  231. `transform_token_node` receives a Token as an argument.
  232. All other methods receive the node that is being transformed and
  233. a list of the results of the transformations of that node's children.
  234. The return value of these methods are the resulting transformations.
  235. If `Discard` is raised in a transformation, no data from that node
  236. will be passed to its parent's transformation.
  237. """
  238. def __init__(self):
  239. # results of transformations
  240. self.data = dict()
  241. # used to track parent nodes
  242. self.node_stack = deque()
  243. def transform(self, root):
  244. """Perform a transformation on a Forest."""
  245. self.node_stack.append('result')
  246. self.data['result'] = []
  247. self.visit(root)
  248. assert len(self.data['result']) <= 1
  249. if self.data['result']:
  250. return self.data['result'][0]
  251. def transform_symbol_node(self, node, data):
  252. return node
  253. def transform_intermediate_node(self, node, data):
  254. return node
  255. def transform_packed_node(self, node, data):
  256. return node
  257. def transform_token_node(self, node):
  258. return node
  259. def visit_symbol_node_in(self, node):
  260. self.node_stack.append(id(node))
  261. self.data[id(node)] = []
  262. return node.children
  263. def visit_packed_node_in(self, node):
  264. self.node_stack.append(id(node))
  265. self.data[id(node)] = []
  266. return node.children
  267. def visit_token_node(self, node):
  268. try:
  269. transformed = self.transform_token_node(node)
  270. except Discard:
  271. pass
  272. else:
  273. self.data[self.node_stack[-1]].append(transformed)
  274. def visit_symbol_node_out(self, node):
  275. self.node_stack.pop()
  276. try:
  277. transformed = self.transform_symbol_node(node, self.data[id(node)])
  278. except Discard:
  279. pass
  280. else:
  281. self.data[self.node_stack[-1]].append(transformed)
  282. finally:
  283. del self.data[id(node)]
  284. def visit_intermediate_node_out(self, node):
  285. self.node_stack.pop()
  286. try:
  287. transformed = self.transform_intermediate_node(node, self.data[id(node)])
  288. except Discard:
  289. pass
  290. else:
  291. self.data[self.node_stack[-1]].append(transformed)
  292. finally:
  293. del self.data[id(node)]
  294. def visit_packed_node_out(self, node):
  295. self.node_stack.pop()
  296. try:
  297. transformed = self.transform_packed_node(node, self.data[id(node)])
  298. except Discard:
  299. pass
  300. else:
  301. self.data[self.node_stack[-1]].append(transformed)
  302. finally:
  303. del self.data[id(node)]
  304. class ForestSumVisitor(ForestVisitor):
  305. """
  306. A visitor for prioritizing ambiguous parts of the Forest.
  307. This visitor is used when support for explicit priorities on
  308. rules is requested (whether normal, or invert). It walks the
  309. forest (or subsets thereof) and cascades properties upwards
  310. from the leaves.
  311. It would be ideal to do this during parsing, however this would
  312. require processing each Earley item multiple times. That's
  313. a big performance drawback; so running a forest walk is the
  314. lesser of two evils: there can be significantly more Earley
  315. items created during parsing than there are SPPF nodes in the
  316. final tree.
  317. """
  318. def visit_packed_node_in(self, node):
  319. return iter([node.left, node.right])
  320. def visit_symbol_node_in(self, node):
  321. return iter(node.children)
  322. def visit_packed_node_out(self, node):
  323. priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options.priority else 0
  324. priority += getattr(node.right, 'priority', 0)
  325. priority += getattr(node.left, 'priority', 0)
  326. node.priority = priority
  327. def visit_symbol_node_out(self, node):
  328. node.priority = max(child.priority for child in node.children)
  329. class ForestToParseTree(ForestTransformer):
  330. """Used by the earley parser when ambiguity equals 'resolve' or
  331. 'explicit'. Transforms an SPPF into an (ambiguous) parse tree.
  332. tree_class: The Tree class to use for construction
  333. callbacks: A dictionary of rules to functions that output a tree
  334. prioritizer: A ForestVisitor that manipulates the priorities of
  335. ForestNodes
  336. resolve_ambiguity: If True, ambiguities will be resolved based on
  337. priorities. Otherwise, `_ambig` nodes will be in the resulting
  338. tree.
  339. """
  340. def __init__(self, tree_class=Tree, callbacks=dict(), prioritizer=ForestSumVisitor(), resolve_ambiguity=True):
  341. super(ForestToParseTree, self).__init__()
  342. self.tree_class = tree_class
  343. self.callbacks = callbacks
  344. self.prioritizer = prioritizer
  345. self.resolve_ambiguity = resolve_ambiguity
  346. self._on_cycle_retreat = False
  347. def on_cycle(self, node, get_path):
  348. logger.warning("Cycle encountered in the SPPF at node: %s. "
  349. "As infinite ambiguities cannot be represented in a tree, "
  350. "this family of derivations will be discarded.", node)
  351. if self.resolve_ambiguity:
  352. # TODO: choose a different path if cycle is encountered
  353. logger.warning("At this time, using ambiguity resolution for SPPFs "
  354. "with cycles may result in None being returned.")
  355. self._on_cycle_retreat = True
  356. def _check_cycle(self, node):
  357. if self._on_cycle_retreat:
  358. raise Discard
  359. def _collapse_ambig(self, children):
  360. new_children = []
  361. for child in children:
  362. if hasattr(child, 'data') and child.data == '_ambig':
  363. new_children += child.children
  364. else:
  365. new_children.append(child)
  366. return new_children
  367. def _call_rule_func(self, node, data):
  368. # called when transforming children of symbol nodes
  369. # data is a list of trees or tokens that correspond to the
  370. # symbol's rule expansion
  371. return self.callbacks[node.rule](data)
  372. def _call_ambig_func(self, node, data):
  373. # called when transforming a symbol node
  374. # data is a list of trees where each tree's data is
  375. # equal to the name of the symbol or one of its aliases.
  376. if len(data) > 1:
  377. return self.tree_class('_ambig', data)
  378. elif data:
  379. return data[0]
  380. raise Discard
  381. def transform_symbol_node(self, node, data):
  382. self._check_cycle(node)
  383. data = self._collapse_ambig(data)
  384. return self._call_ambig_func(node, data)
  385. def transform_intermediate_node(self, node, data):
  386. self._check_cycle(node)
  387. if len(data) > 1:
  388. children = [self.tree_class('_inter', c) for c in data]
  389. return self.tree_class('_iambig', children)
  390. return data[0]
  391. def transform_packed_node(self, node, data):
  392. self._check_cycle(node)
  393. children = list()
  394. assert len(data) <= 2
  395. if node.left:
  396. if node.left.is_intermediate and isinstance(data[0], list):
  397. children += data[0]
  398. else:
  399. children.append(data[0])
  400. if len(data) > 1:
  401. children.append(data[1])
  402. elif data:
  403. children.append(data[0])
  404. if node.parent.is_intermediate:
  405. return children
  406. return self._call_rule_func(node, children)
  407. def visit_symbol_node_in(self, node):
  408. self._on_cycle_retreat = False
  409. super(ForestToParseTree, self).visit_symbol_node_in(node)
  410. if self.prioritizer and node.is_ambiguous and isinf(node.priority):
  411. self.prioritizer.visit(node)
  412. if self.resolve_ambiguity:
  413. return node.children[0]
  414. return node.children
  415. def visit_packed_node_in(self, node):
  416. self._on_cycle_retreat = False
  417. return super(ForestToParseTree, self).visit_packed_node_in(node)
  418. def visit_token_node(self, node):
  419. self._on_cycle_retreat = False
  420. return super(ForestToParseTree, self).visit_token_node(node)
  421. class ForestToPyDotVisitor(ForestVisitor):
  422. """
  423. A Forest visitor which writes the SPPF to a PNG.
  424. The SPPF can get really large, really quickly because
  425. of the amount of meta-data it stores, so this is probably
  426. only useful for trivial trees and learning how the SPPF
  427. is structured.
  428. """
  429. def __init__(self, rankdir="TB"):
  430. self.pydot = import_module('pydot')
  431. self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir)
  432. def visit(self, root, filename):
  433. super(ForestToPyDotVisitor, self).visit(root)
  434. self.graph.write_png(filename)
  435. def visit_token_node(self, node):
  436. graph_node_id = str(id(node))
  437. graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"'))
  438. graph_node_color = 0x808080
  439. graph_node_style = "\"filled,rounded\""
  440. graph_node_shape = "diamond"
  441. graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
  442. self.graph.add_node(graph_node)
  443. def visit_packed_node_in(self, node):
  444. graph_node_id = str(id(node))
  445. graph_node_label = repr(node)
  446. graph_node_color = 0x808080
  447. graph_node_style = "filled"
  448. graph_node_shape = "diamond"
  449. graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
  450. self.graph.add_node(graph_node)
  451. return iter([node.left, node.right])
  452. def visit_packed_node_out(self, node):
  453. graph_node_id = str(id(node))
  454. graph_node = self.graph.get_node(graph_node_id)[0]
  455. for child in [node.left, node.right]:
  456. if child is not None:
  457. child_graph_node_id = str(id(child))
  458. child_graph_node = self.graph.get_node(child_graph_node_id)[0]
  459. self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
  460. else:
  461. #### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay.
  462. child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890))
  463. child_graph_node_style = "invis"
  464. child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None")
  465. child_edge_style = "invis"
  466. self.graph.add_node(child_graph_node)
  467. self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style))
  468. def visit_symbol_node_in(self, node):
  469. graph_node_id = str(id(node))
  470. graph_node_label = repr(node)
  471. graph_node_color = 0x808080
  472. graph_node_style = "\"filled\""
  473. if node.is_intermediate:
  474. graph_node_shape = "ellipse"
  475. else:
  476. graph_node_shape = "rectangle"
  477. graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
  478. self.graph.add_node(graph_node)
  479. return iter(node.children)
  480. def visit_symbol_node_out(self, node):
  481. graph_node_id = str(id(node))
  482. graph_node = self.graph.get_node(graph_node_id)[0]
  483. for child in node.children:
  484. child_graph_node_id = str(id(child))
  485. child_graph_node = self.graph.get_node(child_graph_node_id)[0]
  486. self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))