This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

676 lines
25 KiB

  1. """This module implements an SPPF (Shared Packed Parse Forest).
  2. This is used as the primary output mechanism for the Earley parser
  3. in order to store complex ambiguities.
  4. Full reference and more details are here:
  5. http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
  6. """
  7. from random import randint
  8. from math import isinf
  9. from collections import deque
  10. from operator import attrgetter
  11. from importlib import import_module
  12. from functools import partial
  13. from ..parse_tree_builder import AmbiguousIntermediateExpander
  14. from ..visitors import Discard
  15. from ..lexer import Token
  16. from ..utils import logger
  17. from ..tree import Tree
  18. class ForestNode(object):
  19. pass
  20. class SymbolNode(ForestNode):
  21. """
  22. A Symbol Node represents a symbol (or Intermediate LR0).
  23. Symbol nodes are keyed by the symbol (s). For intermediate nodes
  24. s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol
  25. nodes, s will be a string representing the non-terminal origin (i.e.
  26. the left hand side of the rule).
  27. The children of a Symbol or Intermediate Node will always be Packed Nodes;
  28. with each Packed Node child representing a single derivation of a production.
  29. Hence a Symbol Node with a single child is unambiguous.
  30. """
  31. __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash')
  32. def __init__(self, s, start, end):
  33. self.s = s
  34. self.start = start
  35. self.end = end
  36. self._children = set()
  37. self.paths = set()
  38. self.paths_loaded = False
  39. ### We use inf here as it can be safely negated without resorting to conditionals,
  40. # unlike None or float('NaN'), and sorts appropriately.
  41. self.priority = float('-inf')
  42. self.is_intermediate = isinstance(s, tuple)
  43. self._hash = hash((self.s, self.start, self.end))
  44. def add_family(self, lr0, rule, start, left, right):
  45. self._children.add(PackedNode(self, lr0, rule, start, left, right))
  46. def add_path(self, transitive, node):
  47. self.paths.add((transitive, node))
  48. def load_paths(self):
  49. for transitive, node in self.paths:
  50. if transitive.next_titem is not None:
  51. vn = SymbolNode(transitive.next_titem.s, transitive.next_titem.start, self.end)
  52. vn.add_path(transitive.next_titem, node)
  53. self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn)
  54. else:
  55. self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node)
  56. self.paths_loaded = True
  57. @property
  58. def is_ambiguous(self):
  59. return len(self.children) > 1
  60. @property
  61. def children(self):
  62. if not self.paths_loaded: self.load_paths()
  63. return sorted(self._children, key=attrgetter('sort_key'))
  64. def __iter__(self):
  65. return iter(self._children)
  66. def __eq__(self, other):
  67. if not isinstance(other, SymbolNode):
  68. return False
  69. return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.end is other.end)
  70. def __hash__(self):
  71. return self._hash
  72. def __repr__(self):
  73. if self.is_intermediate:
  74. rule = self.s[0]
  75. ptr = self.s[1]
  76. before = ( expansion.name for expansion in rule.expansion[:ptr] )
  77. after = ( expansion.name for expansion in rule.expansion[ptr:] )
  78. symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
  79. else:
  80. symbol = self.s.name
  81. return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority)
  82. class PackedNode(ForestNode):
  83. """
  84. A Packed Node represents a single derivation in a symbol node.
  85. """
  86. __slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash')
  87. def __init__(self, parent, s, rule, start, left, right):
  88. self.parent = parent
  89. self.s = s
  90. self.start = start
  91. self.rule = rule
  92. self.left = left
  93. self.right = right
  94. self.priority = float('-inf')
  95. self._hash = hash((self.left, self.right))
  96. @property
  97. def is_empty(self):
  98. return self.left is None and self.right is None
  99. @property
  100. def sort_key(self):
  101. """
  102. Used to sort PackedNode children of SymbolNodes.
  103. A SymbolNode has multiple PackedNodes if it matched
  104. ambiguously. Hence, we use the sort order to identify
  105. the order in which ambiguous children should be considered.
  106. """
  107. return self.is_empty, -self.priority, self.rule.order
  108. @property
  109. def children(self):
  110. return [x for x in [self.left, self.right] if x is not None]
  111. def __iter__(self):
  112. yield self.left
  113. yield self.right
  114. def __eq__(self, other):
  115. if not isinstance(other, PackedNode):
  116. return False
  117. return self is other or (self.left == other.left and self.right == other.right)
  118. def __hash__(self):
  119. return self._hash
  120. def __repr__(self):
  121. if isinstance(self.s, tuple):
  122. rule = self.s[0]
  123. ptr = self.s[1]
  124. before = ( expansion.name for expansion in rule.expansion[:ptr] )
  125. after = ( expansion.name for expansion in rule.expansion[ptr:] )
  126. symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
  127. else:
  128. symbol = self.s.name
  129. return "({}, {}, {}, {})".format(symbol, self.start, self.priority, self.rule.order)
  130. class ForestVisitor(object):
  131. """
  132. An abstract base class for building forest visitors.
  133. Use this as a base when you need to walk the forest.
  134. """
  135. def get_cycle_in_path(self, node, path):
  136. index = len(path) - 1
  137. while id(path[index]) != id(node):
  138. index -= 1
  139. return path[index:]
  140. def visit_token_node(self, node): pass
  141. def visit_symbol_node_in(self, node): pass
  142. def visit_symbol_node_out(self, node): pass
  143. def visit_packed_node_in(self, node): pass
  144. def visit_packed_node_out(self, node): pass
  145. def on_cycle(self, node, path):
  146. """Called when a cycle is encountered. `node` is the node that causes
  147. the cycle. `path` the list of nodes being visited: nodes that have been
  148. entered but not exited. The first element is the root in a forest
  149. visit, and the last element is the node visited most recently.
  150. `path` should be treated as read-only. The utility function
  151. `get_cycle_in_path` may be used to obtain a slice of `path` that only
  152. contains the nodes that make up the cycle."""
  153. pass
  154. def visit(self, root):
  155. # Visiting is a list of IDs of all symbol/intermediate nodes currently in
  156. # the stack. It serves two purposes: to detect when we 'recurse' in and out
  157. # of a symbol/intermediate so that we can process both up and down. Also,
  158. # since the SPPF can have cycles it allows us to detect if we're trying
  159. # to recurse into a node that's already on the stack (infinite recursion).
  160. visiting = set()
  161. # a list of nodes that are currently being visited
  162. # used for the `on_cycle` callback
  163. path = []
  164. # We do not use recursion here to walk the Forest due to the limited
  165. # stack size in python. Therefore input_stack is essentially our stack.
  166. input_stack = deque([root])
  167. # It is much faster to cache these as locals since they are called
  168. # many times in large parses.
  169. vpno = getattr(self, 'visit_packed_node_out')
  170. vpni = getattr(self, 'visit_packed_node_in')
  171. vsno = getattr(self, 'visit_symbol_node_out')
  172. vsni = getattr(self, 'visit_symbol_node_in')
  173. vino = getattr(self, 'visit_intermediate_node_out', vsno)
  174. vini = getattr(self, 'visit_intermediate_node_in', vsni)
  175. vtn = getattr(self, 'visit_token_node')
  176. oc = getattr(self, 'on_cycle')
  177. while input_stack:
  178. current = next(reversed(input_stack))
  179. try:
  180. next_node = next(current)
  181. except StopIteration:
  182. input_stack.pop()
  183. continue
  184. except TypeError:
  185. ### If the current object is not an iterator, pass through to Token/SymbolNode
  186. pass
  187. else:
  188. if next_node is None:
  189. continue
  190. if id(next_node) in visiting:
  191. oc(next_node, path)
  192. continue
  193. input_stack.append(next_node)
  194. continue
  195. if not isinstance(current, ForestNode):
  196. vtn(current)
  197. input_stack.pop()
  198. continue
  199. current_id = id(current)
  200. if current_id in visiting:
  201. if isinstance(current, PackedNode):
  202. vpno(current)
  203. elif current.is_intermediate:
  204. vino(current)
  205. else:
  206. vsno(current)
  207. input_stack.pop()
  208. path.pop()
  209. visiting.remove(current_id)
  210. continue
  211. else:
  212. visiting.add(current_id)
  213. path.append(current)
  214. if isinstance(current, PackedNode):
  215. next_node = vpni(current)
  216. elif current.is_intermediate:
  217. next_node = vini(current)
  218. else:
  219. next_node = vsni(current)
  220. if next_node is None:
  221. continue
  222. if not isinstance(next_node, ForestNode) and \
  223. not isinstance(next_node, Token):
  224. next_node = iter(next_node)
  225. elif id(next_node) in visiting:
  226. oc(next_node, path)
  227. continue
  228. input_stack.append(next_node)
  229. continue
  230. class ForestTransformer(ForestVisitor):
  231. """The base class for a bottom-up forest transformation.
  232. Transformations are applied via inheritance and overriding of the
  233. following methods:
  234. transform_symbol_node
  235. transform_intermediate_node
  236. transform_packed_node
  237. transform_token_node
  238. `transform_token_node` receives a Token as an argument.
  239. All other methods receive the node that is being transformed and
  240. a list of the results of the transformations of that node's children.
  241. The return value of these methods are the resulting transformations.
  242. If `Discard` is raised in a transformation, no data from that node
  243. will be passed to its parent's transformation.
  244. """
  245. def __init__(self):
  246. # results of transformations
  247. self.data = dict()
  248. # used to track parent nodes
  249. self.node_stack = deque()
  250. def transform(self, root):
  251. """Perform a transformation on a Forest."""
  252. self.node_stack.append('result')
  253. self.data['result'] = []
  254. self.visit(root)
  255. assert len(self.data['result']) <= 1
  256. if self.data['result']:
  257. return self.data['result'][0]
  258. def transform_symbol_node(self, node, data):
  259. return node
  260. def transform_intermediate_node(self, node, data):
  261. return node
  262. def transform_packed_node(self, node, data):
  263. return node
  264. def transform_token_node(self, node):
  265. return node
  266. def visit_symbol_node_in(self, node):
  267. self.node_stack.append(id(node))
  268. self.data[id(node)] = []
  269. return node.children
  270. def visit_packed_node_in(self, node):
  271. self.node_stack.append(id(node))
  272. self.data[id(node)] = []
  273. return node.children
  274. def visit_token_node(self, node):
  275. try:
  276. transformed = self.transform_token_node(node)
  277. except Discard:
  278. pass
  279. else:
  280. self.data[self.node_stack[-1]].append(transformed)
  281. def visit_symbol_node_out(self, node):
  282. self.node_stack.pop()
  283. try:
  284. transformed = self.transform_symbol_node(node, self.data[id(node)])
  285. except Discard:
  286. pass
  287. else:
  288. self.data[self.node_stack[-1]].append(transformed)
  289. finally:
  290. del self.data[id(node)]
  291. def visit_intermediate_node_out(self, node):
  292. self.node_stack.pop()
  293. try:
  294. transformed = self.transform_intermediate_node(node, self.data[id(node)])
  295. except Discard:
  296. pass
  297. else:
  298. self.data[self.node_stack[-1]].append(transformed)
  299. finally:
  300. del self.data[id(node)]
  301. def visit_packed_node_out(self, node):
  302. self.node_stack.pop()
  303. try:
  304. transformed = self.transform_packed_node(node, self.data[id(node)])
  305. except Discard:
  306. pass
  307. else:
  308. self.data[self.node_stack[-1]].append(transformed)
  309. finally:
  310. del self.data[id(node)]
  311. class ForestSumVisitor(ForestVisitor):
  312. """
  313. A visitor for prioritizing ambiguous parts of the Forest.
  314. This visitor is used when support for explicit priorities on
  315. rules is requested (whether normal, or invert). It walks the
  316. forest (or subsets thereof) and cascades properties upwards
  317. from the leaves.
  318. It would be ideal to do this during parsing, however this would
  319. require processing each Earley item multiple times. That's
  320. a big performance drawback; so running a forest walk is the
  321. lesser of two evils: there can be significantly more Earley
  322. items created during parsing than there are SPPF nodes in the
  323. final tree.
  324. """
  325. def visit_packed_node_in(self, node):
  326. yield node.left
  327. yield node.right
  328. def visit_symbol_node_in(self, node):
  329. return iter(node.children)
  330. def visit_packed_node_out(self, node):
  331. priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options.priority else 0
  332. priority += getattr(node.right, 'priority', 0)
  333. priority += getattr(node.left, 'priority', 0)
  334. node.priority = priority
  335. def visit_symbol_node_out(self, node):
  336. node.priority = max(child.priority for child in node.children)
  337. class PackedData():
  338. """Used in transformationss of packed nodes to distinguish the data
  339. that comes from the left child and the right child.
  340. """
  341. def __init__(self, node, data):
  342. self.left = None
  343. self.right = None
  344. if data:
  345. if node.left:
  346. self.left = data[0]
  347. if len(data) > 1 and node.right:
  348. self.right = data[1]
  349. elif node.right:
  350. self.right = data[0]
  351. class ForestToParseTree(ForestTransformer):
  352. """Used by the earley parser when ambiguity equals 'resolve' or
  353. 'explicit'. Transforms an SPPF into an (ambiguous) parse tree.
  354. tree_class: The Tree class to use for construction
  355. callbacks: A dictionary of rules to functions that output a tree
  356. prioritizer: A ForestVisitor that manipulates the priorities of
  357. ForestNodes
  358. resolve_ambiguity: If True, ambiguities will be resolved based on
  359. priorities. Otherwise, `_ambig` nodes will be in the resulting
  360. tree.
  361. """
  362. def __init__(self, tree_class=Tree, callbacks=dict(), prioritizer=ForestSumVisitor(), resolve_ambiguity=True):
  363. super(ForestToParseTree, self).__init__()
  364. self.tree_class = tree_class
  365. self.callbacks = callbacks
  366. self.prioritizer = prioritizer
  367. self.resolve_ambiguity = resolve_ambiguity
  368. self._on_cycle_retreat = False
  369. def on_cycle(self, node, path):
  370. logger.warning("Cycle encountered in the SPPF at node: %s. "
  371. "As infinite ambiguities cannot be represented in a tree, "
  372. "this family of derivations will be discarded.", node)
  373. if self.resolve_ambiguity:
  374. # TODO: choose a different path if cycle is encountered
  375. logger.warning("At this time, using ambiguity resolution for SPPFs "
  376. "with cycles may result in None being returned.")
  377. self._on_cycle_retreat = True
  378. def _check_cycle(self, node):
  379. if self._on_cycle_retreat:
  380. raise Discard()
  381. def _collapse_ambig(self, children):
  382. new_children = []
  383. for child in children:
  384. if hasattr(child, 'data') and child.data == '_ambig':
  385. new_children += child.children
  386. else:
  387. new_children.append(child)
  388. return new_children
  389. def _call_rule_func(self, node, data):
  390. # called when transforming children of symbol nodes
  391. # data is a list of trees or tokens that correspond to the
  392. # symbol's rule expansion
  393. return self.callbacks[node.rule](data)
  394. def _call_ambig_func(self, node, data):
  395. # called when transforming a symbol node
  396. # data is a list of trees where each tree's data is
  397. # equal to the name of the symbol or one of its aliases.
  398. if len(data) > 1:
  399. return self.tree_class('_ambig', data)
  400. elif data:
  401. return data[0]
  402. raise Discard()
  403. def transform_symbol_node(self, node, data):
  404. self._check_cycle(node)
  405. data = self._collapse_ambig(data)
  406. return self._call_ambig_func(node, data)
  407. def transform_intermediate_node(self, node, data):
  408. self._check_cycle(node)
  409. if len(data) > 1:
  410. children = [self.tree_class('_inter', c) for c in data]
  411. return self.tree_class('_iambig', children)
  412. return data[0]
  413. def transform_packed_node(self, node, data):
  414. self._check_cycle(node)
  415. children = []
  416. assert len(data) <= 2
  417. data = PackedData(node, data)
  418. if data.left is not None:
  419. if node.left.is_intermediate and isinstance(data.left, list):
  420. children += data.left
  421. else:
  422. children.append(data.left)
  423. if data.right is not None:
  424. children.append(data.right)
  425. if node.parent.is_intermediate:
  426. return children
  427. return self._call_rule_func(node, children)
  428. def visit_symbol_node_in(self, node):
  429. self._on_cycle_retreat = False
  430. super(ForestToParseTree, self).visit_symbol_node_in(node)
  431. if self.prioritizer and node.is_ambiguous and isinf(node.priority):
  432. self.prioritizer.visit(node)
  433. if self.resolve_ambiguity:
  434. return node.children[0]
  435. return node.children
  436. def visit_packed_node_in(self, node):
  437. self._on_cycle_retreat = False
  438. return super(ForestToParseTree, self).visit_packed_node_in(node)
  439. def visit_token_node(self, node):
  440. self._on_cycle_retreat = False
  441. return super(ForestToParseTree, self).visit_token_node(node)
  442. def handles_ambiguity(func):
  443. """Decorator for methods of subclasses of TreeForestTransformer.
  444. Denotes that the method should receive a list of transformed derivations."""
  445. func.handles_ambiguity = True
  446. return func
  447. class TreeForestTransformer(ForestToParseTree):
  448. """A ForestTransformer with a tree-Transformer-like interface.
  449. By default, it will construct a tree.
  450. Methods provided via inheritance are called based on the rule/symbol
  451. names of nodes in the forest.
  452. Methods that act on rules will receive a list of the results of the
  453. transformations of the rule's children. By default, trees and tokens.
  454. Methods that act on tokens will receive a Token.
  455. Alternatively, methods that act on rules may be annotated with
  456. `handles_ambiguity`. In this case, the function will receive a list
  457. of all the transformations of all the derivations of the rule.
  458. By default, a list of trees where each tree.data is equal to the
  459. rule name or one of its aliases.
  460. Non-tree transformations are made possible by override of
  461. `__default__`, `__default_token__`, and `__default_ambig__`.
  462. """
  463. def __init__(self, tree_class=Tree, prioritizer=ForestSumVisitor(), resolve_ambiguity=True):
  464. super(TreeForestTransformer, self).__init__(tree_class, dict(), prioritizer, resolve_ambiguity)
  465. def __default__(self, name, data):
  466. """Default operation on tree (for override).
  467. Returns a tree with name with data as children.
  468. """
  469. return self.tree_class(name, data)
  470. def __default_ambig__(self, name, data):
  471. """Default operation on ambiguous rule (for override).
  472. Wraps data in an '_ambig_ node if it contains more than
  473. one element.'
  474. """
  475. if len(data) > 1:
  476. return self.tree_class('_ambig', data)
  477. elif data:
  478. return data[0]
  479. raise Discard()
  480. def __default_token__(self, node):
  481. """Default operation on Token (for override).
  482. Returns node
  483. """
  484. return node
  485. def transform_token_node(self, node):
  486. return getattr(self, node.type, self.__default_token__)(node)
  487. def _call_rule_func(self, node, data):
  488. name = node.rule.alias or node.rule.options.template_source or node.rule.origin.name
  489. user_func = getattr(self, name, self.__default__)
  490. if user_func == self.__default__ or hasattr(user_func, 'handles_ambiguity'):
  491. user_func = partial(self.__default__, name)
  492. if not self.resolve_ambiguity:
  493. wrapper = partial(AmbiguousIntermediateExpander, self.tree_class)
  494. user_func = wrapper(user_func)
  495. return user_func(data)
  496. def _call_ambig_func(self, node, data):
  497. name = node.s.name
  498. user_func = getattr(self, name, self.__default_ambig__)
  499. if user_func == self.__default_ambig__ or not hasattr(user_func, 'handles_ambiguity'):
  500. user_func = partial(self.__default_ambig__, name)
  501. return user_func(data)
  502. class ForestToPyDotVisitor(ForestVisitor):
  503. """
  504. A Forest visitor which writes the SPPF to a PNG.
  505. The SPPF can get really large, really quickly because
  506. of the amount of meta-data it stores, so this is probably
  507. only useful for trivial trees and learning how the SPPF
  508. is structured.
  509. """
  510. def __init__(self, rankdir="TB"):
  511. self.pydot = import_module('pydot')
  512. self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir)
  513. def visit(self, root, filename):
  514. super(ForestToPyDotVisitor, self).visit(root)
  515. self.graph.write_png(filename)
  516. def visit_token_node(self, node):
  517. graph_node_id = str(id(node))
  518. graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"'))
  519. graph_node_color = 0x808080
  520. graph_node_style = "\"filled,rounded\""
  521. graph_node_shape = "diamond"
  522. graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
  523. self.graph.add_node(graph_node)
  524. def visit_packed_node_in(self, node):
  525. graph_node_id = str(id(node))
  526. graph_node_label = repr(node)
  527. graph_node_color = 0x808080
  528. graph_node_style = "filled"
  529. graph_node_shape = "diamond"
  530. graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
  531. self.graph.add_node(graph_node)
  532. yield node.left
  533. yield node.right
  534. def visit_packed_node_out(self, node):
  535. graph_node_id = str(id(node))
  536. graph_node = self.graph.get_node(graph_node_id)[0]
  537. for child in [node.left, node.right]:
  538. if child is not None:
  539. child_graph_node_id = str(id(child))
  540. child_graph_node = self.graph.get_node(child_graph_node_id)[0]
  541. self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
  542. else:
  543. #### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay.
  544. child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890))
  545. child_graph_node_style = "invis"
  546. child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None")
  547. child_edge_style = "invis"
  548. self.graph.add_node(child_graph_node)
  549. self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style))
  550. def visit_symbol_node_in(self, node):
  551. graph_node_id = str(id(node))
  552. graph_node_label = repr(node)
  553. graph_node_color = 0x808080
  554. graph_node_style = "\"filled\""
  555. if node.is_intermediate:
  556. graph_node_shape = "ellipse"
  557. else:
  558. graph_node_shape = "rectangle"
  559. graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
  560. self.graph.add_node(graph_node)
  561. return iter(node.children)
  562. def visit_symbol_node_out(self, node):
  563. graph_node_id = str(id(node))
  564. graph_node = self.graph.get_node(graph_node_id)[0]
  565. for child in node.children:
  566. child_graph_node_id = str(id(child))
  567. child_graph_node = self.graph.get_node(child_graph_node_id)[0]
  568. self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))