  1. "This module implements an experimental Earley Parser with a dynamic lexer"
  2. # The parser uses a parse-forest to keep track of derivations and ambiguations.
  3. # When the parse ends successfully, a disambiguation stage resolves all ambiguity
  4. # (right now ambiguity resolution is not developed beyond the needs of lark)
  5. # Afterwards the parse tree is reduced (transformed) according to user callbacks.
  6. # I use the no-recursion version of Transformer and Visitor, because the tree might be
  7. # deeper than Python's recursion limit (a bit absurd, but that's life)
  8. #
  9. # The algorithm keeps track of each state set, using a corresponding Column instance.
  10. # Column keeps track of new items using NewsList instances.
  11. #
  12. # Instead of running a lexer beforehand, or using a costy char-by-char method, this parser
  13. # uses regular expressions by necessity, achieving high-performance while maintaining all of
  14. # Earley's power in parsing any CFG.
  15. #
  16. #
  17. # Author: Erez Shinan (2017)
  18. # Email : erezshin@gmail.com
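#
# A note on the dynamic-lexer idea (the pattern and input below are hypothetical,
# chosen only to illustrate the mechanism): terminals are matched by regex during
# parsing, at the current stream position, rather than by a separate tokenizer
# pass. Given a terminal /a+/ and the input "aaa", scanning can yield candidate
# tokens "a", "aa" and "aaa"; each candidate is recorded in the Earley column
# where it ends, and only the choices that lead to a complete parse survive.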
from collections import defaultdict

from ..common import ParseError, UnexpectedToken, Terminal
from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer
from .earley import ResolveAmbig, ApplyCallbacks, Item, NewsList, Derivation, END_TOKEN, Column
class Parser:
    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=True, ignore=()):
        self.analysis = GrammarAnalyzer(rules, start_symbol)
        self.start_symbol = start_symbol
        self.resolve_ambiguity = resolve_ambiguity
        self.ignore = list(ignore)

        self.postprocess = {}
        self.predictions = {}
        for rule in self.analysis.rules:
            if rule.origin != '$root':  # XXX kinda ugly
                a = rule.alias
                self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
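
        # What the two tables built above hold (rule names here are hypothetical,
        # for illustration only):
        #   self.predictions['sum'] -> the rules GrammarAnalyzer.expand_rule
        #                              yields for 'sum', used by predict() below
        #   self.postprocess[rule]  -> the callback applied to `rule`'s node when
        #                              the finished tree is transformed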
    def parse(self, stream, start_symbol=None):
        # Define parser functions
        start_symbol = start_symbol or self.start_symbol

        # Matches that started at an earlier position but end at a later one,
        # keyed by the stream position where the match ends.
        delayed_matches = defaultdict(list)

        def predict(nonterm, column):
            assert not isinstance(nonterm, Terminal), nonterm
            return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]

        def complete(item):
            name = item.rule.origin
            return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
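
        # How the two operations interact (dotted items shown for a hypothetical
        # grammar): predict('sum', column_k) seeds fresh items such as
        #   sum -> . sum PLUS product   (start = column_k)
        # while complete() on a finished item  sum -> product .  (start = column_j)
        # advances every item in column_j that was waiting on 'sum'.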
        def predict_and_complete(column):
            while True:
                to_predict = {x.expect for x in column.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = column.to_reduce.get_news()
                if not (to_predict or to_reduce):
                    break

                for nonterm in to_predict:
                    column.add( predict(nonterm, column) )
                for item in to_reduce:
                    column.add( complete(item) )
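
        # get_news() hands back only the items added to the column since its last
        # call (see the NewsList note in the module header), so the loop above is
        # a fixpoint computation: it keeps predicting and completing until a full
        # pass adds nothing new.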
        def scan(i, token, column):
            for x in self.ignore:
                m = x.match(stream, i)
                if m:
                    return column

            to_scan = column.to_scan.get_news()
            for item in to_scan:
                m = item.expect.match(stream, i)
                if m:
                    delayed_matches[m.end()].append(item.advance(m.group(0)))

                    s = m.group(0)
                    for j in range(1, len(s)):
                        m = item.expect.match(s[:-j])
                        if m:
                            # m.end() here is relative to s, which begins at stream
                            # position i, so offset it back to a stream position.
                            delayed_matches[i + m.end()].append(item.advance(m.group(0)))

            next_set = Column(i+1)
            next_set.add(delayed_matches[i+1])
            del delayed_matches[i+1]  # No longer needed, so unburden memory

            return next_set
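
        # Concrete effect of the prefix loop in scan() (terminal and input are
        # hypothetical): with item.expect = /a+/ and stream "aaa", scanning at
        # i=0 matches "aaa", "aa" and "a", scheduling the advanced items under
        # delayed_matches[3], [2] and [1]; each lands in the column where that
        # candidate token ends.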
        # Main loop starts
        column0 = Column(0)
        column0.add(predict(start_symbol, column0))

        column = column0
        for i, token in enumerate(stream):
            predict_and_complete(column)
            column = scan(i, token, column)

        predict_and_complete(column)

        # Parse ended. Now build a parse tree
        solutions = [n.tree for n in column.to_reduce
                     if n.rule.origin == start_symbol and n.start is column0]

        if not solutions:
            raise ParseError('Incomplete parse: Could not find a solution to input')
        elif len(solutions) == 1:
            tree = solutions[0]
        else:
            tree = Tree('_ambig', solutions)

        if self.resolve_ambiguity:
            ResolveAmbig().visit(tree)

        return ApplyCallbacks(self.postprocess).transform(tree)
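
# A usage sketch (none of the setup below is part of this module; rule
# construction and the callback object come from lark's internal machinery,
# so treat every name here as an assumption, not a working recipe):
#
#     rules = build_rules_somehow()            # hypothetical helper
#     parser = Parser(rules, start_symbol='start', callback=my_callbacks,
#                     resolve_ambiguity=True, ignore=[WS_PATTERN])
#     tree = parser.parse("1+2*3")             # stream is consumed char by char
#
# parse() returns the callback-transformed tree; when multiple derivations
# survive and resolve_ambiguity is False, they are grouped under a
# Tree('_ambig', ...) node, as built in the code above.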