  1. "This module implements an Earley Parser"
  2. # The algorithm keeps track of each state set, using a corresponding Column instance.
  3. # Column keeps track of new items using NewsList instances.
  4. #
  5. # Author: Erez Shinan (2017)
  6. # Email : erezshin@gmail.com
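#
# How it works: an Earley parser repeatedly applies three operations to each
# state set ("column"):
#   predict  - for an expected nonterminal, add a fresh item for each of its rules
#   scan     - match an expected terminal against the current input token
#   complete - when a rule finishes, advance every parent item that was
#              waiting for it in the column where it was predicted
# The to_predict / to_scan / to_reduce lists below feed these three operations.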

from ..common import ParseError, UnexpectedToken, is_terminal
from .grammar_analysis import GrammarAnalyzer


class EndToken:
    # Sentinel fed to the final column (see parse() below), so that pending
    # reductions run one last time without attempting another scan.
    type = '$end'

END_TOKEN = EndToken()

class Item(object):
    """A "dotted rule": progress through a single grammar rule.

    rule  -- the rule being matched
    ptr   -- the dot: how many symbols of rule.expansion are already matched
    start -- the Column where this rule was predicted (compared by identity)
    data  -- parse-tree children collected so far
    """
    def __init__(self, rule, ptr, start, data):
        self.rule = rule
        self.ptr = ptr
        self.start = start
        self.data = data

    @property
    def expect(self):
        return self.rule.expansion[self.ptr]

    @property
    def is_complete(self):
        return self.ptr == len(self.rule.expansion)

    def advance(self, data):
        "Return a new item with the dot moved past one matched symbol"
        return Item(self.rule, self.ptr+1, self.start, self.data + [data])

    def __eq__(self, other):
        return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
    def __hash__(self):
        return hash((self.rule, self.ptr, id(self.start)))

    def __repr__(self):
        before = map(str, self.rule.expansion[:self.ptr])
        after = map(str, self.rule.expansion[self.ptr:])
        # self.start is a Column instance, not an index, so format with %s
        # (the original %d would raise TypeError)
        return '<(%s) %s : %s * %s>' % (self.start, self.rule.origin, ' '.join(before), ' '.join(after))
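
# For example (hypothetical rule, for illustration only): an item for
# "sum : sum ADD term" with ptr=2 prints roughly as
#     <(<Column ...>) sum : sum ADD * term>
# Everything left of '*' has been matched; everything right of it is expected.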


class NewsList(list):
    "Keeps track of newly added items (append-only)"
    def __init__(self, initial=None):
        list.__init__(self, initial or [])
        self.last_iter = 0

    def get_news(self):
        "Return only the items appended since the previous call"
        i = self.last_iter
        self.last_iter = len(self)
        return self[i:]
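
# NewsList is what lets process_column() below run prediction and completion
# to a fixpoint: each pass over a column consumes only the items appended
# since the last get_news() call, so nothing is processed twice.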


class Column:
    "An entry in the table, aka Earley Chart"
    def __init__(self):
        self.to_reduce = NewsList()
        self.to_predict = NewsList()
        self.to_scan = NewsList()
        self.item_count = 0

        self.added = set()

    def add(self, items):
        """Sort items into scan/predict/reduce newslists

        Makes sure only unique items are added.
        """
        added = self.added
        for item in items:
            if item.is_complete:
                # if item in added:     # XXX This causes a bug with empty rules
                #     continue          # And might be unnecessary
                # added.add(item)
                self.to_reduce.append(item)
            else:
                if is_terminal(item.expect):
                    self.to_scan.append(item)
                else:
                    if item in added:
                        continue
                    added.add(item)
                    self.to_predict.append(item)

            self.item_count += 1    # Only count if actually added

    def __nonzero__(self):          # Python 2 truth protocol
        return bool(self.item_count)
    __bool__ = __nonzero__          # Python 3 name for the same protocol; without
                                    # this alias a Column is always truthy on py3

class Parser:
    def __init__(self, parser_conf):
        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
        self.start = parser_conf.start

        self.postprocess = {}
        self.predictions = {}
        for rule in self.analysis.rules:
            if rule.origin != '$root':  # XXX kinda ugly
                a = rule.alias
                self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a)
                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
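
    # self.predictions maps each nonterminal to the full set of rules it can
    # expand to (precomputed here via GrammarAnalyzer.expand_rule), so
    # prediction during parsing is a single dict lookup.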

    def parse(self, stream, start=None):
        # Define parser functions
        start = start or self.start

        def predict(nonterm, i):
            "Earley prediction: seed new items, starting in column i, for every rule of nonterm"
            assert not is_terminal(nonterm), nonterm
            return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]]

        def complete(item):
            "Earley completion: advance every parent item that was waiting for this rule"
            name = item.rule.origin
            item.data = self.postprocess[item.rule](item.data)
            return [i.advance(item.data) for i in item.start.to_predict if i.expect == name]
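
        # Completion also builds the parse tree bottom-up: postprocess[rule]
        # is the user callback that turns the collected children (item.data)
        # into a tree node before the parents absorb it via advance().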

        def process_column(i, token, cur_set):
            "Run prediction and completion to a fixpoint, then scan token into the next column"
            next_set = Column()

            while True:
                to_predict = {x.expect for x in cur_set.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = cur_set.to_reduce.get_news()
                if not (to_predict or to_reduce):
                    break

                for nonterm in to_predict:
                    cur_set.add( predict(nonterm, cur_set) )
                for item in to_reduce:
                    cur_set.add( complete(item) )

            if token is not END_TOKEN:
                for item in cur_set.to_scan.get_news():
                    match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type
                    if match:
                        next_set.add([item.advance(stream[i])])

            if not next_set and token is not END_TOKEN:
                expect = {i.expect for i in cur_set.to_scan}
                raise UnexpectedToken(token, expect, stream, i)

            return cur_set, next_set

        # Main loop starts
        column0 = Column()
        column0.add(predict(start, column0))

        cur_set = column0
        for i, token in enumerate(stream):
            _, cur_set = process_column(i, token, cur_set)

        last_set, _ = process_column(len(stream), END_TOKEN, cur_set)

        # Parse ended. Now build a parse tree
        solutions = [n.data for n in last_set.to_reduce
                     if n.rule.origin == start and n.start is column0]

        if not solutions:
            raise ParseError('Incomplete parse: Could not find a solution to input')

        return solutions
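
# Usage sketch (illustrative -- assumes only what __init__ reads above: a
# parser_conf object with .rules, .start and .callback, and a stream of
# tokens carrying a .type attribute):
#
#     parser = Parser(parser_conf)
#     solutions = parser.parse(tokens)
#
# parse() returns one tree per valid derivation, so more than one element
# means the input was ambiguous under the grammar.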