  1. "This module implements an Earley Parser"
  2. # The algorithm keeps track of each state set, using a corresponding Column instance.
  3. # Column keeps track of new items using NewsList instances.
  4. #
  5. # Author: Erez Shinan (2017)
  6. # Email : erezshin@gmail.com
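# Background for readers new to the algorithm (not part of the original
# header): an Earley parser processes the input one token at a time,
# keeping one state set ("column") per input position. Each column holds
# dotted rules ("items"), and three operations drive the parse:
#   predict  - for each item expecting a nonterminal, add fresh items for
#              that nonterminal's rules, anchored at the current column.
#   scan     - for each item expecting a terminal that matches the current
#              token, advance the item into the next column.
#   complete - for each finished item, advance the items in its start
#              column that were waiting on its nonterminal.
# Predict and complete repeat until the column stops growing; then a single
# scan builds the next column.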
from ..common import ParseError, UnexpectedToken, is_terminal
from .grammar_analysis import GrammarAnalyzer


class EndToken:
    type = '$end'

END_TOKEN = EndToken()
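# END_TOKEN is a sentinel passed for the position just past the real input,
# so the final column gets one last predict/complete pass without a scan
# (process_column() skips scanning when the token is END_TOKEN).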
class Item(object):
    def __init__(self, rule, ptr, start, data):
        self.rule = rule
        self.ptr = ptr
        self.start = start
        self.data = data

    @property
    def expect(self):
        return self.rule.expansion[self.ptr]

    @property
    def is_complete(self):
        return self.ptr == len(self.rule.expansion)

    def advance(self, data):
        return Item(self.rule, self.ptr+1, self.start, self.data + [data])

    def __eq__(self, other):
        return self.start == other.start and self.ptr == other.ptr and self.rule == other.rule

    def __hash__(self):
        return hash((self.rule, self.ptr, self.start))

    def __repr__(self):
        before = map(str, self.rule.expansion[:self.ptr])
        after = map(str, self.rule.expansion[self.ptr:])
        return '<(%d) %s : %s * %s>' % (self.start, self.rule.origin, ' '.join(before), ' '.join(after))
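# An Item is a "dotted rule": rule.expansion[:ptr] has already been matched,
# starting at input position `start`, and rule.expansion[ptr:] is still
# expected. For a hypothetical rule 'sum : sum __PLUS product', the item
# with ptr=1 that began at position 0 prints as:
#   <(0) sum : sum * __PLUS product>
# meaning 'sum' was matched and the terminal '__PLUS' is expected next.
# Note that `data` (the matched subtrees) is deliberately excluded from
# __eq__ and __hash__, which is what makes the deduplication in Column.add
# work.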
class NewsList(list):
    "Keeps track of newly added items (append-only)"
    def __init__(self, initial=None):
        list.__init__(self, initial or [])
        self.last_iter = 0

    def get_news(self):
        i = self.last_iter
        self.last_iter = len(self)
        return self[i:]
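# get_news() returns only the items appended since the previous call, so
# process_column() can run predict/complete to a fixpoint without revisiting
# items it already processed. A minimal sketch of the behavior:
#   >>> news = NewsList([1, 2])
#   >>> news.get_news()
#   [1, 2]
#   >>> news.append(3)
#   >>> news.get_news()
#   [3]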
class Column:
    "An entry in the table, aka Earley Chart"
    def __init__(self):
        self.to_reduce = NewsList()
        self.to_predict = NewsList()
        self.to_scan = NewsList()
        self.item_count = 0

        self.added = set()

    def add(self, items):
        """Sort items into scan/predict/reduce newslists

        Makes sure only unique items are added.
        """
        added = self.added
        for item in items:
            if item.is_complete:
                if item in added:
                    continue
                self.to_reduce.append(item)
                added.add(item)
            else:
                if is_terminal(item.expect):
                    self.to_scan.append(item)
                else:
                    if item in added:
                        continue
                    self.to_predict.append(item)
                    added.add(item)

            self.item_count += 1    # Only count if actually added
    def __nonzero__(self):
        return bool(self.item_count)
    __bool__ = __nonzero__   # Python 3: truth-testing ('if not next_set') needs __bool__
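# Note that to_scan is not deduplicated through `added`: two distinct items
# can expect the same terminal, and each must be advanced on a match.
# item_count is what makes an empty column falsy, which process_column()
# uses to detect that no scan succeeded.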
class Parser:
    def __init__(self, parser_conf):
        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
        self.start = parser_conf.start

        self.postprocess = {}
        self.predictions = {}
        for rule in self.analysis.rules:
            if rule.origin != '$root':   # XXX kinda ugly
                a = rule.alias
                self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a)
                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
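    # Two lookup tables are built ahead of parsing:
    #   postprocess maps each rule to the callback that turns its matched
    #     children into a tree node, resolved once here rather than on every
    #     completion.
    #   predictions maps each nonterminal to the rules it can expand to
    #     (recursively, via GrammarAnalyzer.expand_rule), so predict() is a
    #     single dict lookup at parse time.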
    def parse(self, stream, start=None):
        # Define parser functions
        start = start or self.start

        def predict(nonterm, i):
            assert not is_terminal(nonterm), nonterm
            return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]]
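        # predict() is Earley's prediction step: for a nonterminal expected
        # at column i, create one fresh item per candidate rule, with the
        # dot at position 0 and no matched data yet. Predicting a
        # hypothetical 'sum' at column 3 would yield items printing roughly
        # as <(3) sum : * sum __PLUS product>, one per alternative.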
        def complete(item, table):
            name = item.rule.origin
            item.data = self.postprocess[item.rule](item.data)
            return [i.advance(item.data) for i in table[item.start].to_predict if i.expect == name]
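        # complete() is Earley's completion step: once an item has matched
        # its whole rule, build its tree node via the rule's callback, then
        # return advanced copies of every item in the start column that was
        # waiting on this nonterminal. Note the eager postprocessing: with
        # an ambiguous grammar, callbacks may run for parses that are later
        # discarded.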
        def process_column(i, token):
            assert i == len(table)-1
            cur_set = table[i]
            next_set = Column()

            while True:
                to_predict = {x.expect for x in cur_set.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = cur_set.to_reduce.get_news()
                if not (to_predict or to_reduce):
                    break

                for nonterm in to_predict:
                    cur_set.add( predict(nonterm, i) )
                for item in to_reduce:
                    cur_set.add( complete(item, table) )

            if token is not END_TOKEN:
                for item in cur_set.to_scan.get_news():
                    match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type
                    if match:
                        next_set.add([item.advance(stream[i])])

            if not next_set and token is not END_TOKEN:
                expect = {i.expect for i in cur_set.to_scan}
                raise UnexpectedToken(token, expect, stream, i)

            table.append(next_set)
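        # process_column() loops predict/complete until no new items appear
        # (the NewsList bookkeeping ensures each item is handled once, so
        # the loop terminates), then scans the current token into next_set.
        # The `if x.ptr` filter skips items with the dot at position 0:
        # whatever they would predict was already covered when their batch
        # was predicted, since predictions[] is expanded recursively.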
        # Main loop starts
        table = [Column()]
        table[0].add(predict(start, 0))

        for i, char in enumerate(stream):
            process_column(i, char)

        process_column(len(stream), END_TOKEN)

        # Parse ended. Now build a parse tree
        solutions = [n.data for n in table[len(stream)].to_reduce
                     if n.rule.origin==start and n.start==0]

        if not solutions:
            raise ParseError('Incomplete parse: Could not find a solution to input')

        return solutions
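# A rough usage sketch (hypothetical: in practice this Parser is constructed
# by the library's frontend, which builds parser_conf from a grammar). The
# shapes the code above relies on are:
#   parser_conf.rules    - the grammar rules, analyzed by GrammarAnalyzer
#   parser_conf.start    - the start symbol, e.g. 'start'
#   parser_conf.callback - an object whose attributes (looked up by
#                          rule.alias) build tree nodes from matched children
#   stream               - a sequence of tokens, each with a .type attribute
# parse() returns a list of solutions (several if the grammar is ambiguous),
# or raises UnexpectedToken / ParseError on failure.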