  1. "This module implements an Earley Parser"
  2. # The algorithm keeps track of each state set, using a corresponding Column instance.
  3. # Column keeps track of new items using NewsList instances.
  4. #
  5. # Author: Erez Shinan (2017)
  6. # Email : erezshin@gmail.com
  7. from ..common import ParseError, UnexpectedToken, is_terminal
  8. from .grammar_analysis import GrammarAnalyzer
  9. class EndToken:
  10. type = '$end'
  11. END_TOKEN = EndToken()
class Item(object):
    def __init__(self, rule, ptr, start, data):
        self.rule = rule
        self.ptr = ptr
        self.start = start
        self.data = data

    @property
    def expect(self):
        return self.rule.expansion[self.ptr]

    @property
    def is_complete(self):
        return self.ptr == len(self.rule.expansion)

    def advance(self, data):
        return Item(self.rule, self.ptr+1, self.start, self.data + [data])

    def __eq__(self, other):
        return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule

    def __hash__(self):
        return hash((self.rule, self.ptr, id(self.start)))

    def __repr__(self):
        before = map(str, self.rule.expansion[:self.ptr])
        after = map(str, self.rule.expansion[self.ptr:])
        return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
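
# The dotted-item mechanics above can be seen in isolation with a stub rule.
# This is only a sketch: _StubRule and the symbol names are assumptions that
# stand in for lark's real Rule objects.
class _StubRule:
    def __init__(self, origin, expansion):
        self.origin = origin
        self.expansion = expansion

_item = Item(_StubRule('sum', ['NUMBER', 'PLUS', 'NUMBER']), 0, None, [])
assert _item.expect == 'NUMBER'                        # dot sits before the first symbol
_item = _item.advance('1').advance('+').advance('2')
assert _item.is_complete                               # dot has passed the last symbol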
class NewsList(list):
    "Keeps track of newly added items (append-only)"
    def __init__(self, initial=None):
        list.__init__(self, initial or [])
        self.last_iter = 0

    def get_news(self):
        i = self.last_iter
        self.last_iter = len(self)
        return self[i:]
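
# A quick sketch of the NewsList contract: get_news() returns only the items
# appended since the previous call. (Illustration, not part of the module's API.)
_news = NewsList([1, 2])
assert _news.get_news() == [1, 2]    # everything is new on the first call
_news.append(3)
assert _news.get_news() == [3]       # only the freshly appended item
assert _news.get_news() == []        # nothing new since then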
class Column:
    "An entry in the table, aka Earley Chart"
    def __init__(self):
        self.to_reduce = NewsList()
        self.to_predict = NewsList()
        self.to_scan = NewsList()
        self.item_count = 0

        self.added = set()

    def add(self, items):
        """Sort items into scan/predict/reduce newslists

        Makes sure only unique items are added.
        """
        added = self.added
        for item in items:
            if item.is_complete:
                # (We must allow repetition of empty rules)
                # if item.rule.expansion:

                    # This is an important test to avoid infinite-loops,
                    # For example for the rule:
                    #   a: a | "b"
                    # If we can detect these cases statically, we can remove
                    # this test and gain a tiny performance boost
                    #
                    # if item in added:
                    #     continue
                    # added.add(item)

                self.to_reduce.append(item)
            else:
                if is_terminal(item.expect):
                    self.to_scan.append(item)
                else:
                    if item in added:
                        continue
                    added.add(item)
                    self.to_predict.append(item)

            self.item_count += 1    # Only count if actually added

    def __nonzero__(self):
        return bool(self.item_count)
    __bool__ = __nonzero__    # Python 3 name for the truth-value hook; the "if not next_set" check below relies on it
class Parser:
    def __init__(self, parser_conf):
        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
        self.start = parser_conf.start

        self.postprocess = {}
        self.predictions = {}
        for rule in self.analysis.rules:
            if rule.origin != '$root':  # XXX kinda ugly
                a = rule.alias
                self.postprocess[rule] = a if callable(a) else getattr(parser_conf.callback, a)
                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]

    def parse(self, stream, start=None):
        # Define parser functions
        start = start or self.start

        def predict(nonterm, i):
            assert not is_terminal(nonterm), nonterm
            return [Item(rule, 0, i, []) for rule in self.predictions[nonterm]]

        def complete(item):
            name = item.rule.origin
            item.data = self.postprocess[item.rule](item.data)
            return [i.advance(item.data) for i in item.start.to_predict if i.expect == name]

        def process_column(i, token, cur_set):
            next_set = Column()

            while True:
                to_predict = {x.expect for x in cur_set.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = cur_set.to_reduce.get_news()
                if not (to_predict or to_reduce):
                    break

                for nonterm in to_predict:
                    cur_set.add( predict(nonterm, cur_set) )
                for item in to_reduce:
                    cur_set.add( complete(item) )

            if token is not END_TOKEN:
                for item in cur_set.to_scan.get_news():
                    match = item.expect[0](token) if callable(item.expect[0]) else item.expect[0] == token.type
                    if match:
                        next_set.add([item.advance(stream[i])])

            if not next_set and token is not END_TOKEN:
                expect = {i.expect[-1] for i in cur_set.to_scan}
                raise UnexpectedToken(token, expect, stream, i)

            return cur_set, next_set

        # Main loop starts
        column0 = Column()
        column0.add(predict(start, column0))

        cur_set = column0
        for i, char in enumerate(stream):
            _, cur_set = process_column(i, char, cur_set)

        last_set, _ = process_column(len(stream), END_TOKEN, cur_set)

        # Parse ended. Now build a parse tree
        solutions = [n.data for n in last_set.to_reduce
                     if n.rule.origin==start and n.start is column0]

        if not solutions:
            raise ParseError('Incomplete parse: Could not find a solution to input')

        return solutions
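
The predict/scan/complete cycle driven by process_column above is easier to follow in isolation. What follows is a minimal, self-contained sketch of the same algorithm, independent of lark: the grammar format (a dict mapping each nonterminal to a list of expansion tuples), the name earley_recognize, and the token names are all illustrative assumptions, and the sketch does not handle empty (epsilon) rules, a case the module above explicitly allows for. It only recognizes; it does not build a tree the way Parser.parse does through its postprocess callbacks.

    # A minimal Earley recognizer, independent of the module above.
    # grammar: dict mapping nonterminal -> list of expansions (tuples of symbols);
    # a symbol is a nonterminal (a key of the dict) or a terminal (anything else).
    # Limitation: empty (epsilon) rules are not handled by this sketch.
    def earley_recognize(grammar, start, tokens):
        # An item is (head, expansion, dot, origin): the rule "head -> expansion"
        # with the dot before expansion[dot], started at column `origin`.
        columns = [set() for _ in range(len(tokens) + 1)]
        columns[0] = {(start, exp, 0, 0) for exp in grammar[start]}

        for i, column in enumerate(columns):
            work = list(column)
            while work:
                head, exp, dot, origin = work.pop()
                if dot == len(exp):
                    # Complete: advance every item in the origin column that
                    # was waiting for `head`.
                    for h2, e2, d2, o2 in list(columns[origin]):
                        if d2 < len(e2) and e2[d2] == head:
                            new = (h2, e2, d2 + 1, o2)
                            if new not in column:
                                column.add(new)
                                work.append(new)
                elif exp[dot] in grammar:
                    # Predict: add every rule of the expected nonterminal.
                    for exp2 in grammar[exp[dot]]:
                        new = (exp[dot], exp2, 0, i)
                        if new not in column:
                            column.add(new)
                            work.append(new)
                elif i < len(tokens) and exp[dot] == tokens[i]:
                    # Scan: the expected terminal matches the next token.
                    columns[i + 1].add((head, exp, dot + 1, origin))

        # Accept if some start rule spans the whole input.
        return any(head == start and dot == len(exp) and origin == 0
                   for head, exp, dot, origin in columns[-1])

A toy run, using terminal names like the ones a lexer might produce:

    grammar = {'sum': [('NUMBER', 'PLUS', 'NUMBER'), ('NUMBER',)]}
    assert earley_recognize(grammar, 'sum', ['NUMBER', 'PLUS', 'NUMBER'])
    assert not earley_recognize(grammar, 'sum', ['NUMBER', 'PLUS'])

The worklist plus the dedup sets play the same role as the NewsList instances above: each item is processed once per column, which is what keeps left-recursive rules like "a: a | b" from looping forever.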