This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

202 lines
6.9 KiB

  1. from collections import Counter, defaultdict
  2. from ..utils import bfs, fzset, classify
  3. from ..exceptions import GrammarError
  4. from ..grammar import Rule, Terminal, NonTerminal
  5. import time
  6. # optimizations were made so that there should never be two distinct equal RulePtrs
  7. # to help with hashtable lookup
  8. class RulePtr(object):
  9. __slots__ = ('rule', 'index', '_advance')
  10. def __init__(self, rule, index):
  11. assert isinstance(rule, Rule)
  12. assert index <= len(rule.expansion)
  13. self.rule = rule
  14. self.index = index
  15. self._advance = None
  16. def __repr__(self):
  17. before = [x.name for x in self.rule.expansion[:self.index]]
  18. after = [x.name for x in self.rule.expansion[self.index:]]
  19. return '<%s : %s * %s>' % (self.rule.origin.name, ' '.join(before), ' '.join(after))
  20. @property
  21. def next(self):
  22. return self.rule.expansion[self.index]
  23. # don't create duplicate RulePtrs
  24. def advance(self, sym):
  25. assert self.next == sym
  26. a = self._advance
  27. if a is None:
  28. a = RulePtr(self.rule, self.index + 1)
  29. self._advance = a
  30. return a
  31. @property
  32. def is_satisfied(self):
  33. return self.index == len(self.rule.expansion)
  34. # state generation ensures no duplicate LR0ItemSets
  35. class LR0ItemSet(object):
  36. __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads')
  37. def __init__(self, kernel, closure):
  38. self.kernel = fzset(kernel)
  39. self.closure = fzset(closure)
  40. self.transitions = {}
  41. self.lookaheads = defaultdict(set)
  42. def __repr__(self):
  43. return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))
  44. def update_set(set1, set2):
  45. if not set2 or set1 > set2:
  46. return False
  47. copy = set(set1)
  48. set1 |= set2
  49. return set1 != copy
  50. def calculate_sets(rules):
  51. """Calculate FOLLOW sets.
  52. Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
  53. symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
  54. # foreach grammar rule X ::= Y(1) ... Y(k)
  55. # if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
  56. # NULLABLE = NULLABLE union {X}
  57. # for i = 1 to k
  58. # if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
  59. # FIRST(X) = FIRST(X) union FIRST(Y(i))
  60. # for j = i+1 to k
  61. # if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
  62. # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
  63. # if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
  64. # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
  65. # until none of NULLABLE,FIRST,FOLLOW changed in last iteration
  66. NULLABLE = set()
  67. FIRST = {}
  68. FOLLOW = {}
  69. for sym in symbols:
  70. FIRST[sym]={sym} if sym.is_term else set()
  71. FOLLOW[sym]=set()
  72. # Calculate NULLABLE and FIRST
  73. changed = True
  74. while changed:
  75. changed = False
  76. for rule in rules:
  77. if set(rule.expansion) <= NULLABLE:
  78. if update_set(NULLABLE, {rule.origin}):
  79. changed = True
  80. for i, sym in enumerate(rule.expansion):
  81. if set(rule.expansion[:i]) <= NULLABLE:
  82. if update_set(FIRST[rule.origin], FIRST[sym]):
  83. changed = True
  84. else:
  85. break
  86. # Calculate FOLLOW
  87. changed = True
  88. while changed:
  89. changed = False
  90. for rule in rules:
  91. for i, sym in enumerate(rule.expansion):
  92. if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE:
  93. if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
  94. changed = True
  95. for j in range(i+1, len(rule.expansion)):
  96. if set(rule.expansion[i+1:j]) <= NULLABLE:
  97. if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
  98. changed = True
  99. return FIRST, FOLLOW, NULLABLE
  100. class GrammarAnalyzer(object):
  101. def __init__(self, parser_conf, debug=False):
  102. self.debug = debug
  103. root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
  104. for start in parser_conf.start}
  105. rules = parser_conf.rules + list(root_rules.values())
  106. self.rules_by_origin = classify(rules, lambda r: r.origin)
  107. if len(rules) != len(set(rules)):
  108. duplicates = [item for item, count in Counter(rules).items() if count > 1]
  109. raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
  110. for r in rules:
  111. for sym in r.expansion:
  112. if not (sym.is_term or sym in self.rules_by_origin):
  113. raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation
  114. self.start_states = {start: self.expand_rule(root_rule.origin)
  115. for start, root_rule in root_rules.items()}
  116. self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))})
  117. for start, root_rule in root_rules.items()}
  118. lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)])
  119. for start in parser_conf.start}
  120. lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
  121. assert(len(lr0_rules) == len(set(lr0_rules)))
  122. self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)
  123. # cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
  124. for root_rule in lr0_root_rules.values():
  125. root_rule._rp = RulePtr(root_rule, 0)
  126. self.lr0_start_states = {start: LR0ItemSet([root_rule._rp], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
  127. for start, root_rule in lr0_root_rules.items()}
  128. self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
  129. self.nonterminal_transitions = []
  130. self.directly_reads = defaultdict(set)
  131. self.reads = defaultdict(set)
  132. self.includes = defaultdict(set)
  133. self.lookback = defaultdict(set)
  134. def expand_rule(self, source_rule, rules_by_origin=None):
  135. "Returns all init_ptrs accessible by rule (recursive)"
  136. if rules_by_origin is None:
  137. rules_by_origin = self.rules_by_origin
  138. init_ptrs = set()
  139. def _expand_rule(rule):
  140. assert not rule.is_term, rule
  141. for r in rules_by_origin[rule]:
  142. # don't create duplicate RulePtr objects
  143. init_ptr = r._rp
  144. if init_ptr is None:
  145. init_ptr = RulePtr(r, 0)
  146. r._rp = init_ptr
  147. init_ptrs.add(init_ptr)
  148. if r.expansion: # if not empty rule
  149. new_r = init_ptr.next
  150. if not new_r.is_term:
  151. yield new_r
  152. for _ in bfs([source_rule], _expand_rule):
  153. pass
  154. return fzset(init_ptrs)