This repo contains code to mirror other repos. It also contains the code that is getting mirrored.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

292 lines
9.5 KiB

  1. from collections import Counter, defaultdict
  2. from ..utils import bfs, fzset, classify
  3. from ..exceptions import GrammarError
  4. from ..grammar import Rule, Terminal, NonTerminal
  5. import time
  6. t_firsts = 0
  7. t_xy = 0
  8. t_call = 0
  9. cache_hits = 0
  10. cache_misses = 0
  11. # used to be just a tuple (rp, la)
  12. # but by making it an object,
  13. # the hash and equality become trivial
  14. # (slightly faster for sets which are hashtables?)
  15. class RulePtrLookahead(object):
  16. __slots__ = 'rp', 'la'
  17. def __init__(self, rp, la):
  18. self.rp = rp
  19. self.la = la
  20. class RulePtr(object):
  21. __slots__ = ('rule', 'index', '_advance', '_lookaheads', '_next_rules_by_origin', '_first')
  22. def __init__(self, rule, index):
  23. assert isinstance(rule, Rule)
  24. assert index <= len(rule.expansion)
  25. self.rule = rule
  26. self.index = index
  27. #self._hash = hash((self.rule, self.index))
  28. #self._hash = None
  29. self._advance = None
  30. self._lookaheads = {}
  31. self._next_rules_by_origin = None
  32. self._first = None
  33. def __repr__(self):
  34. before = [x.name for x in self.rule.expansion[:self.index]]
  35. after = [x.name for x in self.rule.expansion[self.index:]]
  36. return '<%s : %s * %s>' % (self.rule.origin.name, ' '.join(before), ' '.join(after))
  37. @property
  38. def next(self):
  39. return self.rule.expansion[self.index]
  40. # don't create duplicate RulePtrs
  41. def advance(self, sym):
  42. assert self.next == sym
  43. a = self._advance
  44. if a is None:
  45. a = RulePtr(self.rule, self.index + 1)
  46. self._advance = a
  47. return a
  48. @property
  49. def is_satisfied(self):
  50. return self.index == len(self.rule.expansion)
  51. def lookahead(self, la):
  52. rp_la = self._lookaheads.get(la, None)
  53. if rp_la is None:
  54. rp_la = RulePtrLookahead(self, la)
  55. self._lookaheads[la] = rp_la
  56. return rp_la
  57. def next_rules_by_origin(self, rules_by_origin):
  58. n = self._next_rules_by_origin
  59. if n is None:
  60. n = rules_by_origin[self.next]
  61. self._next_rules_by_origin = n
  62. return n
  63. # recursive form of lalr_analyis.py:343 (which is easier to understand IMO)
  64. # normally avoid recursion but this allows us to cache
  65. # each intermediate step in a corresponding RulePtr
  66. def first(self, i, firsts, nullable, t):
  67. global cache_hits
  68. global cache_misses
  69. global t_firsts
  70. global t_xy
  71. global t_call
  72. t_call += time.time() - t
  73. n = len(self.rule.expansion)
  74. if i == n:
  75. return ([], True)
  76. x = self._first
  77. t_x = time.time()
  78. if x is None:
  79. t0 = time.time()
  80. t_y = time.time()
  81. cache_misses += 1
  82. s = self.rule.expansion[i]
  83. l = list(firsts.get(s, []))
  84. b = (s in nullable)
  85. if b:
  86. t1 = time.time()
  87. t_firsts += t1 - t0
  88. l_b_2 = self.advance(s).first(i + 1, firsts, nullable, time.time())
  89. #l_b_2 = first(self.advance(self.next), i + 1, firsts, nullable, time.time())
  90. t0 = time.time()
  91. l.extend(l_b_2[0])
  92. b = l_b_2[1]
  93. x = (l, b)
  94. self._first = x
  95. t1 = time.time()
  96. t_firsts += t1 - t0
  97. else:
  98. t_y = time.time()
  99. cache_hits += 1
  100. t_xy += t_y - t_x
  101. return x
  102. # optimizations were made so that there should never be
  103. # two distinct equal RulePtrs
  104. # should help set/hashtable lookups?
  105. '''
  106. def __eq__(self, other):
  107. return self.rule == other.rule and self.index == other.index
  108. def __hash__(self):
  109. return self._hash
  110. '''
  111. class LR0ItemSet(object):
  112. __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads', '_hash')
  113. def __init__(self, kernel, closure):
  114. self.kernel = fzset(kernel)
  115. self.closure = fzset(closure)
  116. self.transitions = {}
  117. self.lookaheads = defaultdict(set)
  118. #self._hash = hash(self.kernel)
  119. # state generation ensures no duplicate LR0ItemSets
  120. '''
  121. def __eq__(self, other):
  122. return self.kernel == other.kernel
  123. def __hash__(self):
  124. return self._hash
  125. '''
  126. def __repr__(self):
  127. return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))
  128. def update_set(set1, set2):
  129. if not set2 or set1 > set2:
  130. return False
  131. copy = set(set1)
  132. set1 |= set2
  133. return set1 != copy
  134. def calculate_sets(rules):
  135. """Calculate FOLLOW sets.
  136. Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
  137. symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
  138. # foreach grammar rule X ::= Y(1) ... Y(k)
  139. # if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
  140. # NULLABLE = NULLABLE union {X}
  141. # for i = 1 to k
  142. # if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
  143. # FIRST(X) = FIRST(X) union FIRST(Y(i))
  144. # for j = i+1 to k
  145. # if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
  146. # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
  147. # if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
  148. # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
  149. # until none of NULLABLE,FIRST,FOLLOW changed in last iteration
  150. NULLABLE = set()
  151. FIRST = {}
  152. FOLLOW = {}
  153. for sym in symbols:
  154. FIRST[sym]={sym} if sym.is_term else set()
  155. FOLLOW[sym]=set()
  156. # Calculate NULLABLE and FIRST
  157. changed = True
  158. while changed:
  159. changed = False
  160. for rule in rules:
  161. if set(rule.expansion) <= NULLABLE:
  162. if update_set(NULLABLE, {rule.origin}):
  163. changed = True
  164. for i, sym in enumerate(rule.expansion):
  165. if set(rule.expansion[:i]) <= NULLABLE:
  166. if update_set(FIRST[rule.origin], FIRST[sym]):
  167. changed = True
  168. else:
  169. break
  170. # Calculate FOLLOW
  171. changed = True
  172. while changed:
  173. changed = False
  174. for rule in rules:
  175. for i, sym in enumerate(rule.expansion):
  176. if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE:
  177. if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
  178. changed = True
  179. for j in range(i+1, len(rule.expansion)):
  180. if set(rule.expansion[i+1:j]) <= NULLABLE:
  181. if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
  182. changed = True
  183. return FIRST, FOLLOW, NULLABLE
  184. class GrammarAnalyzer(object):
  185. def __init__(self, parser_conf, debug=False):
  186. self.debug = debug
  187. root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
  188. for start in parser_conf.start}
  189. rules = parser_conf.rules + list(root_rules.values())
  190. self.rules_by_origin = classify(rules, lambda r: r.origin)
  191. if len(rules) != len(set(rules)):
  192. duplicates = [item for item, count in Counter(rules).items() if count > 1]
  193. raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
  194. for r in rules:
  195. for sym in r.expansion:
  196. if not (sym.is_term or sym in self.rules_by_origin):
  197. raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation
  198. self.start_states = {start: self.expand_rule(root_rule.origin)
  199. for start, root_rule in root_rules.items()}
  200. self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))})
  201. for start, root_rule in root_rules.items()}
  202. lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)])
  203. for start in parser_conf.start}
  204. lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
  205. assert(len(lr0_rules) == len(set(lr0_rules)))
  206. self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)
  207. # cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
  208. for root_rule in lr0_root_rules.values():
  209. root_rule._rp = RulePtr(root_rule, 0)
  210. self.lr0_start_states = {start: LR0ItemSet([root_rule._rp], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
  211. for start, root_rule in lr0_root_rules.items()}
  212. self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
  213. # unused, did not help
  214. self.lr1_cache = {}
  215. self.lr1_cache2 = {}
  216. def expand_rule(self, source_rule, rules_by_origin=None):
  217. "Returns all init_ptrs accessible by rule (recursive)"
  218. if rules_by_origin is None:
  219. rules_by_origin = self.rules_by_origin
  220. init_ptrs = set()
  221. def _expand_rule(rule):
  222. assert not rule.is_term, rule
  223. for r in rules_by_origin[rule]:
  224. # don't create duplicate RulePtr objects
  225. init_ptr = r._rp
  226. if init_ptr is None:
  227. init_ptr = RulePtr(r, 0)
  228. r._rp = init_ptr
  229. init_ptrs.add(init_ptr)
  230. if r.expansion: # if not empty rule
  231. new_r = init_ptr.next
  232. if not new_r.is_term:
  233. yield new_r
  234. for _ in bfs([source_rule], _expand_rule):
  235. pass
  236. return fzset(init_ptrs)