- from collections import Counter, defaultdict
- import time
- 
- from ..utils import bfs, fzset, classify
- from ..exceptions import GrammarError
- from ..grammar import Rule, Terminal, NonTerminal
-
- # Profiling counters: temporary instrumentation for the caching
- # experiments in RulePtr.first() below.
- t_firsts = 0
- t_xy = 0
- t_call = 0
- cache_hits = 0
- cache_misses = 0
-
- # This used to be just a tuple (rp, la), but as an object with __slots__,
- # hashing and equality fall back to object identity, which is trivial
- # (and slightly faster for sets, which are hashtables). Identity is safe
- # because RulePtr.lookahead() memoizes these objects, so two equal but
- # distinct instances are never created.
- class RulePtrLookahead(object):
- __slots__ = 'rp', 'la'
-
- def __init__(self, rp, la):
- self.rp = rp
- self.la = la
-
- class RulePtr(object):
- __slots__ = ('rule', 'index', '_advance', '_lookaheads', '_next_rules_by_origin', '_first')
-
- def __init__(self, rule, index):
- assert isinstance(rule, Rule)
- assert index <= len(rule.expansion)
- self.rule = rule
- self.index = index
- #self._hash = hash((self.rule, self.index))
- #self._hash = None
- self._advance = None
- self._lookaheads = {}
- self._next_rules_by_origin = None
- self._first = None
-
- def __repr__(self):
- before = [x.name for x in self.rule.expansion[:self.index]]
- after = [x.name for x in self.rule.expansion[self.index:]]
- return '<%s : %s * %s>' % (self.rule.origin.name, ' '.join(before), ' '.join(after))
-
- @property
- def next(self):
- return self.rule.expansion[self.index]
-
-     # Memoized so that advancing the same RulePtr twice returns the same
-     # object (never create duplicate RulePtrs).
- def advance(self, sym):
- assert self.next == sym
- a = self._advance
- if a is None:
- a = RulePtr(self.rule, self.index + 1)
- self._advance = a
- return a
-
- @property
- def is_satisfied(self):
- return self.index == len(self.rule.expansion)
-
- def lookahead(self, la):
- rp_la = self._lookaheads.get(la, None)
- if rp_la is None:
- rp_la = RulePtrLookahead(self, la)
- self._lookaheads[la] = rp_la
- return rp_la
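-     # Both advance() and lookahead() memoize their results, so callers
-     # may compare by object identity, e.g. (illustrative):
-     #     rp.lookahead(la) is rp.lookahead(la)            # -> True
-     #     rp.advance(rp.next) is rp.advance(rp.next)      # -> True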
-
- def next_rules_by_origin(self, rules_by_origin):
- n = self._next_rules_by_origin
- if n is None:
- n = rules_by_origin[self.next]
- self._next_rules_by_origin = n
- return n
-
-     # Recursive form of the loop in lalr_analysis.py:343 (which is easier
-     # to understand, IMO). Recursion is normally avoided, but here it lets
-     # us cache each intermediate step in the corresponding RulePtr.
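-     # The recurrence this implements, where Y(i) = expansion[i] and n is
-     # the expansion length; it returns the FIRST set of the suffix
-     # Y(i)..Y(n-1) together with a flag saying whether that suffix is
-     # nullable:
-     #     first(n) == ([], True)
-     #     first(i) == (FIRST(Y(i)) + first(i+1)[0], first(i+1)[1])   if Y(i) in NULLABLE
-     #     first(i) == (FIRST(Y(i)), False)                           otherwise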
-     def first(self, i, firsts, nullable, t):
-         global cache_hits
-         global cache_misses
-         global t_firsts
-         global t_xy
-         global t_call
-         t_call += time.time() - t
-         n = len(self.rule.expansion)
-         if i == n:
-             # Empty suffix: FIRST is empty and it is trivially nullable.
-             return ([], True)
-         # One cache slot per RulePtr suffices because callers always pass
-         # i == self.index.
-         x = self._first
-         t_x = time.time()
-         if x is None:
-             t0 = time.time()
-             t_y = time.time()
-             cache_misses += 1
-             s = self.rule.expansion[i]
-             symbols = list(firsts.get(s, []))
-             suffix_nullable = s in nullable
-             if suffix_nullable:
-                 # s can derive the empty string, so the FIRST of whatever
-                 # follows s also belongs to this suffix's FIRST.
-                 t1 = time.time()
-                 t_firsts += t1 - t0
-                 rest = self.advance(s).first(i + 1, firsts, nullable, time.time())
-                 t0 = time.time()
-                 symbols.extend(rest[0])
-                 suffix_nullable = rest[1]
-             x = (symbols, suffix_nullable)
-             self._first = x
-             t1 = time.time()
-             t_firsts += t1 - t0
-         else:
-             t_y = time.time()
-             cache_hits += 1
-         t_xy += t_y - t_x
-         return x
-
-     # The memoization above guarantees there are never two distinct but
-     # equal RulePtrs, so the default identity-based __eq__/__hash__
-     # suffice (and should make set/hashtable lookups cheaper):
- '''
- def __eq__(self, other):
- return self.rule == other.rule and self.index == other.index
- def __hash__(self):
- return self._hash
- '''
-
-
- class LR0ItemSet(object):
- __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads', '_hash')
-
- def __init__(self, kernel, closure):
- self.kernel = fzset(kernel)
- self.closure = fzset(closure)
- self.transitions = {}
- self.lookaheads = defaultdict(set)
- #self._hash = hash(self.kernel)
-
- # state generation ensures no duplicate LR0ItemSets
- '''
- def __eq__(self, other):
- return self.kernel == other.kernel
-
- def __hash__(self):
- return self._hash
- '''
-
- def __repr__(self):
- return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))
-
-
- def update_set(set1, set2):
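-     """Add set2 into set1 in place; return True iff set1 grew.
- 
-     A doctest sketch of the semantics:
- 
-     >>> s = {1}
-     >>> update_set(s, {1, 2})
-     True
-     >>> sorted(s)
-     [1, 2]
-     >>> update_set(s, {2})
-     False
-     """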
-     if not set2 or set2 <= set1:
-         # Nothing new to add (also covers set1 == set2).
-         return False
-
- copy = set(set1)
- set1 |= set2
- return set1 != copy
-
- def calculate_sets(rules):
- """Calculate FOLLOW sets.
-
- Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
- symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
-
-     # foreach grammar rule X ::= Y(1) ... Y(k)
-     #     if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
-     #         NULLABLE = NULLABLE union {X}
-     #     for i = 1 to k
-     #         if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
-     #             FIRST(X) = FIRST(X) union FIRST(Y(i))
-     #         if i=k or {Y(i+1),...,Y(k)} subset of NULLABLE then
-     #             FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
-     #         for j = i+1 to k
-     #             if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
-     #                 FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
-     # until none of NULLABLE,FIRST,FOLLOW changed in last iteration
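-     # Worked example (illustrative): for the grammar
-     #     S ::= A B      A ::= a      A ::= (empty)      B ::= b
-     # this yields NULLABLE = {A}, FIRST(S) = {a, b} (B's FIRST leaks into
-     # S's because A is nullable), and FOLLOW(A) = {b}.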
-
- NULLABLE = set()
- FIRST = {}
- FOLLOW = {}
- for sym in symbols:
-         FIRST[sym] = {sym} if sym.is_term else set()
-         FOLLOW[sym] = set()
-
- # Calculate NULLABLE and FIRST
- changed = True
- while changed:
- changed = False
-
- for rule in rules:
- if set(rule.expansion) <= NULLABLE:
- if update_set(NULLABLE, {rule.origin}):
- changed = True
-
- for i, sym in enumerate(rule.expansion):
- if set(rule.expansion[:i]) <= NULLABLE:
- if update_set(FIRST[rule.origin], FIRST[sym]):
- changed = True
- else:
- break
-
- # Calculate FOLLOW
- changed = True
- while changed:
- changed = False
-
- for rule in rules:
- for i, sym in enumerate(rule.expansion):
- if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE:
- if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
- changed = True
-
- for j in range(i+1, len(rule.expansion)):
- if set(rule.expansion[i+1:j]) <= NULLABLE:
- if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
- changed = True
-
- return FIRST, FOLLOW, NULLABLE
-
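- # A hedged usage sketch of calculate_sets, mirroring how Rule/NonTerminal/
- # Terminal are constructed elsewhere in this module (the tiny grammar
- # itself is made up for illustration):
- #
- #     rules = [Rule(NonTerminal('s'), [NonTerminal('a'), Terminal('B')]),
- #              Rule(NonTerminal('a'), [Terminal('A')]),
- #              Rule(NonTerminal('a'), [])]
- #     FIRST, FOLLOW, NULLABLE = calculate_sets(rules)
- #     # NULLABLE == {NonTerminal('a')}; FIRST of 's' contains 'A' and 'B'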
-
- class GrammarAnalyzer(object):
- def __init__(self, parser_conf, debug=False):
- self.debug = debug
-
- root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
- for start in parser_conf.start}
-
- rules = parser_conf.rules + list(root_rules.values())
- self.rules_by_origin = classify(rules, lambda r: r.origin)
-
- if len(rules) != len(set(rules)):
- duplicates = [item for item, count in Counter(rules).items() if count > 1]
- raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
-
- for r in rules:
- for sym in r.expansion:
- if not (sym.is_term or sym in self.rules_by_origin):
- raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation
-
- self.start_states = {start: self.expand_rule(root_rule.origin)
- for start, root_rule in root_rules.items()}
-
- self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))})
- for start, root_rule in root_rules.items()}
-
- lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)])
- for start in parser_conf.start}
-
- lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
-         assert len(lr0_rules) == len(set(lr0_rules))
-
- self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)
-
-         # Cache RulePtr(r, 0) on the rule itself (no duplicate RulePtr
-         # objects); Rule is expected to expose an _rp attribute that
-         # defaults to None (see expand_rule below).
- for root_rule in lr0_root_rules.values():
- root_rule._rp = RulePtr(root_rule, 0)
- self.lr0_start_states = {start: LR0ItemSet([root_rule._rp], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
- for start, root_rule in lr0_root_rules.items()}
-
- self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
-
-         # Unused: these caches were tried but did not help performance.
- self.lr1_cache = {}
- self.lr1_cache2 = {}
-
- def expand_rule(self, source_rule, rules_by_origin=None):
- "Returns all init_ptrs accessible by rule (recursive)"
-
- if rules_by_origin is None:
- rules_by_origin = self.rules_by_origin
-
- init_ptrs = set()
- def _expand_rule(rule):
- assert not rule.is_term, rule
-
- for r in rules_by_origin[rule]:
- # don't create duplicate RulePtr objects
- init_ptr = r._rp
- if init_ptr is None:
- init_ptr = RulePtr(r, 0)
- r._rp = init_ptr
- init_ptrs.add(init_ptr)
-
- if r.expansion: # if not empty rule
- new_r = init_ptr.next
- if not new_r.is_term:
- yield new_r
-
- for _ in bfs([source_rule], _expand_rule):
- pass
-
- return fzset(init_ptrs)
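- 
- # A hedged usage sketch (illustrative; assumes a parser_conf exposing
- # .rules and .start, as consumed by GrammarAnalyzer.__init__ above):
- #
- #     analyzer = GrammarAnalyzer(parser_conf)
- #     closure = analyzer.expand_rule(NonTerminal('start'))
- #     # closure is an fzset of RulePtr items: one per alternative of
- #     # 'start' and of every nonterminal reachable at those dots.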