From 0c59cba3f5329381fc75a1a37a8426c15165b230 Mon Sep 17 00:00:00 2001 From: Raekye Date: Fri, 9 Aug 2019 03:26:27 -0400 Subject: [PATCH] implement DeRemer and Pennello's lookahead algorithm for LALR(1) --- lark/grammar.py | 4 +- lark/parsers/grammar_analysis.py | 110 +------- lark/parsers/lalr_analysis.py | 432 +++++++++++-------------------- lark/parsers/lalr_parser.py | 27 +- 4 files changed, 169 insertions(+), 404 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index f90cce4..3480651 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -28,7 +28,7 @@ class Symbol(Serialize): class Terminal(Symbol): - __serialize_fields__ = 'name', 'filter_out' + __serialize_fields__ = 'name', 'filter_out', '_hash' is_term = True @@ -44,7 +44,7 @@ class Terminal(Symbol): class NonTerminal(Symbol): - __serialize_fields__ = 'name', + __serialize_fields__ = 'name', '_hash' is_term = False diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 71a7bc5..b32f62f 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -5,37 +5,18 @@ from ..exceptions import GrammarError from ..grammar import Rule, Terminal, NonTerminal import time -t_firsts = 0 -t_xy = 0 -t_call = 0 -cache_hits = 0 -cache_misses = 0 - -# used to be just a tuple (rp, la) -# but by making it an object, -# the hash and equality become trivial -# (slightly faster for sets which are hashtables?) -class RulePtrLookahead(object): - __slots__ = 'rp', 'la' - - def __init__(self, rp, la): - self.rp = rp - self.la = la +# optimizations were made so that there should never be two distinct equal RulePtrs +# to help with hashtable lookup class RulePtr(object): - __slots__ = ('rule', 'index', '_advance', '_lookaheads', '_next_rules_by_origin', '_first') + __slots__ = ('rule', 'index', '_advance') def __init__(self, rule, index): assert isinstance(rule, Rule) assert index <= len(rule.expansion) self.rule = rule self.index = index - #self._hash = hash((self.rule, self.index)) - #self._hash = None self._advance = None - self._lookaheads = {} - self._next_rules_by_origin = None - self._first = None def __repr__(self): before = [x.name for x in self.rule.expansion[:self.index]] @@ -59,89 +40,16 @@ class RulePtr(object): def is_satisfied(self): return self.index == len(self.rule.expansion) - def lookahead(self, la): - rp_la = self._lookaheads.get(la, None) - if rp_la is None: - rp_la = RulePtrLookahead(self, la) - self._lookaheads[la] = rp_la - return rp_la - - def next_rules_by_origin(self, rules_by_origin): - n = self._next_rules_by_origin - if n is None: - n = rules_by_origin[self.next] - self._next_rules_by_origin = n - return n - - # recursive form of lalr_analyis.py:343 (which is easier to understand IMO) - # normally avoid recursion but this allows us to cache - # each intermediate step in a corresponding RulePtr - def first(self, i, firsts, nullable, t): - global cache_hits - global cache_misses - global t_firsts - global t_xy - global t_call - t_call += time.time() - t - n = len(self.rule.expansion) - if i == n: - return ([], True) - x = self._first - t_x = time.time() - if x is None: - t0 = time.time() - t_y = time.time() - cache_misses += 1 - s = self.rule.expansion[i] - l = list(firsts.get(s, [])) - b = (s in nullable) - if b: - t1 = time.time() - t_firsts += t1 - t0 - l_b_2 = self.advance(s).first(i + 1, firsts, nullable, time.time()) - #l_b_2 = first(self.advance(self.next), i + 1, firsts, nullable, time.time()) - t0 = time.time() - l.extend(l_b_2[0]) - b = l_b_2[1] - x = 
(l, b) - self._first = x - t1 = time.time() - t_firsts += t1 - t0 - else: - t_y = time.time() - cache_hits += 1 - t_xy += t_y - t_x - return x - - # optimizations were made so that there should never be - # two distinct equal RulePtrs - # should help set/hashtable lookups? - ''' - def __eq__(self, other): - return self.rule == other.rule and self.index == other.index - def __hash__(self): - return self._hash - ''' - +# state generation ensures no duplicate LR0ItemSets class LR0ItemSet(object): - __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads', '_hash') + __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads') def __init__(self, kernel, closure): self.kernel = fzset(kernel) self.closure = fzset(closure) self.transitions = {} self.lookaheads = defaultdict(set) - #self._hash = hash(self.kernel) - - # state generation ensures no duplicate LR0ItemSets - ''' - def __eq__(self, other): - return self.kernel == other.kernel - - def __hash__(self): - return self._hash - ''' def __repr__(self): return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure])) @@ -258,9 +166,11 @@ class GrammarAnalyzer(object): self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules) - # unused, did not help - self.lr1_cache = {} - self.lr1_cache2 = {} + self.nonterminal_transitions = [] + self.directly_reads = defaultdict(set) + self.reads = defaultdict(set) + self.includes = defaultdict(set) + self.lookback = defaultdict(set) def expand_rule(self, source_rule, rules_by_origin=None): "Returns all init_ptrs accessible by rule (recursive)" diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index eb87e7a..4104713 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -12,9 +12,8 @@ from collections import defaultdict, deque from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..exceptions import GrammarError -from .grammar_analysis import GrammarAnalyzer, Terminal, RulePtr, LR0ItemSet +from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet from ..grammar import Rule -from . 
import grammar_analysis import time @@ -31,15 +30,6 @@ class Action: Shift = Action('Shift') Reduce = Action('Reduce') -t_set_0 = 0 -t_set_1 = 0 -t_expand = 0 -t_rules = 0 -t_append = 0 -t_z = 0 -t_begin = 0 -t_count = 0 -t_call = 0 class ParseTable: def __init__(self, states, start_states, end_states): @@ -95,9 +85,60 @@ class IntParseTable(ParseTable): ###} + +# digraph and traverse, see The Theory and Practice of Compiler Writing + +# computes F(x) = G(x) union (union { G(y) | x R y }) +# X: nodes +# R: relation (function mapping node -> list of nodes that satisfy the relation) +# G: set valued function +def digraph(X, R, G): + F = {} + S = [] + N = {} + for x in X: + N[x] = 0 + for x in X: + # this is always true for the first iteration, but N[x] may be updated in traverse below + if N[x] == 0: + traverse(x, S, N, X, R, G, F) + return F + +# x: single node +# S: stack +# N: weights +# X: nodes +# R: relation (see above) +# G: set valued function +# F: set valued function we are computing (map of input -> output) +def traverse(x, S, N, X, R, G, F): + S.append(x) + d = len(S) + N[x] = d + F[x] = G(x) + for y in R(x): + if N[y] == 0: + traverse(y, S, N, X, R, G, F) + n_x = N[x] + assert(n_x > 0) + n_y = N[y] + assert(n_y != 0) + if (n_y > 0) and (n_y < n_x): + N[x] = n_y + F[x].update(F[y]) + if N[x] == d: + f_x = F[x] + while True: + z = S.pop() + N[z] = -1 + F[z] = f_x + if z == x: + break + + class LALR_Analyzer(GrammarAnalyzer): - def generate_lr0_states(self): + def compute_lr0_states(self): self.states = set() # map of kernels to LR0ItemSets cache = {} @@ -125,297 +166,118 @@ class LALR_Analyzer(GrammarAnalyzer): for _ in bfs(self.lr0_start_states.values(), step): pass - def discover_lookaheads(self): - # lookaheads is now a member of LR0ItemSet, so don't need to look up a dictionary here - # state -> rule -> set of lookaheads - #self.lookaheads = defaultdict(lambda: defaultdict(set)) - # state -> rule -> list of (set of lookaheads) to propagate to - #self.propagates = defaultdict(lambda: defaultdict(list)) - self.propagates = {} - - t0 = time.time() - - t = Terminal('$END') - for s in self.lr0_start_states.values(): - for rp in s.kernel: - #self.lookaheads[s][rp].add(Terminal('$END')) - s.lookaheads[rp].add(t) - - t_closure = 0 - - # There is a 1 to 1 correspondance between LR0 and LALR1 states. - # We calculate the lookaheads for LALR1 kernel items from the LR0 kernel items. 
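# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): digraph/traverse added above is
# the generic solver from DeRemer & Pennello, reused for both the Read and
# Follow computations further down.  For every node x it computes
#     F(x) = G(x)  union  union{ F(y) | x R y }
# and collapses cycles of R so that all nodes of a strongly connected
# component share one result set.  Below is a condensed copy of the two
# functions plus a toy driver; the node names, relation and base sets are
# invented purely for illustration.

def digraph(X, R, G):
    F, S = {}, []
    N = {x: 0 for x in X}
    for x in X:
        if N[x] == 0:
            traverse(x, S, N, X, R, G, F)
    return F

def traverse(x, S, N, X, R, G, F):
    S.append(x)
    d = len(S)
    N[x] = d
    F[x] = G(x)                        # start from the node's own base set
    for y in R(x):
        if N[y] == 0:
            traverse(y, S, N, X, R, G, F)
        if 0 < N[y] < N[x]:
            N[x] = N[y]                # Tarjan-style low-link update
        F[x].update(F[y])              # union in everything reachable via y
    if N[x] == d:                      # x is the root of its component
        while True:
            z = S.pop()
            N[z] = -1                  # mark as finished
            F[z] = F[x]                # the whole component shares one set
            if z == x:
                break

if __name__ == '__main__':
    edges = {'a': ['b'], 'b': ['c'], 'c': ['b'], 'd': []}   # b <-> c is a cycle
    base = {'a': {1}, 'b': {2}, 'c': {3}, 'd': {4}}
    F = digraph(list(edges), lambda x: edges[x], lambda x: set(base[x]))
    print(F)   # {'a': {1, 2, 3}, 'b': {2, 3}, 'c': {2, 3}, 'd': {4}}
# ---------------------------------------------------------------------------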
- # use a terminal that does not exist in the grammar - t = Terminal('$#') - for s in self.states: - p = {} - self.propagates[s] = p - for rp in s.kernel: - q = [] - p[rp] = q - t2 = time.time() - z = self.generate_lr1_closure([rp.lookahead(t)], time.time()) - t3 = time.time() - t_closure += t3 - t2 - #for rp2, la in self.generate_lr1_closure([(rp, t)], time.time()): - for rp2_la in z: - rp2 = rp2_la.rp - la = rp2_la.la + def compute_reads_relations(self): + # handle start state + for root in self.lr0_start_states.values(): + assert(len(root.kernel) == 1) + for rp in root.kernel: + assert(rp.index == 0) + self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ]) + + for state in self.states: + seen = set() + for rp in state.closure: + if rp.is_satisfied: + continue + s = rp.next + # if s is a not a nonterminal + if not s in self.lr0_rules_by_origin: + continue + if s in seen: + continue + seen.add(s) + nt = (state, s) + self.nonterminal_transitions.append(nt) + dr = self.directly_reads[nt] + r = self.reads[nt] + next_state = state.transitions[s] + for rp2 in next_state.closure: if rp2.is_satisfied: continue - next_symbol = rp2.next - next_state = s.transitions[next_symbol] - rp3 = rp2.advance(next_symbol) - assert(rp3 in next_state.kernel) - #x = self.lookaheads[next_state][rp3] - x = next_state.lookaheads[rp3] - if la == t: - # we must propagate rp's lookaheads to rp3's lookahead set - q.append(x) + s2 = rp2.next + # if s2 is a terminal + if not s2 in self.lr0_rules_by_origin: + dr.add(s2) + if s2 in self.NULLABLE: + r.add((next_state, s2)) + + def compute_read_sets(self): + R = lambda nt: self.reads[nt] + G = lambda nt: self.directly_reads[nt] + self.read_sets = digraph(self.nonterminal_transitions, R, G) + + def compute_includes_lookback(self): + for nt in self.nonterminal_transitions: + state, nonterminal = nt + includes = [] + lookback = self.lookback[nt] + for rp in state.closure: + if rp.rule.origin != nonterminal: + continue + # traverse the states for rp(.rule) + state2 = state + for i in range(rp.index, len(rp.rule.expansion)): + s = rp.rule.expansion[i] + nt2 = (state2, s) + state2 = state2.transitions[s] + if not nt2 in self.reads: + continue + j = i + 1 + for j in range(i + 1, len(rp.rule.expansion)): + if not rp.rule.expansion[j] in self.NULLABLE: + break else: - # this lookahead is "generated spontaneously" for rp3 - x.add(la) - - t1 = time.time() - print('Discovering took {:.3f} (generating closure), {:.3f} (total)'.format(t_closure, t1 - t0)) - - def propagate_lookaheads(self): - changed = True - while changed: - changed = False - for s in self.states: - for rp in s.kernel: - # from (from is a keyword) - #f = self.lookaheads[s][rp] - f = s.lookaheads[rp] - # to - t = self.propagates[s][rp] - for x in t: - old = len(x) - x |= f - changed = changed or (len(x) != old) - - def generate_lalr1_states(self): - t0 = time.time() - # 1 to 1 correspondance between LR0 and LALR1 states - # We must fetch the lookaheads we calculated, - # to create the LALR1 kernels from the LR0 kernels. - # Then, we generate the LALR1 states by taking the LR1 closure of the new kernel items. 
- # map of LR0 states to LALR1 states + includes.append(nt2) + # state2 is at the final state for rp.rule + if rp.index == 0: + for rp2 in state2.closure: + if (rp2.rule == rp.rule) and rp2.is_satisfied: + lookback.add((state2, rp2.rule)) + for nt2 in includes: + self.includes[nt2].add(nt) + + def compute_follow_sets(self): + R = lambda nt: self.includes[nt] + G = lambda nt: self.read_sets[nt] + self.follow_sets = digraph(self.nonterminal_transitions, R, G) + + def compute_lookaheads(self): + for nt, lookbacks in self.lookback.items(): + for state, rule in lookbacks: + for s in self.follow_sets[nt]: + state.lookaheads[s].add(rule) + + def compute_lalr1_states(self): m = {} - t_closure = 0 - z = 0 - for s in self.states: - z = max(z, len(s.closure)) - kernel = [] - for rp in s.kernel: - #las = self.lookaheads[s][rp] - las = s.lookaheads[rp] - assert(len(las) > 0) - for la in las: - kernel.append(rp.lookahead(la)) - t0_0 = time.time() - m[s] = self.generate_lr1_closure(kernel, time.time()) - t0_1 = time.time() - t_closure += t0_1 - t0_0 - - print('Generating lalr1 closure for lalr kernels took {:.3f}'.format(t_closure)) - print('Max lr0 state size was {}'.format(z)) - - t1 = time.time() - - self.states = {} - for s, v in m.items(): + for state in self.states: actions = {} - for la, next_state in s.transitions.items(): + for la, next_state in state.transitions.items(): actions[la] = (Shift, next_state.closure) - - sat, _ = classify_bool(v, lambda x: x.rp.is_satisfied) - reductions = classify(sat, lambda x: x.la, lambda x: x.rp) - for la, rps in reductions.items(): - if len(rps) > 1: - raise GrammarError("Collision in %s: %s" % (la, ', '.join([ str(r.rule) for r in rps ]))) + for la, rules in state.lookaheads.items(): + if len(rules) > 1: + raise GrammarError('Collision in %s: %s' % (la, ', '.join([ str(r) for r in rules ]))) if la in actions: if self.debug: - logging.warning("Shift/reduce conflict for terminal %s: (resolving as shift)", la.name) - logging.warning(' * %s', str(rps[0])) + logging.warning('Shift/reduce conflict for terminal %s: (resolving as shift)', la.name) + logging.warning(' * %s', list(rules)[0]) else: - actions[la] = (Reduce, rps[0].rule) + actions[la] = (Reduce, list(rules)[0]) + m[state] = { k.name: v for k, v in actions.items() } - self.states[s.closure] = {k.name: v for k, v in actions.items()} - - t2 = time.time() + self.states = { k.closure: v for k, v in m.items() } + # compute end states end_states = {} - for s in self.states: - for rp in s: + for state in self.states: + for rp in state: for start in self.lr0_start_states: if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied: assert(not start in end_states) - end_states[start] = s - - t3 = time.time() + end_states[start] = state - self._parse_table = ParseTable(self.states, {start: state.closure for start, state in self.lr0_start_states.items()}, end_states) - - t4 = time.time() + self._parse_table = ParseTable(self.states, { start: state.closure for start, state in self.lr0_start_states.items() }, end_states) if self.debug: self.parse_table = self._parse_table else: self.parse_table = IntParseTable.from_ParseTable(self._parse_table) - - t5 = time.time() - - print(('Generating lalr1 states took ' + ', '.join([ '{:.3f}' ] * 5)).format(t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4)) - print('Generating firsts took {:.3f} (time actually calculating), {:.3f} (end to end), {:.3f} (just function call)'.format(grammar_analysis.t_firsts, grammar_analysis.t_xy, grammar_analysis.t_call)) - - def 
generate_lr1_closure(self, kernel, t_caller): - global t_call - global t_set_0 - global t_set_1 - global t_expand - global t_rules - global t_append - global t_z - global t_begin - global t_count - - t_start = time.time() - t_call += t_start - t_caller - - # cache the results of this function - # not many hits, no noticeable performance improvement - ''' - k = fzset(kernel) - cached = self.lr1_cache.get(k, None) - if not cached is None: - return cached - ''' - - closure = set() - closure_hash = {} - - y = 0 - - q = list(kernel) - while len(q) > 0: - t_a = time.time() - rp_la = q.pop() - #rp_la_hash = hash(rp_la) - t0 = time.time() - t_begin += t0 - t_a - # try to manually maintain hashtable, - # as a set of just hashes (ints) was notably faster - ''' - if rp_la_hash in closure_hash: - if rp_la in closure_hash[rp_la_hash]: - t0_0 = time.time() - t_set_0 += t0_0 - t0 - continue - t0_0 = time.time() - t_set_0 += t0_0 - t0 - else: - closure_hash[rp_la_hash] = [] - ''' - if rp_la in closure: - t0_0 = time.time() - t_set_0 += t0_0 - t0 - continue - t0_0 = time.time() - closure.add(rp_la) - #closure_hash[rp_la_hash].append(rp_la) - t1 = time.time() - t_set_0 += t0_0 - t0 - t_set_1 += t1 - t0_0 - rp = rp_la.rp - la = rp_la.la - - if rp.is_satisfied: - continue - if rp.next.is_term: - continue - - t2 = time.time() - - # cache these calculations inside each RulePtr - # see grammar_analysis.py:79 - l = [] - ''' - i = rp.index + 1 - n = len(rp.rule.expansion) - l2_i = self.lr1_cache2.get((rp.rule, i), None) - l2 = [] - if l2_i is None: - while i < n: - s = rp.rule.expansion[i] - l2.extend(self.FIRST.get(s, [])) - if not s in self.NULLABLE: - break - i += 1 - self.lr1_cache2[(rp.rule, i)] = (l2, i) - else: - l2 = l2_i[0] - i = l2_i[1] - - l.extend(l2) - ''' - # this function call seems really slow (see grammar_analysis.t_call above) - # tried making it not a method call so don't need to look up vtable - # still equally slow - l2, nullable = rp.first(rp.index + 1, self.FIRST, self.NULLABLE, time.time()) - #l2, nullable = grammar_analysis.first(rp, rp.index + 1, self.FIRST, self.NULLABLE, time.time()) - #l.extend(l2) - l = l2 - t3 = time.time() - - t_expand += t3 - t2 - - # if we don't modify l2 and add an extra check in the loop below, - # we don't have to copy it - # if all of rp.rule.expansion[rp.index + 1:] were nullable: - #if nullable: - # l.append(la) - - t4 = time.time() - x = rp.next_rules_by_origin(self.lr0_rules_by_origin) - t5 = time.time() - - # usually between 20-60? seen as high as ~175 - y = max(y, len(x) * len(l)) - #print('adding {} * {} rules to closure max {}'.format(len(x), len(l), y)) - for r in x: - for s in l: - # cache RulePtr(r, 0) in r (no duplicate RulePtr objects) - # cache r._rp in _rp (1 less object property lookup?) 
- _rp = r._rp - if _rp is None: - _rp = RulePtr(r, 0) - r._rp = _rp - q.append(_rp.lookahead(s)) - #q.append((r._rp, s)) - if nullable: - _rp = r._rp - if _rp is None: - _rp = RulePtr(r, 0) - r._rp = _rp - q.append(_rp.lookahead(la)) - #q.append((r._rp, la)) - - t6 = time.time() - t_rules += t5 - t4 - t_append += t6 - t5 - - #self.lr1_cache[k] = closure - - t_end = time.time() - t_z += t_end - t_start - - t_count += 1 - - if t_count % 1000 == 0: - print('\tGenerating lr1 closure took begin {:.3f}, set contains {:.3f}, set add {:.3f}, get first {:.3f}'.format(t_begin, t_set_0, t_set_1, t_expand)) - print('\tget next rules {:.3f}, append rules {:.3f}, total {:.3f}, call time {:.3f}, count {}'.format(t_rules, t_append, t_z, t_call, t_count)) - print('\tmax number of appends {}'.format(y)) - - return closure diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index b3985ae..657e795 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -17,20 +17,13 @@ class LALR_Parser(object): assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" analysis = LALR_Analyzer(parser_conf, debug=debug) - t0 = time.time() - analysis.generate_lr0_states() - t1 = time.time() - analysis.discover_lookaheads() - t2 = time.time() - analysis.propagate_lookaheads() - t3 = time.time() - analysis.generate_lalr1_states() - t4 = time.time() - print('Generating lr0 states took {:.3f}'.format(t1 - t0)) - print('Discovering lookaheads took {:.3f}'.format(t2 - t1)) - print('Propagating lookaheads took took {:.3f}'.format(t3 - t2)) - print('Generating lalr states (closure) took {:.3f}'.format(t4 - t3)) - print('-' * 32) + analysis.compute_lr0_states() + analysis.compute_reads_relations() + analysis.compute_read_sets() + analysis.compute_includes_lookback() + analysis.compute_follow_sets() + analysis.compute_lookaheads() + analysis.compute_lalr1_states() callbacks = parser_conf.callbacks self._parse_table = analysis.parse_table @@ -80,9 +73,6 @@ class _Parser: raise UnexpectedToken(token, expected, state=state) def reduce(rule): - if state_stack[-1] == end_state: - return True - size = len(rule.expansion) if size: s = value_stack[-size:] @@ -98,6 +88,9 @@ class _Parser: state_stack.append(new_state) value_stack.append(value) + if state_stack[-1] == end_state: + return True + return False # Main LALR-parser loop
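Taken together, the methods wired into LALR_Parser above implement the DeRemer and Pennello construction: compute_reads_relations builds the DR sets and the reads relation over nonterminal transitions, compute_read_sets solves

    Read(p, A) = DR(p, A) union union{ Read(r, C) | (p, A) reads (r, C) }

with digraph, compute_includes_lookback builds the includes and lookback relations, compute_follow_sets solves

    Follow(p, A) = Read(p, A) union union{ Follow(p', B) | (p, A) includes (p', B) }

with a second digraph pass, and compute_lookaheads distributes

    LA(q, A -> w) = union{ Follow(p, A) | (q, A -> w) lookback (p, A) }

onto each state before compute_lalr1_states folds shifts and reductions into the parse table. A quick way to exercise the whole pipeline end to end is to build any LALR parser through the public API. The grammar and input below are invented for illustration and assume only that the lark package from this branch is importable:

from lark import Lark

# A small left-recursive grammar; constructing its LALR(1) table runs the full
# compute_lr0_states ... compute_lalr1_states sequence shown above.
calc = Lark(r'''
    start: expr
    expr: expr "+" term
        | term
    term: NUMBER
    %import common.NUMBER
    %import common.WS
    %ignore WS
''', parser='lalr')

print(calc.parse("1 + 2 + 3").pretty())

Parsing itself still runs through the same _Parser loop; the patch changes how the ParseTable is built (plus the small reordering of the end-state check in reduce), replacing the old per-item LR(1) closure of generate_lr1_closure with the relation-based lookahead computation.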