From 21c41e54a9728587bc2043d187b9230acad9fcec Mon Sep 17 00:00:00 2001
From: Raekye
Date: Tue, 30 Jul 2019 19:49:23 -0400
Subject: [PATCH] lalr parser: compute real LALR(1) lookaheads instead of FOLLOW-set approximation

---
 lark/parsers/grammar_analysis.py |  36 ++++++-
 lark/parsers/lalr_analysis.py    | 171 +++++++++++++++++++++++++------
 lark/parsers/lalr_parser.py      |  21 ++--
 3 files changed, 184 insertions(+), 44 deletions(-)

diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py
index 086349c..5a4d0e8 100644
--- a/lark/parsers/grammar_analysis.py
+++ b/lark/parsers/grammar_analysis.py
@@ -36,6 +36,23 @@ class RulePtr(object):
     def __hash__(self):
         return hash((self.rule, self.index))
 
+class LR0ItemSet(object):
+    __slots__ = ('kernel', 'closure', 'transitions')
+
+    def __init__(self, kernel, closure):
+        self.kernel = fzset(kernel)
+        self.closure = fzset(closure)
+        self.transitions = {}
+
+    def __eq__(self, other):
+        return self.kernel == other.kernel
+
+    def __hash__(self):
+        return hash(self.kernel)
+
+    def __repr__(self):
+        return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))
+
 
 def update_set(set1, set2):
     if not set2:
@@ -130,15 +147,29 @@ class GrammarAnalyzer(object):
         self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))})
                            for start, root_rule in root_rules.items()}
 
+        lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)])
+                for start in parser_conf.start}
+
+        lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
+
+        self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)
+
+        self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
+                for start, root_rule in lr0_root_rules.items()}
+
         self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
 
-    def expand_rule(self, rule):
+    def expand_rule(self, rule, rules_by_origin=None):
         "Returns all init_ptrs accessible by rule (recursive)"
+
+        if rules_by_origin is None:
+            rules_by_origin = self.rules_by_origin
+
         init_ptrs = set()
         def _expand_rule(rule):
             assert not rule.is_term, rule
 
-            for r in self.rules_by_origin[rule]:
+            for r in rules_by_origin[rule]:
                 init_ptr = RulePtr(r, 0)
                 init_ptrs.add(init_ptr)
 
@@ -157,4 +188,3 @@ class GrammarAnalyzer(object):
             return {r}
         else:
             return {rp.next for rp in self.expand_rule(r) if rp.next.is_term}
-
diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index eef1f9b..61fe692 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -12,7 +12,7 @@ from collections import defaultdict
 
 from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
 from ..exceptions import GrammarError
-from .grammar_analysis import GrammarAnalyzer, Terminal
+from .grammar_analysis import GrammarAnalyzer, Terminal, RulePtr, LR0ItemSet
 from ..grammar import Rule
 
 ###{standalone
@@ -84,53 +84,158 @@ class IntParseTable(ParseTable):
 
 class LALR_Analyzer(GrammarAnalyzer):
-    def compute_lookahead(self):
+    def generate_lr0_states(self):
+        self.states = set()
 
-        self.states = {}
         def step(state):
-            lookahead = defaultdict(list)
-            sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
-            for rp in sat:
-                for term in self.FOLLOW.get(rp.rule.origin, ()):
-                    lookahead[term].append((Reduce, rp.rule))
+            _, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied)
 
             d = classify(unsat, lambda rp: rp.next)
             for sym, rps in d.items():
-                rps = {rp.advance(sym) for rp in rps}
+                kernel = {rp.advance(sym) for rp in rps}
+                closure = set(kernel)
 
-                for rp in set(rps):
+                for rp in kernel:
                     if not rp.is_satisfied and not rp.next.is_term:
-                        rps |= self.expand_rule(rp.next)
+                        closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
 
-                new_state = fzset(rps)
-                lookahead[sym].append((Shift, new_state))
+                new_state = LR0ItemSet(kernel, closure)
+                state.transitions[sym] = new_state
                 yield new_state
 
-            for k, v in lookahead.items():
-                if len(v) > 1:
-                    if self.debug:
-                        logging.warning("Shift/reduce conflict for terminal %s: (resolving as shift)", k.name)
-                        for act, arg in v:
-                            logging.warning(' * %s: %s', act, arg)
-                    for x in v:
-                        # XXX resolving shift/reduce into shift, like PLY
-                        # Give a proper warning
-                        if x[0] is Shift:
-                            lookahead[k] = [x]
-
-            for k, v in lookahead.items():
-                if not len(v) == 1:
-                    raise GrammarError("Collision in %s: %s" %(k, ', '.join(['\n * %s: %s' % x for x in v])))
-
-            self.states[state] = {k.name:v[0] for k, v in lookahead.items()}
-
-        for _ in bfs(self.start_states.values(), step):
+            self.states.add(state)
+
+        for _ in bfs(self.lr0_start_states.values(), step):
             pass
 
-        self._parse_table = ParseTable(self.states, self.start_states, self.end_states)
+    def discover_lookaheads(self):
+        # state -> kernel item (RulePtr) -> set of lookaheads
+        self.lookaheads = defaultdict(lambda: defaultdict(set))
+        # state -> kernel item (RulePtr) -> list of lookahead sets to propagate to
+        self.propagates = defaultdict(lambda: defaultdict(list))
+
+        for s in self.lr0_start_states.values():
+            for rp in s.kernel:
+                self.lookaheads[s][rp].add(Terminal('$END'))
+
+        # There is a one-to-one correspondence between LR(0) and LALR(1) states.
+        # We calculate the lookaheads for LALR(1) kernel items from the LR(0) kernel items.
+        # Use a dummy terminal that does not occur in the grammar.
+        t = Terminal('$#')
+        for s in self.states:
+            for rp in s.kernel:
+                for rp2, la in self.generate_lr1_closure([(rp, t)]):
+                    if rp2.is_satisfied:
+                        continue
+                    next_symbol = rp2.next
+                    next_state = s.transitions[next_symbol]
+                    rp3 = rp2.advance(next_symbol)
+                    assert(rp3 in next_state.kernel)
+                    x = self.lookaheads[next_state][rp3]
+                    if la == t:
+                        # we must propagate rp's lookaheads to rp3's lookahead set
+                        self.propagates[s][rp].append(x)
+                    else:
+                        # this lookahead is "generated spontaneously" for rp3
+                        x.add(la)
+
+    def propagate_lookaheads(self):
+        changed = True
+        while changed:
+            changed = False
+            for s in self.states:
+                for rp in s.kernel:
+                    # source set ('from' is a Python keyword)
+                    f = self.lookaheads[s][rp]
+                    # destination sets
+                    t = self.propagates[s][rp]
+                    for x in t:
+                        old = len(x)
+                        x |= f
+                        changed = changed or (len(x) != old)
+
+    def generate_lalr1_states(self):
+        # There is a one-to-one correspondence between LR(0) and LALR(1) states.
+        # We fetch the lookaheads we calculated to create the LALR(1) kernels
+        # from the LR(0) kernels, then generate the LALR(1) states by taking
+        # the LR(1) closure of the new kernel items.
+
+        # map of LR(0) states to LALR(1) states
+        m = {}
+        for s in self.states:
+            kernel = []
+            for rp in s.kernel:
+                las = self.lookaheads[s][rp]
+                assert(len(las) > 0)
+                for la in las:
+                    kernel.append((rp, la))
+            m[s] = self.generate_lr1_closure(kernel)
+
+        self.states = {}
+        for s, v in m.items():
+            actions = {}
+            for la, next_state in s.transitions.items():
+                actions[la] = (Shift, next_state.closure)
+
+            sat, _ = classify_bool(v, lambda x: x[0].is_satisfied)
+            reductions = classify(sat, lambda x: x[1], lambda x: x[0])
+            for la, rps in reductions.items():
+                if len(rps) > 1:
+                    raise GrammarError("Collision in %s: %s" % (la, ', '.join([ str(r.rule) for r in rps ])))
+                if la in actions:
+                    if self.debug:
+                        logging.warning("Shift/reduce conflict for terminal %s: (resolving as shift)", la.name)
+                        logging.warning(' * %s', str(rps[0]))
+                else:
+                    actions[la] = (Reduce, rps[0].rule)
+
+            self.states[s.closure] = {k.name: v for k, v in actions.items()}
+
+        end_states = {}
+        for s in self.states:
+            for rp in s:
+                for start in self.lr0_start_states:
+                    if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied:
+                        assert(not start in end_states)
+                        end_states[start] = s
+
+        self._parse_table = ParseTable(self.states, {start: state.closure for start, state in self.lr0_start_states.items()}, end_states)
 
         if self.debug:
             self.parse_table = self._parse_table
         else:
             self.parse_table = IntParseTable.from_ParseTable(self._parse_table)
 
+    def generate_lr1_closure(self, kernel):
+        closure = set()
+
+        q = list(kernel)
+        while len(q) > 0:
+            rp, la = q.pop()
+            if (rp, la) in closure:
+                continue
+            closure.add((rp, la))
+
+            if rp.is_satisfied:
+                continue
+            if rp.next.is_term:
+                continue
+
+            l = []
+            i = rp.index + 1
+            n = len(rp.rule.expansion)
+            while i < n:
+                s = rp.rule.expansion[i]
+                l.extend(self.FIRST.get(s, []))
+                if not s in self.NULLABLE:
+                    break
+                i += 1
+
+            # if all of rp.rule.expansion[rp.index + 1:] were nullable:
+            if i == n:
+                l.append(la)
+
+            for r in self.lr0_rules_by_origin[rp.next]:
+                for s in l:
+                    q.append((RulePtr(r, 0), s))
+
+        return closure
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 39dd5f3..6eb3839 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -6,7 +6,7 @@ from ..exceptions import UnexpectedToken
 from ..lexer import Token
 from ..utils import Enumerator, Serialize
 
-from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable
+from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 
 ###{standalone
@@ -15,7 +15,10 @@ class LALR_Parser(object):
     def __init__(self, parser_conf, debug=False):
         assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization"
         analysis = LALR_Analyzer(parser_conf, debug=debug)
-        analysis.compute_lookahead()
+        analysis.generate_lr0_states()
+        analysis.discover_lookaheads()
+        analysis.propagate_lookaheads()
+        analysis.generate_lalr1_states()
         callbacks = parser_conf.callbacks
 
         self._parse_table = analysis.parse_table
@@ -65,6 +68,9 @@ class _Parser:
                 raise UnexpectedToken(token, expected, state=state)
 
         def reduce(rule):
+            if state_stack[-1] == end_state:
+                return True
+
             size = len(rule.expansion)
             if size:
                 s = value_stack[-size:]
@@ -80,6 +86,8 @@ class _Parser:
             state_stack.append(new_state)
             value_stack.append(value)
 
+            return False
+
         # Main LALR-parser loop
         for token in stream:
             while True:
@@ -97,11 +105,8 @@ class _Parser:
         token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
         while True:
             _action, arg = get_action(token)
-            if _action is Shift:
-                assert arg == end_state
-                val ,= value_stack
-                return val
-            else:
-                reduce(arg)
+            assert(_action is Reduce)
+            if reduce(arg):
+                return value_stack[-1]
 
 ###}
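
Background on the dummy-lookahead trick used by discover_lookaheads() above: seeding a kernel item with a lookahead that cannot occur in the grammar ($# in the patch) and taking the LR(1) closure tells you, for every item reached, whether its lookahead depends on the kernel item's own lookaheads (the dummy survives, so they must be propagated) or is generated spontaneously from FIRST of what follows the dot. This is the efficient LALR(1) construction from the Dragon Book. The snippet below is a minimal, self-contained sketch of that classification on the book's pointer-assignment grammar; the names (RULES, DUMMY, first_of_seq, lr1_closure) are illustrative stand-ins, not lark's API:

    # toy_lalr_lookaheads.py -- illustrative only; toy grammar with no nullable symbols
    RULES = {
        "S'": [("S",)],
        "S":  [("L", "=", "R"), ("R",)],
        "L":  [("*", "R"), ("id",)],
        "R":  [("L",)],
    }
    NONTERMS = set(RULES)
    DUMMY = "#"  # stand-in lookahead, like Terminal('$#') in the patch

    def first_of_seq(seq, la):
        # FIRST of `seq` followed by `la`; nothing here is nullable, so only
        # the first symbol of `seq` matters (the patch handles NULLABLE too).
        if not seq:
            return {la}
        sym = seq[0]
        if sym not in NONTERMS:
            return {sym}
        out, stack, seen = set(), [sym], set()
        while stack:
            nt = stack.pop()
            if nt in seen:
                continue
            seen.add(nt)
            for body in RULES[nt]:
                if body[0] in NONTERMS:
                    stack.append(body[0])
                else:
                    out.add(body[0])
        return out

    def lr1_closure(kernel):
        # textbook LR(1) closure over (head, body, dot, lookahead) items
        closure, queue = set(kernel), list(kernel)
        while queue:
            head, body, dot, la = queue.pop()
            if dot == len(body) or body[dot] not in NONTERMS:
                continue
            for la2 in first_of_seq(body[dot + 1:], la):
                for prod in RULES[body[dot]]:
                    item = (body[dot], prod, 0, la2)
                    if item not in closure:
                        closure.add(item)
                        queue.append(item)
        return closure

    # Close the start kernel under the dummy lookahead and classify the result:
    # a surviving DUMMY means "propagate the kernel item's lookaheads here";
    # any concrete terminal was generated spontaneously.
    for head, body, dot, la in sorted(lr1_closure({("S'", ("S",), 0, DUMMY)})):
        kind = "propagated from kernel" if la == DUMMY else "spontaneous"
        rhs = " ".join(body[:dot]) + " . " + " ".join(body[dot:])
        print("[%s -> %s, %s]  (%s)" % (head, rhs, la, kind))

Running it shows the L items acquiring '=' spontaneously while the items keeping '#' mark lookaheads to be copied from [S' -> . S]. discover_lookaheads() records exactly this split in self.lookaheads and self.propagates, and propagate_lookaheads() then iterates the copying to a fixed point.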