From b95567c4a728ab30416f488a65fc8a0d90608288 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Mon, 20 Feb 2017 00:13:20 +0200
Subject: [PATCH] Another refactoring step

---
 lark/parsers/earley2.py          |   2 +-
 lark/parsers/grammar_analysis.py | 212 ++++++++++++++++++++++++++++++
 lark/parsers/lalr_analysis.py    | 213 +------------------------------
 3 files changed, 214 insertions(+), 213 deletions(-)
 create mode 100644 lark/parsers/grammar_analysis.py

diff --git a/lark/parsers/earley2.py b/lark/parsers/earley2.py
index c41dfa5..7527248 100644
--- a/lark/parsers/earley2.py
+++ b/lark/parsers/earley2.py
@@ -1,5 +1,5 @@
 from ..common import ParseError, UnexpectedToken, is_terminal
-from lalr_analysis import GrammarAnalyzer
+from grammar_analysis import GrammarAnalyzer
 
 from ..tree import Tree
 
diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py
new file mode 100644
index 0000000..bdd6c73
--- /dev/null
+++ b/lark/parsers/grammar_analysis.py
@@ -0,0 +1,212 @@
+import logging
+from collections import defaultdict, deque
+
+from ..utils import classify, classify_bool, bfs, fzset
+from ..common import GrammarError, is_terminal
+
+ACTION_SHIFT = 0
+
+class Rule(object):
+    """
+        origin : a symbol
+        expansion : a list of symbols
+    """
+    def __init__(self, origin, expansion, alias=None):
+        self.origin = origin
+        self.expansion = expansion
+        self.alias = alias
+
+    def __repr__(self):
+        return '<%s : %s>' % (self.origin, ' '.join(self.expansion))
+
+class RulePtr(object):
+    def __init__(self, rule, index):
+        assert isinstance(rule, Rule)
+        assert index <= len(rule.expansion)
+        self.rule = rule
+        self.index = index
+
+    def __repr__(self):
+        before = self.rule.expansion[:self.index]
+        after = self.rule.expansion[self.index:]
+        return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after))
+
+    @property
+    def next(self):
+        return self.rule.expansion[self.index]
+
+    def advance(self, sym):
+        assert self.next == sym
+        return RulePtr(self.rule, self.index+1)
+
+    @property
+    def is_satisfied(self):
+        return self.index == len(self.rule.expansion)
+
+    def __eq__(self, other):
+        return self.rule == other.rule and self.index == other.index
+    def __hash__(self):
+        return hash((self.rule, self.index))
+
+
+def pairs(lst):
+    return zip(lst[:-1], lst[1:])
+
+def update_set(set1, set2):
+    copy = set(set1)
+    set1 |= set2
+    return set1 != copy
+
+def calculate_sets(rules):
+    """Calculate FIRST, FOLLOW and NULLABLE sets.
+
+    Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
+    symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
+    symbols.add('$root')    # what about other unused rules?
+
+    # foreach grammar rule X ::= Y(1) ... Y(k)
+    #   if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
+    #     NULLABLE = NULLABLE union {X}
+    #   for i = 1 to k
+    #     if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
+    #       FIRST(X) = FIRST(X) union FIRST(Y(i))
+    #     for j = i+1 to k
+    #       if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
+    #         FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
+    #       if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
+    #         FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
+    # until none of NULLABLE,FIRST,FOLLOW changed in last iteration
+
+    NULLABLE = set()
+    FIRST = {}
+    FOLLOW = {}
+    for sym in symbols:
+        FIRST[sym]={sym} if is_terminal(sym) else set()
+        FOLLOW[sym]=set()
+
+    changed = True
+    while changed:
+        changed = False
+
+        for rule in rules:
+            if set(rule.expansion) <= NULLABLE:
+                if update_set(NULLABLE, {rule.origin}):
+                    changed = True
+
+            for i, sym in enumerate(rule.expansion):
+                if set(rule.expansion[:i]) <= NULLABLE:
+                    if update_set(FIRST[rule.origin], FIRST[sym]):
+                        changed = True
+                if i==len(rule.expansion)-1 or set(rule.expansion[i:]) <= NULLABLE:
+                    if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
+                        changed = True
+
+                for j in range(i+1, len(rule.expansion)):
+                    if set(rule.expansion[i+1:j]) <= NULLABLE:
+                        if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
+                            changed = True
+
+    return FIRST, FOLLOW, NULLABLE
+
+
+class GrammarAnalyzer(object):
+    def __init__(self, rule_tuples, start_symbol, debug=False):
+        self.start_symbol = start_symbol
+        self.debug = debug
+        rule_tuples = list(rule_tuples)
+        rule_tuples.append(('$root', [start_symbol, '$end']))
+        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]
+
+        self.rules = set()
+        self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
+        for origin, exp, alias in rule_tuples:
+            r = Rule( origin, exp, alias )
+            self.rules.add(r)
+            self.rules_by_origin[origin].append(r)
+
+        for r in self.rules:
+            for sym in r.expansion:
+                if not (is_terminal(sym) or sym in self.rules_by_origin):
+                    raise GrammarError("Using an undefined rule: %s" % sym)
+
+        self.init_state = self.expand_rule(start_symbol)
+
+        self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules)
+
+    def expand_rule(self, rule):
+        "Returns all init_ptrs accessible by rule (recursive)"
+        init_ptrs = set()
+        def _expand_rule(rule):
+            assert not is_terminal(rule)
+
+            for r in self.rules_by_origin[rule]:
+                init_ptr = RulePtr(r, 0)
+                init_ptrs.add(init_ptr)
+
+                if r.expansion: # if not empty rule
+                    new_r = init_ptr.next
+                    if not is_terminal(new_r):
+                        yield new_r
+
+        _ = list(bfs([rule], _expand_rule))
+
+        return fzset(init_ptrs)
+
+    def _first(self, r):
+        if is_terminal(r):
+            return {r}
+        else:
+            return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)}
+
+    def analyze(self):
+
+        self.states = {}
+        def step(state):
+            lookahead = defaultdict(list)
+            sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
+            for rp in sat:
+                for term in self.FOLLOW.get(rp.rule.origin, ()):
+                    lookahead[term].append(('reduce', rp.rule))
+
+            d = classify(unsat, lambda rp: rp.next)
+            for sym, rps in d.items():
+                rps = {rp.advance(sym) for rp in rps}
+
+                for rp in set(rps):
+                    if not rp.is_satisfied and not is_terminal(rp.next):
+                        rps |= self.expand_rule(rp.next)
+
+                lookahead[sym].append(('shift', fzset(rps)))
+                yield fzset(rps)
+
+            for k, v in lookahead.items():
+                if len(v) > 1:
+                    if self.debug:
+                        logging.warn("Shift/reduce conflict for %s: %s. Resolving as shift.", k, v)
+                    for x in v:
+                        # XXX resolving shift/reduce into shift, like PLY
+                        # Give a proper warning
+                        if x[0] == 'shift':
+                            lookahead[k] = [x]
+
+            for k, v in lookahead.items():
+                assert len(v) == 1, ("Collision", k, v)
+
+            self.states[state] = {k:v[0] for k, v in lookahead.items()}
+
+        for _ in bfs([self.init_state], step):
+            pass
+
+        # --
+        self.enum = list(self.states)
+        self.enum_rev = {s:i for i,s in enumerate(self.enum)}
+        self.states_idx = {}
+
+        for s, la in self.states.items():
+            la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift'
+                    else (v[0], (v[1], len(v[1].expansion))) # Reduce
+                  for k,v in la.items()}
+            self.states_idx[ self.enum_rev[s] ] = la
+
+
+        self.init_state_idx = self.enum_rev[self.init_state]
diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index bdd6c73..4384ca8 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -1,212 +1 @@
-import logging
-from collections import defaultdict, deque
-
-from ..utils import classify, classify_bool, bfs, fzset
-from ..common import GrammarError, is_terminal
-
-ACTION_SHIFT = 0
-
-class Rule(object):
-    """
-        origin : a symbol
-        expansion : a list of symbols
-    """
-    def __init__(self, origin, expansion, alias=None):
-        self.origin = origin
-        self.expansion = expansion
-        self.alias = alias
-
-    def __repr__(self):
-        return '<%s : %s>' % (self.origin, ' '.join(self.expansion))
-
-class RulePtr(object):
-    def __init__(self, rule, index):
-        assert isinstance(rule, Rule)
-        assert index <= len(rule.expansion)
-        self.rule = rule
-        self.index = index
-
-    def __repr__(self):
-        before = self.rule.expansion[:self.index]
-        after = self.rule.expansion[self.index:]
-        return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after))
-
-    @property
-    def next(self):
-        return self.rule.expansion[self.index]
-
-    def advance(self, sym):
-        assert self.next == sym
-        return RulePtr(self.rule, self.index+1)
-
-    @property
-    def is_satisfied(self):
-        return self.index == len(self.rule.expansion)
-
-    def __eq__(self, other):
-        return self.rule == other.rule and self.index == other.index
-    def __hash__(self):
-        return hash((self.rule, self.index))
-
-
-def pairs(lst):
-    return zip(lst[:-1], lst[1:])
-
-def update_set(set1, set2):
-    copy = set(set1)
-    set1 |= set2
-    return set1 != copy
-
-def calculate_sets(rules):
-    """Calculate FOLLOW sets.
-
-    Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
-    symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
-    symbols.add('$root')    # what about other unused rules?
-
-    # foreach grammar rule X ::= Y(1) ... Y(k)
-    #   if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
-    #     NULLABLE = NULLABLE union {X}
-    #   for i = 1 to k
-    #     if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
-    #       FIRST(X) = FIRST(X) union FIRST(Y(i))
-    #     for j = i+1 to k
-    #       if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
-    #         FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
-    #       if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
-    #         FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
-    # until none of NULLABLE,FIRST,FOLLOW changed in last iteration
-
-    NULLABLE = set()
-    FIRST = {}
-    FOLLOW = {}
-    for sym in symbols:
-        FIRST[sym]={sym} if is_terminal(sym) else set()
-        FOLLOW[sym]=set()
-
-    changed = True
-    while changed:
-        changed = False
-
-        for rule in rules:
-            if set(rule.expansion) <= NULLABLE:
-                if update_set(NULLABLE, {rule.origin}):
-                    changed = True
-
-            for i, sym in enumerate(rule.expansion):
-                if set(rule.expansion[:i]) <= NULLABLE:
-                    if update_set(FIRST[rule.origin], FIRST[sym]):
-                        changed = True
-                if i==len(rule.expansion)-1 or set(rule.expansion[i:]) <= NULLABLE:
-                    if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
-                        changed = True
-
-                for j in range(i+1, len(rule.expansion)):
-                    if set(rule.expansion[i+1:j]) <= NULLABLE:
-                        if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
-                            changed = True
-
-    return FIRST, FOLLOW, NULLABLE
-
-
-class GrammarAnalyzer(object):
-    def __init__(self, rule_tuples, start_symbol, debug=False):
-        self.start_symbol = start_symbol
-        self.debug = debug
-        rule_tuples = list(rule_tuples)
-        rule_tuples.append(('$root', [start_symbol, '$end']))
-        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]
-
-        self.rules = set()
-        self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
-        for origin, exp, alias in rule_tuples:
-            r = Rule( origin, exp, alias )
-            self.rules.add(r)
-            self.rules_by_origin[origin].append(r)
-
-        for r in self.rules:
-            for sym in r.expansion:
-                if not (is_terminal(sym) or sym in self.rules_by_origin):
-                    raise GrammarError("Using an undefined rule: %s" % sym)
-
-        self.init_state = self.expand_rule(start_symbol)
-
-        self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules)
-
-    def expand_rule(self, rule):
-        "Returns all init_ptrs accessible by rule (recursive)"
-        init_ptrs = set()
-        def _expand_rule(rule):
-            assert not is_terminal(rule)
-
-            for r in self.rules_by_origin[rule]:
-                init_ptr = RulePtr(r, 0)
-                init_ptrs.add(init_ptr)
-
-                if r.expansion: # if not empty rule
-                    new_r = init_ptr.next
-                    if not is_terminal(new_r):
-                        yield new_r
-
-        _ = list(bfs([rule], _expand_rule))
-
-        return fzset(init_ptrs)
-
-    def _first(self, r):
-        if is_terminal(r):
-            return {r}
-        else:
-            return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)}
-
-    def analyze(self):
-
-        self.states = {}
-        def step(state):
-            lookahead = defaultdict(list)
-            sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
-            for rp in sat:
-                for term in self.FOLLOW.get(rp.rule.origin, ()):
-                    lookahead[term].append(('reduce', rp.rule))
-
-            d = classify(unsat, lambda rp: rp.next)
-            for sym, rps in d.items():
-                rps = {rp.advance(sym) for rp in rps}
-
-                for rp in set(rps):
-                    if not rp.is_satisfied and not is_terminal(rp.next):
-                        rps |= self.expand_rule(rp.next)
-
-                lookahead[sym].append(('shift', fzset(rps)))
-                yield fzset(rps)
-
-            for k, v in lookahead.items():
-                if len(v) > 1:
-                    if self.debug:
-                        logging.warn("Shift/reduce conflict for %s: %s. Resolving as shift.", k, v)
-                    for x in v:
-                        # XXX resolving shift/reduce into shift, like PLY
-                        # Give a proper warning
-                        if x[0] == 'shift':
-                            lookahead[k] = [x]
-
-            for k, v in lookahead.items():
-                assert len(v) == 1, ("Collision", k, v)
-
-            self.states[state] = {k:v[0] for k, v in lookahead.items()}
-
-        for _ in bfs([self.init_state], step):
-            pass
-
-        # --
-        self.enum = list(self.states)
-        self.enum_rev = {s:i for i,s in enumerate(self.enum)}
-        self.states_idx = {}
-
-        for s, la in self.states.items():
-            la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift'
-                    else (v[0], (v[1], len(v[1].expansion))) # Reduce
-                  for k,v in la.items()}
-            self.states_idx[ self.enum_rev[s] ] = la
-
-
-        self.init_state_idx = self.enum_rev[self.init_state]
+from grammar_analysis import GrammarAnalyzer, ACTION_SHIFT
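
Not part of the patch: a minimal usage sketch of the relocated GrammarAnalyzer, assuming the (origin, expansion) rule-tuple format its constructor accepts, that the lark package at this commit is importable, and lark's convention here that uppercase names and $-prefixed symbols are terminals. The toy grammar and all names in it (sum, atom, PLUS, NUMBER) are hypothetical.

    from lark.parsers.grammar_analysis import GrammarAnalyzer

    # Toy left-recursive grammar:  sum : sum PLUS atom | atom ;  atom : NUMBER
    rules = [
        ('sum',  ['sum', 'PLUS', 'atom']),
        ('sum',  ['atom']),
        ('atom', ['NUMBER']),
    ]

    ga = GrammarAnalyzer(rules, 'sum')   # constructor also runs calculate_sets
    ga.analyze()                         # builds the states and the indexed action table

    print(ga.FOLLOW['sum'])     # should be {'PLUS', '$end'}: from 'sum PLUS atom' and '$root : sum $end'
    print(len(ga.states_idx))   # number of parser states produced by analyze()
    print(ga.init_state_idx)    # index of the initial state in the numbered table

The constructor copies the rule list before appending the synthetic '$root' rule, so the caller's list is not mutated; analyze() then numbers the frozenset states (enum/enum_rev) so the earley and LALR drivers can work with plain integer state indices.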