
lalr parser

Raekye, 5 years ago · commit 21c41e54a9
3 changed files with 184 additions and 44 deletions:

  1. lark/parsers/grammar_analysis.py  (+33, -3)
  2. lark/parsers/lalr_analysis.py     (+138, -33)
  3. lark/parsers/lalr_parser.py       (+13, -8)

lark/parsers/grammar_analysis.py  (+33, -3)

@@ -36,6 +36,23 @@ class RulePtr(object):
     def __hash__(self):
         return hash((self.rule, self.index))
 
+class LR0ItemSet(object):
+    __slots__ = ('kernel', 'closure', 'transitions')
+
+    def __init__(self, kernel, closure):
+        self.kernel = fzset(kernel)
+        self.closure = fzset(closure)
+        self.transitions = {}
+
+    def __eq__(self, other):
+        return self.kernel == other.kernel
+
+    def __hash__(self):
+        return hash(self.kernel)
+
+    def __repr__(self):
+        return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))
+
 
 def update_set(set1, set2):
     if not set2:
@@ -130,15 +147,29 @@ class GrammarAnalyzer(object):
         self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))})
                            for start, root_rule in root_rules.items()}
 
+        lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)])
+                for start in parser_conf.start}
+
+        lr0_rules = parser_conf.rules + list(lr0_root_rules.values())
+
+        self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin)
+
+        self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin))
+                for start, root_rule in lr0_root_rules.items()}
+
         self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
 
-    def expand_rule(self, rule):
+    def expand_rule(self, rule, rules_by_origin=None):
         "Returns all init_ptrs accessible by rule (recursive)"
+
+        if rules_by_origin is None:
+            rules_by_origin = self.rules_by_origin
+
         init_ptrs = set()
         def _expand_rule(rule):
             assert not rule.is_term, rule
 
-            for r in self.rules_by_origin[rule]:
+            for r in rules_by_origin[rule]:
                 init_ptr = RulePtr(r, 0)
                 init_ptrs.add(init_ptr)
@@ -157,4 +188,3 @@ class GrammarAnalyzer(object):
             return {r}
         else:
             return {rp.next for rp in self.expand_rule(r) if rp.next.is_term}
-

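The new LR0ItemSet class and the reworked expand_rule implement the textbook closure/goto construction over LR(0) items: a state is a kernel of dotted rules plus the closure reached by expanding every nonterminal that sits right after a dot. Below is a minimal standalone sketch of the same idea; the grammar encoding and all names are assumptions for illustration, not lark's API.

# Toy grammar: S -> A A ; A -> 'a' A | 'b'
# An item is a tuple (origin, body, dot).
RULES = {
    'S': [('A', 'A')],
    'A': [('a', 'A'), ('b',)],
}

def is_nonterminal(sym):
    return sym in RULES

def closure(kernel):
    # Expand the kernel with every item X -> . gamma reachable through a
    # nonterminal right after a dot (what expand_rule does via rules_by_origin).
    result = set(kernel)
    stack = list(kernel)
    while stack:
        origin, body, dot = stack.pop()
        if dot < len(body) and is_nonterminal(body[dot]):
            for alt in RULES[body[dot]]:
                item = (body[dot], alt, 0)
                if item not in result:
                    result.add(item)
                    stack.append(item)
    return frozenset(result)

def goto(items, sym):
    # Successor state: advance the dot over sym, then close the new kernel.
    kernel = {(o, b, d + 1) for (o, b, d) in items if d < len(b) and b[d] == sym}
    return closure(kernel)

start = closure({('$root', ('S',), 0)})   # cf. lr0_start_states above
assert ('A', ('b',), 0) in start          # A -> . 'b' enters via S -> . A A

Keeping the kernel separate from the full closure, as LR0ItemSet does, matters later: the LALR lookaheads are attached to kernel items only, and the closures are recomputed from them.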
lark/parsers/lalr_analysis.py  (+138, -33)

@@ -12,7 +12,7 @@ from collections import defaultdict
 from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
 from ..exceptions import GrammarError
 
-from .grammar_analysis import GrammarAnalyzer, Terminal
+from .grammar_analysis import GrammarAnalyzer, Terminal, RulePtr, LR0ItemSet
 from ..grammar import Rule
 
 ###{standalone
@@ -84,53 +84,158 @@ class IntParseTable(ParseTable):
 
 class LALR_Analyzer(GrammarAnalyzer):
 
-    def compute_lookahead(self):
+    def generate_lr0_states(self):
+        self.states = set()
 
-        self.states = {}
         def step(state):
-            lookahead = defaultdict(list)
-            sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
-            for rp in sat:
-                for term in self.FOLLOW.get(rp.rule.origin, ()):
-                    lookahead[term].append((Reduce, rp.rule))
+            _, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied)
 
             d = classify(unsat, lambda rp: rp.next)
             for sym, rps in d.items():
-                rps = {rp.advance(sym) for rp in rps}
+                kernel = {rp.advance(sym) for rp in rps}
+                closure = set(kernel)
 
-                for rp in set(rps):
+                for rp in kernel:
                     if not rp.is_satisfied and not rp.next.is_term:
-                        rps |= self.expand_rule(rp.next)
+                        closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
 
-                new_state = fzset(rps)
-                lookahead[sym].append((Shift, new_state))
+                new_state = LR0ItemSet(kernel, closure)
+                state.transitions[sym] = new_state
+                yield new_state
 
-            for k, v in lookahead.items():
-                if len(v) > 1:
-                    if self.debug:
-                        logging.warning("Shift/reduce conflict for terminal %s: (resolving as shift)", k.name)
-                        for act, arg in v:
-                            logging.warning(' * %s: %s', act, arg)
-                    for x in v:
-                        # XXX resolving shift/reduce into shift, like PLY
-                        # Give a proper warning
-                        if x[0] is Shift:
-                            lookahead[k] = [x]
-
-            for k, v in lookahead.items():
-                if not len(v) == 1:
-                    raise GrammarError("Collision in %s: %s" %(k, ', '.join(['\n * %s: %s' % x for x in v])))
-
-            self.states[state] = {k.name:v[0] for k, v in lookahead.items()}
-
-        for _ in bfs(self.start_states.values(), step):
+            self.states.add(state)
+
+        for _ in bfs(self.lr0_start_states.values(), step):
             pass
 
-        self._parse_table = ParseTable(self.states, self.start_states, self.end_states)
+    def discover_lookaheads(self):
+        # state -> rule -> set of lookaheads
+        self.lookaheads = defaultdict(lambda: defaultdict(set))
+        # state -> rule -> list of (set of lookaheads) to propagate to
+        self.propagates = defaultdict(lambda: defaultdict(list))
+
+        for s in self.lr0_start_states.values():
+            for rp in s.kernel:
+                self.lookaheads[s][rp].add(Terminal('$END'))
+
+        # There is a 1-to-1 correspondence between LR0 and LALR1 states.
+        # We calculate the lookaheads for LALR1 kernel items from the LR0 kernel items.
+        # use a terminal that does not exist in the grammar
+        t = Terminal('$#')
+        for s in self.states:
+            for rp in s.kernel:
+                for rp2, la in self.generate_lr1_closure([(rp, t)]):
+                    if rp2.is_satisfied:
+                        continue
+                    next_symbol = rp2.next
+                    next_state = s.transitions[next_symbol]
+                    rp3 = rp2.advance(next_symbol)
+                    assert(rp3 in next_state.kernel)
+                    x = self.lookaheads[next_state][rp3]
+                    if la == t:
+                        # we must propagate rp's lookaheads to rp3's lookahead set
+                        self.propagates[s][rp].append(x)
+                    else:
+                        # this lookahead is "generated spontaneously" for rp3
+                        x.add(la)
+
+    def propagate_lookaheads(self):
+        changed = True
+        while changed:
+            changed = False
+            for s in self.states:
+                for rp in s.kernel:
+                    # from (from is a keyword)
+                    f = self.lookaheads[s][rp]
+                    # to
+                    t = self.propagates[s][rp]
+                    for x in t:
+                        old = len(x)
+                        x |= f
+                        changed = changed or (len(x) != old)
+
+    def generate_lalr1_states(self):
+        # 1-to-1 correspondence between LR0 and LALR1 states.
+        # We must fetch the lookaheads we calculated,
+        # to create the LALR1 kernels from the LR0 kernels.
+        # Then, we generate the LALR1 states by taking the LR1 closure of the new kernel items.
+        # map of LR0 states to LALR1 states
+        m = {}
+        for s in self.states:
+            kernel = []
+            for rp in s.kernel:
+                las = self.lookaheads[s][rp]
+                assert(len(las) > 0)
+                for la in las:
+                    kernel.append((rp, la))
+            m[s] = self.generate_lr1_closure(kernel)
+
+        self.states = {}
+        for s, v in m.items():
+            actions = {}
+            for la, next_state in s.transitions.items():
+                actions[la] = (Shift, next_state.closure)
+
+            sat, _ = classify_bool(v, lambda x: x[0].is_satisfied)
+            reductions = classify(sat, lambda x: x[1], lambda x: x[0])
+            for la, rps in reductions.items():
+                if len(rps) > 1:
+                    raise GrammarError("Collision in %s: %s" % (la, ', '.join([ str(r.rule) for r in rps ])))
+                if la in actions:
+                    if self.debug:
+                        logging.warning("Shift/reduce conflict for terminal %s: (resolving as shift)", la.name)
+                        logging.warning(' * %s', str(rps[0]))
+                else:
+                    actions[la] = (Reduce, rps[0].rule)
+
+            self.states[s.closure] = {k.name: v for k, v in actions.items()}
+
+        end_states = {}
+        for s in self.states:
+            for rp in s:
+                for start in self.lr0_start_states:
+                    if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied:
+                        assert(not start in end_states)
+                        end_states[start] = s
+
+        self._parse_table = ParseTable(self.states, {start: state.closure for start, state in self.lr0_start_states.items()}, end_states)
 
         if self.debug:
             self.parse_table = self._parse_table
         else:
             self.parse_table = IntParseTable.from_ParseTable(self._parse_table)
+
+    def generate_lr1_closure(self, kernel):
+        closure = set()
+
+        q = list(kernel)
+        while len(q) > 0:
+            rp, la = q.pop()
+            if (rp, la) in closure:
+                continue
+            closure.add((rp, la))
+
+            if rp.is_satisfied:
+                continue
+            if rp.next.is_term:
+                continue
+
+            l = []
+            i = rp.index + 1
+            n = len(rp.rule.expansion)
+            while i < n:
+                s = rp.rule.expansion[i]
+                l.extend(self.FIRST.get(s, []))
+                if not s in self.NULLABLE:
+                    break
+                i += 1
+
+            # if all of rp.rule.expansion[rp.index + 1:] were nullable:
+            if i == n:
+                l.append(la)
+
+            for r in self.lr0_rules_by_origin[rp.next]:
+                for s in l:
+                    q.append((RulePtr(r, 0), s))
+
+        return closure

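Two steps above are easy to get wrong, so here is a small standalone sketch of both: the FIRST-with-nullability scan that generate_lr1_closure runs over the part of the expansion past the dot, and the fixed-point loop of propagate_lookaheads. The FIRST/NULLABLE tables and all names are hand-written stand-ins for illustration, not the output of lark's calculate_sets():

# Assumed toy facts: B is nullable and FIRST(B) = {'c'}.
FIRST = {'B': {'c'}}
NULLABLE = {'B'}

def lookaheads_for(beta, la):
    # For an item  A -> alpha . B beta  with lookahead la, the items added
    # for B get FIRST(beta) as lookaheads -- plus la itself when all of
    # beta is nullable. This mirrors the while-loop in generate_lr1_closure.
    result = set()
    for sym in beta:
        result |= FIRST.get(sym, {sym})   # a terminal is its own FIRST set
        if sym not in NULLABLE:
            return result                 # beta cannot vanish past this symbol
    result.add(la)                        # all of beta was nullable
    return result

assert lookaheads_for(('B', 'c'), '$END') == {'c'}
assert lookaheads_for(('B',), '$END') == {'c', '$END'}

def propagate(lookaheads, links):
    # Fixed point of the propagation links, as in propagate_lookaheads:
    # keep unioning source sets into target sets until nothing grows.
    changed = True
    while changed:
        changed = False
        for src, targets in links.items():
            for tgt in targets:
                before = len(lookaheads[tgt])
                lookaheads[tgt] |= lookaheads[src]
                changed = changed or len(lookaheads[tgt]) != before

las = {'k0': {'$END'}, 'k1': set(), 'k2': set()}
propagate(las, {'k0': ['k1'], 'k1': ['k2']})
assert las['k2'] == {'$END'}   # lookaheads flow through chains of links

The dummy terminal t = Terminal('$#') in discover_lookaheads is the classic spontaneous-vs-propagated test: if the marker survives the LR(1) closure of a kernel item, that item's lookaheads must flow along a propagation link; any concrete terminal that appears instead is a spontaneously generated lookahead.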
lark/parsers/lalr_parser.py  (+13, -8)

@@ -6,7 +6,7 @@ from ..exceptions import UnexpectedToken
 from ..lexer import Token
 from ..utils import Enumerator, Serialize
 
-from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable
+from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 
 
 ###{standalone
@@ -15,7 +15,10 @@ class LALR_Parser(object):
         assert all(r.options is None or r.options.priority is None
                    for r in parser_conf.rules), "LALR doesn't yet support prioritization"
         analysis = LALR_Analyzer(parser_conf, debug=debug)
-        analysis.compute_lookahead()
+        analysis.generate_lr0_states()
+        analysis.discover_lookaheads()
+        analysis.propagate_lookaheads()
+        analysis.generate_lalr1_states()
         callbacks = parser_conf.callbacks
 
         self._parse_table = analysis.parse_table
@@ -65,6 +68,9 @@ class _Parser:
                 raise UnexpectedToken(token, expected, state=state)
 
         def reduce(rule):
+            if state_stack[-1] == end_state:
+                return True
+
             size = len(rule.expansion)
             if size:
                 s = value_stack[-size:]
@@ -80,6 +86,8 @@ class _Parser:
             state_stack.append(new_state)
             value_stack.append(value)
 
+            return False
+
         # Main LALR-parser loop
         for token in stream:
             while True:
@@ -97,11 +105,8 @@ class _Parser:
         token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
         while True:
             _action, arg = get_action(token)
-            if _action is Shift:
-                assert arg == end_state
-                val ,= value_stack
-                return val
-            else:
-                reduce(arg)
+            assert(_action is Reduce)
+            if reduce(arg):
+                return value_stack[-1]
 
 ###}

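The parser-side change follows from the new table: the end state's only action on $END is a Reduce of the $root rule, so the final loop asserts Reduce and lets reduce() signal completion, instead of looking for a Shift into the end state. A toy table-driven driver with the same shape, for the grammar S -> S 'x' | 'x'; the table here is hand-built for illustration and is not lark's parse-table format:

SHIFT, REDUCE = 'Shift', 'Reduce'

# Hand-built LALR(1) table for:  $root -> S ;  S -> S 'x' | 'x'
# State 1 holds the satisfied root item ($root -> S .), i.e. the end state.
TABLE = {
    0: {'x': (SHIFT, 2)},
    1: {'x': (SHIFT, 3), '$END': (REDUCE, ('$root', 1))},
    2: {'x': (REDUCE, ('S', 1)), '$END': (REDUCE, ('S', 1))},
    3: {'x': (REDUCE, ('S', 2)), '$END': (REDUCE, ('S', 2))},
}
GOTO = {(0, 'S'): 1}
END_STATE = 1

def parse(tokens):
    state_stack, value_stack = [0], []

    def reduce(origin, size):
        # Like the patched reduce(): report completion at the end state.
        if state_stack[-1] == END_STATE:
            return True
        values = value_stack[-size:]
        del state_stack[-size:], value_stack[-size:]
        state_stack.append(GOTO[state_stack[-1], origin])
        value_stack.append((origin, values))
        return False

    for tok in tokens:
        while True:
            action, arg = TABLE[state_stack[-1]][tok]
            if action == SHIFT:
                state_stack.append(arg)
                value_stack.append(tok)
                break
            reduce(*arg)

    # End of input: only Reduce actions remain, as in the new '$END' loop.
    while True:
        action, arg = TABLE[state_stack[-1]]['$END']
        assert action == REDUCE
        if reduce(*arg):
            return value_stack[-1]

assert parse(['x', 'x']) == ('S', [('S', ['x']), 'x'])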