
implement DeRemer and Pennello's lookahead algorithm for LALR(1)

commit 0c59cba3f5 by Raekye, 5 years ago
4 changed files with 169 additions and 404 deletions

  1. lark/grammar.py (+2, -2)
  2. lark/parsers/grammar_analysis.py (+10, -100)
  3. lark/parsers/lalr_analysis.py (+147, -285)
  4. lark/parsers/lalr_parser.py (+10, -17)

lark/grammar.py (+2, -2)

@@ -28,7 +28,7 @@ class Symbol(Serialize):


class Terminal(Symbol):
__serialize_fields__ = 'name', 'filter_out'
__serialize_fields__ = 'name', 'filter_out', '_hash'

is_term = True

@@ -44,7 +44,7 @@ class Terminal(Symbol):


class NonTerminal(Symbol):
__serialize_fields__ = 'name',
__serialize_fields__ = 'name', '_hash'

is_term = False



lark/parsers/grammar_analysis.py (+10, -100)

@@ -5,37 +5,18 @@ from ..exceptions import GrammarError
from ..grammar import Rule, Terminal, NonTerminal
import time

t_firsts = 0
t_xy = 0
t_call = 0
cache_hits = 0
cache_misses = 0

# used to be just a tuple (rp, la)
# but by making it an object,
# the hash and equality become trivial
# (slightly faster for sets which are hashtables?)
class RulePtrLookahead(object):
__slots__ = 'rp', 'la'

def __init__(self, rp, la):
self.rp = rp
self.la = la

# optimizations were made so that there should never be two distinct equal RulePtrs
# to help with hashtable lookup
class RulePtr(object):
__slots__ = ('rule', 'index', '_advance', '_lookaheads', '_next_rules_by_origin', '_first')
__slots__ = ('rule', 'index', '_advance')

def __init__(self, rule, index):
assert isinstance(rule, Rule)
assert index <= len(rule.expansion)
self.rule = rule
self.index = index
#self._hash = hash((self.rule, self.index))
#self._hash = None
self._advance = None
self._lookaheads = {}
self._next_rules_by_origin = None
self._first = None

def __repr__(self):
before = [x.name for x in self.rule.expansion[:self.index]]
@@ -59,89 +40,16 @@ class RulePtr(object):
def is_satisfied(self):
return self.index == len(self.rule.expansion)

def lookahead(self, la):
rp_la = self._lookaheads.get(la, None)
if rp_la is None:
rp_la = RulePtrLookahead(self, la)
self._lookaheads[la] = rp_la
return rp_la

def next_rules_by_origin(self, rules_by_origin):
n = self._next_rules_by_origin
if n is None:
n = rules_by_origin[self.next]
self._next_rules_by_origin = n
return n

# recursive form of lalr_analysis.py:343 (which is easier to understand IMO)
# normally avoid recursion but this allows us to cache
# each intermediate step in a corresponding RulePtr
def first(self, i, firsts, nullable, t):
global cache_hits
global cache_misses
global t_firsts
global t_xy
global t_call
t_call += time.time() - t
n = len(self.rule.expansion)
if i == n:
return ([], True)
x = self._first
t_x = time.time()
if x is None:
t0 = time.time()
t_y = time.time()
cache_misses += 1
s = self.rule.expansion[i]
l = list(firsts.get(s, []))
b = (s in nullable)
if b:
t1 = time.time()
t_firsts += t1 - t0
l_b_2 = self.advance(s).first(i + 1, firsts, nullable, time.time())
#l_b_2 = first(self.advance(self.next), i + 1, firsts, nullable, time.time())
t0 = time.time()
l.extend(l_b_2[0])
b = l_b_2[1]
x = (l, b)
self._first = x
t1 = time.time()
t_firsts += t1 - t0
else:
t_y = time.time()
cache_hits += 1
t_xy += t_y - t_x
return x

# optimizations were made so that there should never be
# two distinct equal RulePtrs
# should help set/hashtable lookups?
'''
def __eq__(self, other):
return self.rule == other.rule and self.index == other.index
def __hash__(self):
return self._hash
'''


# state generation ensures no duplicate LR0ItemSets
class LR0ItemSet(object):
__slots__ = ('kernel', 'closure', 'transitions', 'lookaheads', '_hash')
__slots__ = ('kernel', 'closure', 'transitions', 'lookaheads')

def __init__(self, kernel, closure):
self.kernel = fzset(kernel)
self.closure = fzset(closure)
self.transitions = {}
self.lookaheads = defaultdict(set)
#self._hash = hash(self.kernel)

# state generation ensures no duplicate LR0ItemSets
'''
def __eq__(self, other):
return self.kernel == other.kernel

def __hash__(self):
return self._hash
'''

def __repr__(self):
return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure]))
@@ -258,9 +166,11 @@ class GrammarAnalyzer(object):

self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)

# unused, did not help
self.lr1_cache = {}
self.lr1_cache2 = {}
self.nonterminal_transitions = []
self.directly_reads = defaultdict(set)
self.reads = defaultdict(set)
self.includes = defaultdict(set)
self.lookback = defaultdict(set)

def expand_rule(self, source_rule, rules_by_origin=None):
"Returns all init_ptrs accessible by rule (recursive)"


lark/parsers/lalr_analysis.py (+147, -285)

@@ -12,9 +12,8 @@ from collections import defaultdict, deque
from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
from ..exceptions import GrammarError

from .grammar_analysis import GrammarAnalyzer, Terminal, RulePtr, LR0ItemSet
from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
from ..grammar import Rule
from . import grammar_analysis

import time

@@ -31,15 +30,6 @@ class Action:
Shift = Action('Shift')
Reduce = Action('Reduce')

t_set_0 = 0
t_set_1 = 0
t_expand = 0
t_rules = 0
t_append = 0
t_z = 0
t_begin = 0
t_count = 0
t_call = 0

class ParseTable:
def __init__(self, states, start_states, end_states):
@@ -95,9 +85,60 @@ class IntParseTable(ParseTable):

###}


# digraph and traverse, see The Theory and Practice of Compiler Writing

# computes F(x) = G(x) union (union { G(y) | x R y })
# X: nodes
# R: relation (function mapping node -> list of nodes that satisfy the relation)
# G: set valued function
def digraph(X, R, G):
F = {}
S = []
N = {}
for x in X:
N[x] = 0
for x in X:
# this is always true for the first iteration, but N[x] may be updated in traverse below
if N[x] == 0:
traverse(x, S, N, X, R, G, F)
return F

# x: single node
# S: stack
# N: weights
# X: nodes
# R: relation (see above)
# G: set valued function
# F: set valued function we are computing (map of input -> output)
def traverse(x, S, N, X, R, G, F):
S.append(x)
d = len(S)
N[x] = d
F[x] = G(x)
for y in R(x):
if N[y] == 0:
traverse(y, S, N, X, R, G, F)
n_x = N[x]
assert(n_x > 0)
n_y = N[y]
assert(n_y != 0)
if (n_y > 0) and (n_y < n_x):
N[x] = n_y
F[x].update(F[y])
if N[x] == d:
f_x = F[x]
while True:
z = S.pop()
N[z] = -1
F[z] = f_x
if z == x:
break
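
# (Editor's note, not part of the commit) A toy run of the digraph/traverse
# pair above; the nodes, relation and G below are made up for illustration:
#
#     X = ['a', 'b', 'c']
#     edges = {'a': ['b'], 'b': ['c'], 'c': ['b']}   # b and c form a cycle (SCC)
#     R = lambda x: edges[x]
#     G = lambda x: {'a': {1}, 'b': {2}, 'c': {3}}[x]   # fresh set per call
#     F = digraph(X, R, G)
#     assert F == {'a': {1, 2, 3}, 'b': {2, 3}, 'c': {2, 3}}
#
# Members of the SCC {b, c} end up sharing one merged set, and 'a', which can
# reach that SCC, receives its union as well.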


class LALR_Analyzer(GrammarAnalyzer):

def generate_lr0_states(self):
def compute_lr0_states(self):
self.states = set()
# map of kernels to LR0ItemSets
cache = {}
@@ -125,297 +166,118 @@ class LALR_Analyzer(GrammarAnalyzer):
for _ in bfs(self.lr0_start_states.values(), step):
pass

def discover_lookaheads(self):
# lookaheads is now a member of LR0ItemSet, so don't need to look up a dictionary here
# state -> rule -> set of lookaheads
#self.lookaheads = defaultdict(lambda: defaultdict(set))
# state -> rule -> list of (set of lookaheads) to propagate to
#self.propagates = defaultdict(lambda: defaultdict(list))
self.propagates = {}

t0 = time.time()

t = Terminal('$END')
for s in self.lr0_start_states.values():
for rp in s.kernel:
#self.lookaheads[s][rp].add(Terminal('$END'))
s.lookaheads[rp].add(t)

t_closure = 0

# There is a 1-to-1 correspondence between LR0 and LALR1 states.
# We calculate the lookaheads for LALR1 kernel items from the LR0 kernel items.
# use a terminal that does not exist in the grammar
t = Terminal('$#')
for s in self.states:
p = {}
self.propagates[s] = p
for rp in s.kernel:
q = []
p[rp] = q
t2 = time.time()
z = self.generate_lr1_closure([rp.lookahead(t)], time.time())
t3 = time.time()
t_closure += t3 - t2
#for rp2, la in self.generate_lr1_closure([(rp, t)], time.time()):
for rp2_la in z:
rp2 = rp2_la.rp
la = rp2_la.la
def compute_reads_relations(self):
# handle start state
for root in self.lr0_start_states.values():
assert(len(root.kernel) == 1)
for rp in root.kernel:
assert(rp.index == 0)
self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ])

for state in self.states:
seen = set()
for rp in state.closure:
if rp.is_satisfied:
continue
s = rp.next
# if s is not a nonterminal
if not s in self.lr0_rules_by_origin:
continue
if s in seen:
continue
seen.add(s)
nt = (state, s)
self.nonterminal_transitions.append(nt)
dr = self.directly_reads[nt]
r = self.reads[nt]
next_state = state.transitions[s]
for rp2 in next_state.closure:
if rp2.is_satisfied:
continue
next_symbol = rp2.next
next_state = s.transitions[next_symbol]
rp3 = rp2.advance(next_symbol)
assert(rp3 in next_state.kernel)
#x = self.lookaheads[next_state][rp3]
x = next_state.lookaheads[rp3]
if la == t:
# we must propagate rp's lookaheads to rp3's lookahead set
q.append(x)
s2 = rp2.next
# if s2 is a terminal
if not s2 in self.lr0_rules_by_origin:
dr.add(s2)
if s2 in self.NULLABLE:
r.add((next_state, s2))
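
# (Editor's note) In DeRemer-Pennello terms, for a nonterminal transition
# (p, A) -- a state p with an outgoing edge labeled by nonterminal A -- the
# sets built above are:
#
#     DR(p, A)  =  { t in T | p --A--> r --t--> }
#     (p, A) reads (r, C)   iff   p --A--> r --C-->  and  C is nullable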

def compute_read_sets(self):
R = lambda nt: self.reads[nt]
G = lambda nt: self.directly_reads[nt]
self.read_sets = digraph(self.nonterminal_transitions, R, G)
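
# (Editor's note) This digraph call solves the paper's set equation
#
#     Read(p, A) = DR(p, A)  union  { Read(r, C) | (p, A) reads (r, C) }
#
# with SCCs of the reads relation sharing one merged set.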

def compute_includes_lookback(self):
for nt in self.nonterminal_transitions:
state, nonterminal = nt
includes = []
lookback = self.lookback[nt]
for rp in state.closure:
if rp.rule.origin != nonterminal:
continue
# traverse the states for rp(.rule)
state2 = state
for i in range(rp.index, len(rp.rule.expansion)):
s = rp.rule.expansion[i]
nt2 = (state2, s)
state2 = state2.transitions[s]
if not nt2 in self.reads:
continue
j = i + 1
for j in range(i + 1, len(rp.rule.expansion)):
if not rp.rule.expansion[j] in self.NULLABLE:
break
else:
# this lookahead is "generated spontaneously" for rp3
x.add(la)

t1 = time.time()
print('Discovering took {:.3f} (generating closure), {:.3f} (total)'.format(t_closure, t1 - t0))

def propagate_lookaheads(self):
changed = True
while changed:
changed = False
for s in self.states:
for rp in s.kernel:
# from (from is a keyword)
#f = self.lookaheads[s][rp]
f = s.lookaheads[rp]
# to
t = self.propagates[s][rp]
for x in t:
old = len(x)
x |= f
changed = changed or (len(x) != old)

def generate_lalr1_states(self):
t0 = time.time()
# 1-to-1 correspondence between LR0 and LALR1 states
# We must fetch the lookaheads we calculated,
# to create the LALR1 kernels from the LR0 kernels.
# Then, we generate the LALR1 states by taking the LR1 closure of the new kernel items.
# map of LR0 states to LALR1 states
includes.append(nt2)
# state2 is at the final state for rp.rule
if rp.index == 0:
for rp2 in state2.closure:
if (rp2.rule == rp.rule) and rp2.is_satisfied:
lookback.add((state2, rp2.rule))
for nt2 in includes:
self.includes[nt2].add(nt)
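
# (Editor's note) The relations built above, as defined in the paper:
#
#     (p, A) includes (p', B)      iff   B -> beta A gamma,  gamma nullable,
#                                        and p' --beta--> p
#     (q, A -> w) lookback (p, A)  iff   p --w--> q
#
# The inner for/else checks that gamma (the rest of the expansion) is nullable.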

def compute_follow_sets(self):
R = lambda nt: self.includes[nt]
G = lambda nt: self.read_sets[nt]
self.follow_sets = digraph(self.nonterminal_transitions, R, G)
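
# (Editor's note) This digraph call solves
#
#     Follow(p, A) = Read(p, A)  union  { Follow(p', B) | (p, A) includes (p', B) }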

def compute_lookaheads(self):
for nt, lookbacks in self.lookback.items():
for state, rule in lookbacks:
for s in self.follow_sets[nt]:
state.lookaheads[s].add(rule)
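
# (Editor's note) This realizes
#
#     LA(q, A -> w) = union{ Follow(p, A) | (q, A -> w) lookback (p, A) }
#
# stored inverted as terminal -> set of reducible rules, which is the shape
# the reduce-action construction below wants.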

def compute_lalr1_states(self):
m = {}
t_closure = 0
z = 0
for s in self.states:
z = max(z, len(s.closure))
kernel = []
for rp in s.kernel:
#las = self.lookaheads[s][rp]
las = s.lookaheads[rp]
assert(len(las) > 0)
for la in las:
kernel.append(rp.lookahead(la))
t0_0 = time.time()
m[s] = self.generate_lr1_closure(kernel, time.time())
t0_1 = time.time()
t_closure += t0_1 - t0_0

print('Generating lalr1 closure for lalr kernels took {:.3f}'.format(t_closure))
print('Max lr0 state size was {}'.format(z))

t1 = time.time()

self.states = {}
for s, v in m.items():
for state in self.states:
actions = {}
for la, next_state in s.transitions.items():
for la, next_state in state.transitions.items():
actions[la] = (Shift, next_state.closure)

sat, _ = classify_bool(v, lambda x: x.rp.is_satisfied)
reductions = classify(sat, lambda x: x.la, lambda x: x.rp)
for la, rps in reductions.items():
if len(rps) > 1:
raise GrammarError("Collision in %s: %s" % (la, ', '.join([ str(r.rule) for r in rps ])))
for la, rules in state.lookaheads.items():
if len(rules) > 1:
raise GrammarError('Collision in %s: %s' % (la, ', '.join([ str(r) for r in rules ])))
if la in actions:
if self.debug:
logging.warning("Shift/reduce conflict for terminal %s: (resolving as shift)", la.name)
logging.warning(' * %s', str(rps[0]))
logging.warning('Shift/reduce conflict for terminal %s: (resolving as shift)', la.name)
logging.warning(' * %s', list(rules)[0])
else:
actions[la] = (Reduce, rps[0].rule)
actions[la] = (Reduce, list(rules)[0])
m[state] = { k.name: v for k, v in actions.items() }

self.states[s.closure] = {k.name: v for k, v in actions.items()}

t2 = time.time()
self.states = { k.closure: v for k, v in m.items() }

# compute end states
end_states = {}
for s in self.states:
for rp in s:
for state in self.states:
for rp in state:
for start in self.lr0_start_states:
if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied:
assert(not start in end_states)
end_states[start] = s

t3 = time.time()
end_states[start] = state

self._parse_table = ParseTable(self.states, {start: state.closure for start, state in self.lr0_start_states.items()}, end_states)

t4 = time.time()
self._parse_table = ParseTable(self.states, { start: state.closure for start, state in self.lr0_start_states.items() }, end_states)

if self.debug:
self.parse_table = self._parse_table
else:
self.parse_table = IntParseTable.from_ParseTable(self._parse_table)

t5 = time.time()

print(('Generating lalr1 states took ' + ', '.join([ '{:.3f}' ] * 5)).format(t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4))
print('Generating firsts took {:.3f} (time actually calculating), {:.3f} (end to end), {:.3f} (just function call)'.format(grammar_analysis.t_firsts, grammar_analysis.t_xy, grammar_analysis.t_call))

def generate_lr1_closure(self, kernel, t_caller):
global t_call
global t_set_0
global t_set_1
global t_expand
global t_rules
global t_append
global t_z
global t_begin
global t_count

t_start = time.time()
t_call += t_start - t_caller

# cache the results of this function
# not many hits, no noticeable performance improvement
'''
k = fzset(kernel)
cached = self.lr1_cache.get(k, None)
if not cached is None:
return cached
'''

closure = set()
closure_hash = {}

y = 0

q = list(kernel)
while len(q) > 0:
t_a = time.time()
rp_la = q.pop()
#rp_la_hash = hash(rp_la)
t0 = time.time()
t_begin += t0 - t_a
# try to manually maintain hashtable,
# as a set of just hashes (ints) was notably faster
'''
if rp_la_hash in closure_hash:
if rp_la in closure_hash[rp_la_hash]:
t0_0 = time.time()
t_set_0 += t0_0 - t0
continue
t0_0 = time.time()
t_set_0 += t0_0 - t0
else:
closure_hash[rp_la_hash] = []
'''
if rp_la in closure:
t0_0 = time.time()
t_set_0 += t0_0 - t0
continue
t0_0 = time.time()
closure.add(rp_la)
#closure_hash[rp_la_hash].append(rp_la)
t1 = time.time()
t_set_0 += t0_0 - t0
t_set_1 += t1 - t0_0
rp = rp_la.rp
la = rp_la.la

if rp.is_satisfied:
continue
if rp.next.is_term:
continue

t2 = time.time()

# cache these calculations inside each RulePtr
# see grammar_analysis.py:79
l = []
'''
i = rp.index + 1
n = len(rp.rule.expansion)
l2_i = self.lr1_cache2.get((rp.rule, i), None)
l2 = []
if l2_i is None:
while i < n:
s = rp.rule.expansion[i]
l2.extend(self.FIRST.get(s, []))
if not s in self.NULLABLE:
break
i += 1
self.lr1_cache2[(rp.rule, i)] = (l2, i)
else:
l2 = l2_i[0]
i = l2_i[1]

l.extend(l2)
'''
# this function call seems really slow (see grammar_analysis.t_call above)
# tried making it not a method call so don't need to look up vtable
# still equally slow
l2, nullable = rp.first(rp.index + 1, self.FIRST, self.NULLABLE, time.time())
#l2, nullable = grammar_analysis.first(rp, rp.index + 1, self.FIRST, self.NULLABLE, time.time())
#l.extend(l2)
l = l2
t3 = time.time()

t_expand += t3 - t2

# if we don't modify l2 and add an extra check in the loop below,
# we don't have to copy it
# if all of rp.rule.expansion[rp.index + 1:] were nullable:
#if nullable:
# l.append(la)

t4 = time.time()
x = rp.next_rules_by_origin(self.lr0_rules_by_origin)
t5 = time.time()

# usually between 20-60? seen as high as ~175
y = max(y, len(x) * len(l))
#print('adding {} * {} rules to closure max {}'.format(len(x), len(l), y))
for r in x:
for s in l:
# cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
# cache r._rp in _rp (1 less object property lookup?)
_rp = r._rp
if _rp is None:
_rp = RulePtr(r, 0)
r._rp = _rp
q.append(_rp.lookahead(s))
#q.append((r._rp, s))
if nullable:
_rp = r._rp
if _rp is None:
_rp = RulePtr(r, 0)
r._rp = _rp
q.append(_rp.lookahead(la))
#q.append((r._rp, la))

t6 = time.time()
t_rules += t5 - t4
t_append += t6 - t5

#self.lr1_cache[k] = closure

t_end = time.time()
t_z += t_end - t_start

t_count += 1

if t_count % 1000 == 0:
print('\tGenerating lr1 closure took begin {:.3f}, set contains {:.3f}, set add {:.3f}, get first {:.3f}'.format(t_begin, t_set_0, t_set_1, t_expand))
print('\tget next rules {:.3f}, append rules {:.3f}, total {:.3f}, call time {:.3f}, count {}'.format(t_rules, t_append, t_z, t_call, t_count))
print('\tmax number of appends {}'.format(y))

return closure

lark/parsers/lalr_parser.py (+10, -17)

@@ -17,20 +17,13 @@ class LALR_Parser(object):
assert all(r.options is None or r.options.priority is None
for r in parser_conf.rules), "LALR doesn't yet support prioritization"
analysis = LALR_Analyzer(parser_conf, debug=debug)
t0 = time.time()
analysis.generate_lr0_states()
t1 = time.time()
analysis.discover_lookaheads()
t2 = time.time()
analysis.propagate_lookaheads()
t3 = time.time()
analysis.generate_lalr1_states()
t4 = time.time()
print('Generating lr0 states took {:.3f}'.format(t1 - t0))
print('Discovering lookaheads took {:.3f}'.format(t2 - t1))
print('Propagating lookaheads took {:.3f}'.format(t3 - t2))
print('Generating lalr states (closure) took {:.3f}'.format(t4 - t3))
print('-' * 32)
analysis.compute_lr0_states()
analysis.compute_reads_relations()
analysis.compute_read_sets()
analysis.compute_includes_lookback()
analysis.compute_follow_sets()
analysis.compute_lookaheads()
analysis.compute_lalr1_states()
callbacks = parser_conf.callbacks

self._parse_table = analysis.parse_table
@@ -80,9 +73,6 @@ class _Parser:
raise UnexpectedToken(token, expected, state=state)

def reduce(rule):
if state_stack[-1] == end_state:
return True

size = len(rule.expansion)
if size:
s = value_stack[-size:]
@@ -98,6 +88,9 @@ class _Parser:
state_stack.append(new_state)
value_stack.append(value)

if state_stack[-1] == end_state:
return True

return False
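
# (Editor's note) The accept check moved: instead of accepting on entry to
# reduce() when the stack is already at the end state, the parser now performs
# the reduction and its goto first, then accepts if that lands on the end state.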

# Main LALR-parser loop

