Sfoglia il codice sorgente

Another refactoring step

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan 7 anni fa
parent
commit
b95567c4a7
3 ha cambiato i file con 214 aggiunte e 213 eliminazioni
  1. +1
    -1
      lark/parsers/earley2.py
  2. +212
    -0
      lark/parsers/grammar_analysis.py
  3. +1
    -212
      lark/parsers/lalr_analysis.py

+ 1
- 1
lark/parsers/earley2.py Vedi File

@@ -1,5 +1,5 @@
from ..common import ParseError, UnexpectedToken, is_terminal
from lalr_analysis import GrammarAnalyzer
from grammar_analysis import GrammarAnalyzer

from ..tree import Tree



+ 212
- 0
lark/parsers/grammar_analysis.py Vedi File

@@ -0,0 +1,212 @@
import logging
from collections import defaultdict, deque

from ..utils import classify, classify_bool, bfs, fzset
from ..common import GrammarError, is_terminal

ACTION_SHIFT = 0

class Rule(object):
"""
origin : a symbol
expansion : a list of symbols
"""
def __init__(self, origin, expansion, alias=None):
self.origin = origin
self.expansion = expansion
self.alias = alias

def __repr__(self):
return '<%s : %s>' % (self.origin, ' '.join(self.expansion))

class RulePtr(object):
def __init__(self, rule, index):
assert isinstance(rule, Rule)
assert index <= len(rule.expansion)
self.rule = rule
self.index = index

def __repr__(self):
before = self.rule.expansion[:self.index]
after = self.rule.expansion[self.index:]
return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after))

@property
def next(self):
return self.rule.expansion[self.index]

def advance(self, sym):
assert self.next == sym
return RulePtr(self.rule, self.index+1)

@property
def is_satisfied(self):
return self.index == len(self.rule.expansion)

def __eq__(self, other):
return self.rule == other.rule and self.index == other.index
def __hash__(self):
return hash((self.rule, self.index))


def pairs(lst):
return zip(lst[:-1], lst[1:])

def update_set(set1, set2):
copy = set(set1)
set1 |= set2
return set1 != copy

def calculate_sets(rules):
"""Calculate FOLLOW sets.

Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
symbols.add('$root') # what about other unused rules?

# foreach grammar rule X ::= Y(1) ... Y(k)
# if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
# NULLABLE = NULLABLE union {X}
# for i = 1 to k
# if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
# FIRST(X) = FIRST(X) union FIRST(Y(i))
# for j = i+1 to k
# if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
# if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
# until none of NULLABLE,FIRST,FOLLOW changed in last iteration

NULLABLE = set()
FIRST = {}
FOLLOW = {}
for sym in symbols:
FIRST[sym]={sym} if is_terminal(sym) else set()
FOLLOW[sym]=set()

changed = True
while changed:
changed = False

for rule in rules:
if set(rule.expansion) <= NULLABLE:
if update_set(NULLABLE, {rule.origin}):
changed = True

for i, sym in enumerate(rule.expansion):
if set(rule.expansion[:i]) <= NULLABLE:
if update_set(FIRST[rule.origin], FIRST[sym]):
changed = True
if i==len(rule.expansion)-1 or set(rule.expansion[i:]) <= NULLABLE:
if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
changed = True

for j in range(i+1, len(rule.expansion)):
if set(rule.expansion[i+1:j]) <= NULLABLE:
if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
changed = True

return FIRST, FOLLOW, NULLABLE


class GrammarAnalyzer(object):
def __init__(self, rule_tuples, start_symbol, debug=False):
self.start_symbol = start_symbol
self.debug = debug
rule_tuples = list(rule_tuples)
rule_tuples.append(('$root', [start_symbol, '$end']))
rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]

self.rules = set()
self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
for origin, exp, alias in rule_tuples:
r = Rule( origin, exp, alias )
self.rules.add(r)
self.rules_by_origin[origin].append(r)

for r in self.rules:
for sym in r.expansion:
if not (is_terminal(sym) or sym in self.rules_by_origin):
raise GrammarError("Using an undefined rule: %s" % sym)

self.init_state = self.expand_rule(start_symbol)

self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules)

def expand_rule(self, rule):
"Returns all init_ptrs accessible by rule (recursive)"
init_ptrs = set()
def _expand_rule(rule):
assert not is_terminal(rule)

for r in self.rules_by_origin[rule]:
init_ptr = RulePtr(r, 0)
init_ptrs.add(init_ptr)

if r.expansion: # if not empty rule
new_r = init_ptr.next
if not is_terminal(new_r):
yield new_r

_ = list(bfs([rule], _expand_rule))

return fzset(init_ptrs)

def _first(self, r):
if is_terminal(r):
return {r}
else:
return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)}

def analyze(self):

self.states = {}
def step(state):
lookahead = defaultdict(list)
sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
for rp in sat:
for term in self.FOLLOW.get(rp.rule.origin, ()):
lookahead[term].append(('reduce', rp.rule))

d = classify(unsat, lambda rp: rp.next)
for sym, rps in d.items():
rps = {rp.advance(sym) for rp in rps}

for rp in set(rps):
if not rp.is_satisfied and not is_terminal(rp.next):
rps |= self.expand_rule(rp.next)

lookahead[sym].append(('shift', fzset(rps)))
yield fzset(rps)

for k, v in lookahead.items():
if len(v) > 1:
if self.debug:
logging.warn("Shift/reduce conflict for %s: %s. Resolving as shift.", k, v)
for x in v:
# XXX resolving shift/reduce into shift, like PLY
# Give a proper warning
if x[0] == 'shift':
lookahead[k] = [x]

for k, v in lookahead.items():
assert len(v) == 1, ("Collision", k, v)

self.states[state] = {k:v[0] for k, v in lookahead.items()}

for _ in bfs([self.init_state], step):
pass

# --
self.enum = list(self.states)
self.enum_rev = {s:i for i,s in enumerate(self.enum)}
self.states_idx = {}

for s, la in self.states.items():
la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift'
else (v[0], (v[1], len(v[1].expansion))) # Reduce
for k,v in la.items()}
self.states_idx[ self.enum_rev[s] ] = la


self.init_state_idx = self.enum_rev[self.init_state]

+ 1
- 212
lark/parsers/lalr_analysis.py Vedi File

@@ -1,212 +1 @@
import logging
from collections import defaultdict, deque

from ..utils import classify, classify_bool, bfs, fzset
from ..common import GrammarError, is_terminal

ACTION_SHIFT = 0

class Rule(object):
"""
origin : a symbol
expansion : a list of symbols
"""
def __init__(self, origin, expansion, alias=None):
self.origin = origin
self.expansion = expansion
self.alias = alias

def __repr__(self):
return '<%s : %s>' % (self.origin, ' '.join(self.expansion))

class RulePtr(object):
def __init__(self, rule, index):
assert isinstance(rule, Rule)
assert index <= len(rule.expansion)
self.rule = rule
self.index = index

def __repr__(self):
before = self.rule.expansion[:self.index]
after = self.rule.expansion[self.index:]
return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after))

@property
def next(self):
return self.rule.expansion[self.index]

def advance(self, sym):
assert self.next == sym
return RulePtr(self.rule, self.index+1)

@property
def is_satisfied(self):
return self.index == len(self.rule.expansion)

def __eq__(self, other):
return self.rule == other.rule and self.index == other.index
def __hash__(self):
return hash((self.rule, self.index))


def pairs(lst):
return zip(lst[:-1], lst[1:])

def update_set(set1, set2):
copy = set(set1)
set1 |= set2
return set1 != copy

def calculate_sets(rules):
"""Calculate FOLLOW sets.

Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
symbols = {sym for rule in rules for sym in rule.expansion} | {rule.origin for rule in rules}
symbols.add('$root') # what about other unused rules?

# foreach grammar rule X ::= Y(1) ... Y(k)
# if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
# NULLABLE = NULLABLE union {X}
# for i = 1 to k
# if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
# FIRST(X) = FIRST(X) union FIRST(Y(i))
# for j = i+1 to k
# if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
# if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
# until none of NULLABLE,FIRST,FOLLOW changed in last iteration

NULLABLE = set()
FIRST = {}
FOLLOW = {}
for sym in symbols:
FIRST[sym]={sym} if is_terminal(sym) else set()
FOLLOW[sym]=set()

changed = True
while changed:
changed = False

for rule in rules:
if set(rule.expansion) <= NULLABLE:
if update_set(NULLABLE, {rule.origin}):
changed = True

for i, sym in enumerate(rule.expansion):
if set(rule.expansion[:i]) <= NULLABLE:
if update_set(FIRST[rule.origin], FIRST[sym]):
changed = True
if i==len(rule.expansion)-1 or set(rule.expansion[i:]) <= NULLABLE:
if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
changed = True

for j in range(i+1, len(rule.expansion)):
if set(rule.expansion[i+1:j]) <= NULLABLE:
if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
changed = True

return FIRST, FOLLOW, NULLABLE


class GrammarAnalyzer(object):
def __init__(self, rule_tuples, start_symbol, debug=False):
self.start_symbol = start_symbol
self.debug = debug
rule_tuples = list(rule_tuples)
rule_tuples.append(('$root', [start_symbol, '$end']))
rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]

self.rules = set()
self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
for origin, exp, alias in rule_tuples:
r = Rule( origin, exp, alias )
self.rules.add(r)
self.rules_by_origin[origin].append(r)

for r in self.rules:
for sym in r.expansion:
if not (is_terminal(sym) or sym in self.rules_by_origin):
raise GrammarError("Using an undefined rule: %s" % sym)

self.init_state = self.expand_rule(start_symbol)

self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules)

def expand_rule(self, rule):
"Returns all init_ptrs accessible by rule (recursive)"
init_ptrs = set()
def _expand_rule(rule):
assert not is_terminal(rule)

for r in self.rules_by_origin[rule]:
init_ptr = RulePtr(r, 0)
init_ptrs.add(init_ptr)

if r.expansion: # if not empty rule
new_r = init_ptr.next
if not is_terminal(new_r):
yield new_r

_ = list(bfs([rule], _expand_rule))

return fzset(init_ptrs)

def _first(self, r):
if is_terminal(r):
return {r}
else:
return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)}

def analyze(self):

self.states = {}
def step(state):
lookahead = defaultdict(list)
sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
for rp in sat:
for term in self.FOLLOW.get(rp.rule.origin, ()):
lookahead[term].append(('reduce', rp.rule))

d = classify(unsat, lambda rp: rp.next)
for sym, rps in d.items():
rps = {rp.advance(sym) for rp in rps}

for rp in set(rps):
if not rp.is_satisfied and not is_terminal(rp.next):
rps |= self.expand_rule(rp.next)

lookahead[sym].append(('shift', fzset(rps)))
yield fzset(rps)

for k, v in lookahead.items():
if len(v) > 1:
if self.debug:
logging.warn("Shift/reduce conflict for %s: %s. Resolving as shift.", k, v)
for x in v:
# XXX resolving shift/reduce into shift, like PLY
# Give a proper warning
if x[0] == 'shift':
lookahead[k] = [x]

for k, v in lookahead.items():
assert len(v) == 1, ("Collision", k, v)

self.states[state] = {k:v[0] for k, v in lookahead.items()}

for _ in bfs([self.init_state], step):
pass

# --
self.enum = list(self.states)
self.enum_rev = {s:i for i,s in enumerate(self.enum)}
self.states_idx = {}

for s, la in self.states.items():
la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift'
else (v[0], (v[1], len(v[1].expansion))) # Reduce
for k,v in la.items()}
self.states_idx[ self.enum_rev[s] ] = la


self.init_state_idx = self.enum_rev[self.init_state]
from grammar_analysis import GrammarAnalyzer, ACTION_SHIFT

Caricamento…
Annulla
Salva