@@ -0,0 +1,148 @@ | |||
"My name is Earley" | |||
from .utils import classify | |||
class MatchFailed(object): | |||
pass | |||
class AbortParseMatch(Exception): | |||
pass | |||
class Rule(object): | |||
def __init__(self, name, symbols, postprocess): | |||
self.name = name | |||
self.symbols = symbols | |||
self.postprocess = postprocess | |||
class State(object): | |||
def __init__(self, rule, expect, reference, data=None): | |||
self.rule = rule | |||
self.expect = expect | |||
self.reference = reference | |||
self.data = data or [] | |||
self.is_complete = (self.expect == len(self.rule.symbols)) | |||
if not self.is_complete: | |||
self.expect_symbol = self.rule.symbols[self.expect] | |||
self.is_literal = isinstance(self.expect_symbol, dict) | |||
if self.is_literal: | |||
self.expect_symbol = self.expect_symbol['literal'] | |||
assert isinstance(self.expect_symbol, (str, unicode)), self.expect_symbol | |||
def next_state(self, data): | |||
return State(self.rule, self.expect+1, self.reference, self.data + [data]) | |||
def consume_terminal(self, inp): | |||
if not self.is_complete and self.is_literal: | |||
            # PORT: the original implementation tests the input against a regexp here;
            # this port compares the expected literal against the token's type instead
if self.expect_symbol == inp.type: | |||
return self.next_state(inp) | |||
def consume_nonterminal(self, inp): | |||
if not self.is_complete and not self.is_literal: | |||
if self.expect_symbol == inp: | |||
return self.next_state(inp) | |||
def process(self, location, ind, table, rules, added_rules): | |||
if self.is_complete: | |||
# Completed a rule | |||
if self.rule.postprocess: | |||
try: | |||
# self.data = self.rule.postprocess(self.data, self.reference) | |||
self.data = self.rule.postprocess(self.data) | |||
except AbortParseMatch: | |||
self.data = MatchFailed | |||
if self.data is not MatchFailed: | |||
for s in table[self.reference]: | |||
x = s.consume_nonterminal(self.rule.name) | |||
if x: | |||
x.data[-1] = self.data | |||
x.epsilon_closure(location, ind, table) | |||
else: | |||
exp = self.rule.symbols[self.expect] | |||
if isinstance(exp, dict): | |||
return | |||
for r in rules[exp]: | |||
assert r.name == exp | |||
if r not in added_rules: | |||
if r.symbols: | |||
added_rules.add(r) | |||
State(r, 0, location).epsilon_closure(location, ind, table) | |||
else: | |||
# Empty rule | |||
new_copy = self.consume_nonterminal(r.name) | |||
if r.postprocess: | |||
new_copy.data[-1] = r.postprocess([]) | |||
# new_copy.data[-1] = r.postprocess([], self.reference) | |||
else: | |||
new_copy.data[-1] = [] | |||
new_copy.epsilon_closure(location, ind, table) | |||
def epsilon_closure(self, location, ind, table, result=None): | |||
col = table[location] | |||
if not result: | |||
result = col | |||
result.append(self) | |||
if not self.is_complete: | |||
for i in xrange(ind): | |||
state = col[i] | |||
if state.is_complete and state.reference == location: | |||
x = self.consume_nonterminal(state.rule.name) | |||
if x: | |||
x.data[-1] = state.data | |||
x.epsilon_closure(location, ind, table) | |||
class Parser(object): | |||
def __init__(self, rules, start=None): | |||
self.table = [[]] | |||
self.rules = [Rule(r['name'], r['symbols'], r.get('postprocess', None)) for r in rules] | |||
self.rules_by_name = classify(self.rules, lambda r: r.name) | |||
self.start = start or self.rules[0].name | |||
initial_rules = set(self.rules_by_name[self.start]) | |||
self.table[0] += [State(r, 0, 0) for r in initial_rules] | |||
self.advance_to(0, initial_rules) | |||
self.current = 0 | |||
def advance_to(self, n, added_rules): | |||
for w, s in enumerate(self.table[n]): | |||
s.process(n, w, self.table, self.rules_by_name, added_rules) | |||
def parse(self, chunk): | |||
chunk_pos = 0 | |||
for chunk_pos, chunk_item in enumerate(chunk): | |||
self.table.append([]) | |||
for s in self.table[self.current + chunk_pos]: | |||
x = s.consume_terminal(chunk_item) | |||
if x: | |||
self.table[self.current + chunk_pos + 1].append(x) | |||
added_rules = set() | |||
self.advance_to(self.current + chunk_pos + 1, added_rules) | |||
if not self.table[-1]: | |||
                # PORT: tokens carry no line/column info yet, so report the offending token itself
                raise Exception('Parse error at token %r (position %s)'
                                % (chunk[chunk_pos], getattr(chunk[chunk_pos], 'pos_in_stream', '?')))
        self.current += len(chunk)  # one table column was added per input token
return list(self.finish()) | |||
def finish(self): | |||
for t in self.table[-1]: | |||
if (t.rule.name == self.start | |||
and t.expect == len(t.rule.symbols) | |||
and t.reference == 0 | |||
and t.data != MatchFailed): | |||
yield t.data | |||
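# Usage sketch (an annotation, not part of the original file): the rule format
# below mirrors what lark.Earley.build_parser produces -- terminals are written
# as {'literal': TOKEN_TYPE} and matched against token.type. _DemoToken is a
# hypothetical stand-in for lexer.Token, used only for illustration.
def _earley_usage_sketch():
    class _DemoToken(object):
        def __init__(self, type_, value):
            self.type = type_
            self.value = value
        def __repr__(self):
            return '%s(%r)' % (self.type, self.value)
    rules = [
        {'name': 'sum',
         'symbols': ['sum', {'literal': 'PLUS'}, {'literal': 'NUM'}],
         'postprocess': lambda d: d[0] + int(d[2].value)},
        {'name': 'sum',
         'symbols': [{'literal': 'NUM'}],
         'postprocess': lambda d: int(d[0].value)},
    ]
    tokens = [_DemoToken('NUM', '1'), _DemoToken('PLUS', '+'), _DemoToken('NUM', '2')]
    return Parser(rules, 'sum').parse(tokens)   # expected to return [3]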
@@ -0,0 +1,59 @@ | |||
from lark.tree import Transformer | |||
from lark.lark import Lark | |||
calc_grammar = """ | |||
?start: sum | |||
| NAME "=" sum -> *assign_var | |||
?sum: product | |||
| sum "+" product -> *add | |||
| sum "-" product -> *sub | |||
?product: atom | |||
| product "*" atom -> *mul | |||
| product "/" atom -> *div | |||
?atom: /[\d.]+/ -> *number | |||
| "-" atom -> *neg | |||
| NAME -> *var | |||
| "(" sum ")" | |||
NAME: /\w+/ | |||
WS.ignore: /\s+/ | |||
""" | |||
class CalculateTree(Transformer): | |||
from operator import add, sub, mul, div, neg | |||
number = float | |||
def __init__(self): | |||
self.vars = {} | |||
def assign_var(self, name, value): | |||
self.vars[name] = value | |||
return value | |||
def var(self, name): | |||
return self.vars[name] | |||
calc_parser = Lark(calc_grammar, parser='lalr', transformer=CalculateTree()) | |||
calc = calc_parser.parse | |||
def main(): | |||
while True: | |||
try: | |||
s = raw_input('> ') | |||
except EOFError: | |||
break | |||
print(calc(s)) | |||
def test(): | |||
    print(calc("a = 1+2"))
    print(calc("1+a*-3"))
if __name__ == '__main__': | |||
test() | |||
# main() | |||
@@ -0,0 +1,62 @@ | |||
import sys | |||
from lark.lark import Lark | |||
from lark.tree import Transformer | |||
json_grammar = r""" | |||
?start: value | |||
?value: object | |||
| array | |||
| string | |||
| number | |||
| "true" -> *true | |||
| "false" -> *false | |||
| "null" -> *null | |||
array : "[" [value ("," value)*] "]" | |||
object : "{" [pair ("," pair)*] "}" | |||
pair : string ":" value | |||
*number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/ | |||
*string : /".*?(?<!\\)"/ | |||
WS.ignore.newline: /[ \t\n]+/ | |||
""" | |||
class TreeToJson(Transformer): | |||
def string(self, s): | |||
return s[1:-1] | |||
array = list | |||
pair = tuple | |||
object = dict | |||
number = float | |||
null = lambda self: None | |||
true = lambda self: True | |||
false = lambda self: False | |||
json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson()) | |||
def test(): | |||
test_json = ''' | |||
{ | |||
"empty_object" : {}, | |||
"empty_array" : [], | |||
"booleans" : { "YES" : true, "NO" : false }, | |||
"numbers" : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ], | |||
"strings" : [ "This", [ "And" , "That" ] ], | |||
"nothing" : null | |||
} | |||
''' | |||
j = json_parser.parse(test_json) | |||
print j | |||
import json | |||
assert j == json.loads(test_json) | |||
if __name__ == '__main__': | |||
test() | |||
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as f:
            print json_parser.parse(f.read())
@@ -0,0 +1,207 @@ | |||
from collections import defaultdict, deque | |||
from utils import classify, classify_bool, bfs, fzset | |||
ACTION_SHIFT = 0 | |||
class GrammarError(Exception): | |||
pass | |||
def is_terminal(sym): | |||
return sym.isupper() or sym[0] == '$' | |||
class Rule(object): | |||
""" | |||
origin : a symbol | |||
expansion : a list of symbols | |||
""" | |||
def __init__(self, origin, expansion, alias=None): | |||
assert expansion, "No support for empty rules" | |||
self.origin = origin | |||
self.expansion = expansion | |||
self.alias = alias | |||
def __repr__(self): | |||
return '<%s : %s>' % (self.origin, ' '.join(self.expansion)) | |||
class RulePtr(object): | |||
def __init__(self, rule, index): | |||
assert isinstance(rule, Rule) | |||
assert index <= len(rule.expansion) | |||
self.rule = rule | |||
self.index = index | |||
def __repr__(self): | |||
before = self.rule.expansion[:self.index] | |||
after = self.rule.expansion[self.index:] | |||
return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after)) | |||
@property | |||
def next(self): | |||
return self.rule.expansion[self.index] | |||
def advance(self, sym): | |||
assert self.next == sym | |||
return RulePtr(self.rule, self.index+1) | |||
@property | |||
def is_satisfied(self): | |||
return self.index == len(self.rule.expansion) | |||
def __eq__(self, other): | |||
return self.rule == other.rule and self.index == other.index | |||
def __hash__(self): | |||
return hash((self.rule, self.index)) | |||
def pairs(lst): | |||
return zip(lst[:-1], lst[1:]) | |||
def update_set(set1, set2): | |||
copy = set(set1) | |||
set1 |= set2 | |||
return set1 != copy | |||
class GrammarAnalyzer(object): | |||
def __init__(self, rule_tuples): | |||
rule_tuples = list(rule_tuples) | |||
rule_tuples.append(('$root', ['start', '$end'])) | |||
rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples] | |||
self.rules = set() | |||
self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples} | |||
for origin, exp, alias in rule_tuples: | |||
r = Rule( origin, exp, alias ) | |||
self.rules.add(r) | |||
self.rules_by_origin[origin].append(r) | |||
for r in self.rules: | |||
for sym in r.expansion: | |||
if not (is_terminal(sym) or sym in self.rules_by_origin): | |||
raise GrammarError("Using an undefined rule: %s" % sym) | |||
self.init_state = self.expand_rule('start') | |||
def expand_rule(self, rule): | |||
"Returns all init_ptrs accessible by rule (recursive)" | |||
init_ptrs = set() | |||
def _expand_rule(rule): | |||
assert not is_terminal(rule) | |||
for r in self.rules_by_origin[rule]: | |||
init_ptr = RulePtr(r, 0) | |||
init_ptrs.add(init_ptr) | |||
new_r = init_ptr.next | |||
if not is_terminal(new_r): | |||
yield new_r | |||
_ = list(bfs([rule], _expand_rule)) | |||
return fzset(init_ptrs) | |||
def _first(self, r): | |||
if is_terminal(r): | |||
return {r} | |||
else: | |||
return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)} | |||
def _calc(self): | |||
"""Calculate FOLLOW sets. | |||
Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets""" | |||
symbols = {sym for rule in self.rules for sym in rule.expansion} | |||
symbols.add('$root') # what about other unused rules? | |||
# foreach grammar rule X ::= Y(1) ... Y(k) | |||
# if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then | |||
# NULLABLE = NULLABLE union {X} | |||
# for i = 1 to k | |||
# if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then | |||
# FIRST(X) = FIRST(X) union FIRST(Y(i)) | |||
# for j = i+1 to k | |||
# if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then | |||
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X) | |||
# if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then | |||
# FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j)) | |||
# until none of NULLABLE,FIRST,FOLLOW changed in last iteration | |||
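        # Worked example (annotation): for the toy grammar
        #     $root : start $end
        #     start : A start | B        (A and B are terminal tokens)
        # the fixpoint below yields NULLABLE = {}, FIRST(start) = {A, B},
        # FOLLOW(start) = FOLLOW(B) = {$end} and FOLLOW(A) = {A, B}.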
NULLABLE = set() | |||
FIRST = {} | |||
FOLLOW = {} | |||
for sym in symbols: | |||
FIRST[sym]={sym} if is_terminal(sym) else set() | |||
FOLLOW[sym]=set() | |||
changed = True | |||
while changed: | |||
changed = False | |||
for rule in self.rules: | |||
if set(rule.expansion) <= NULLABLE: | |||
if update_set(NULLABLE, {rule.origin}): | |||
changed = True | |||
for i, sym in enumerate(rule.expansion): | |||
if set(rule.expansion[:i]) <= NULLABLE: | |||
if update_set(FIRST[rule.origin], FIRST[sym]): | |||
changed = True | |||
                    if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE:
if update_set(FOLLOW[sym], FOLLOW[rule.origin]): | |||
changed = True | |||
for j in range(i+1, len(rule.expansion)): | |||
if set(rule.expansion[i+1:j]) <= NULLABLE: | |||
if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]): | |||
changed = True | |||
self.FOLLOW = FOLLOW | |||
def analyze(self): | |||
self._calc() | |||
self.states = {} | |||
def step(state): | |||
lookahead = defaultdict(list) | |||
sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied) | |||
for rp in sat: | |||
for term in self.FOLLOW.get(rp.rule.origin, ()): | |||
lookahead[term].append(('reduce', rp.rule)) | |||
d = classify(unsat, lambda rp: rp.next) | |||
for sym, rps in d.items(): | |||
rps = {rp.advance(sym) for rp in rps} | |||
for rp in set(rps): | |||
if not rp.is_satisfied and not is_terminal(rp.next): | |||
rps |= self.expand_rule(rp.next) | |||
lookahead[sym].append(('shift', fzset(rps))) | |||
yield fzset(rps) | |||
for k, v in lookahead.items(): | |||
if len(v) > 1: | |||
for x in v: | |||
                        # XXX resolving shift/reduce conflicts in favor of shift, like PLY does
                        # TODO: emit a proper warning when this happens
if x[0] == 'shift': | |||
lookahead[k] = [x] | |||
for k, v in lookahead.items(): | |||
assert len(v) == 1, ("Collision", k, v) | |||
self.states[state] = {k:v[0] for k, v in lookahead.items()} | |||
x = list(bfs([self.init_state], step)) | |||
# -- | |||
self.enum = list(self.states) | |||
self.enum_rev = {s:i for i,s in enumerate(self.enum)} | |||
self.states_idx = {} | |||
for s, la in self.states.items(): | |||
la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift' else v for k,v in la.items()} | |||
self.states_idx[ self.enum_rev[s] ] = la | |||
self.init_state_idx = self.enum_rev[self.init_state] | |||
@@ -0,0 +1,217 @@ | |||
from __future__ import absolute_import
import os   # used below to derive a grammar cache file name
from .utils import STRING_TYPE
from .load_grammar import load_grammar | |||
from .tree import Tree, Transformer | |||
from .lexer import Lexer | |||
from .grammar_analysis import GrammarAnalyzer, is_terminal | |||
from . import parser, earley | |||
class LarkOptions(object): | |||
"""Specifies the options for Lark | |||
""" | |||
OPTIONS_DOC = """ | |||
parser - Which parser engine to use ("earley" or "lalr". Default: "earley") | |||
Note: Both will use Lark's lexer. | |||
transformer - Applies the transformer to every parse tree | |||
debug - Affects verbosity (default: False) | |||
only_lex - Don't build a parser. Useful for debugging (default: False) | |||
keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
cache_grammar - Cache the Lark grammar (default: False)
ignore_postproc - Don't call the post-processing function (default: False) | |||
""" | |||
__doc__ += OPTIONS_DOC | |||
def __init__(self, options_dict): | |||
o = dict(options_dict) | |||
self.debug = bool(o.pop('debug', False)) | |||
self.only_lex = bool(o.pop('only_lex', False)) | |||
self.keep_all_tokens = bool(o.pop('keep_all_tokens', False)) | |||
self.keep_empty_trees = bool(o.pop('keep_empty_trees', True)) | |||
self.tree_class = o.pop('tree_class', Tree) | |||
self.cache_grammar = o.pop('cache_grammar', False) | |||
self.ignore_postproc = bool(o.pop('ignore_postproc', False)) | |||
self.parser = o.pop('parser', 'earley') | |||
self.transformer = o.pop('transformer', None) | |||
if o: | |||
raise ValueError("Unknown options: %s" % o.keys()) | |||
class Callback(object): | |||
pass | |||
class RuleTreeToText(Transformer): | |||
def expansions(self, *x): | |||
return x | |||
def expansion(self, *symbols): | |||
return [sym.value for sym in symbols], None | |||
def alias(self, (expansion, _alias), alias): | |||
assert _alias is None, (alias, expansion, '-', _alias) | |||
return expansion, alias.value | |||
def create_rule_handler(expansion, usermethod): | |||
to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion) | |||
if not (is_terminal(sym) and sym.startswith('_'))] | |||
def _build_ast(match): | |||
children = [] | |||
for i, to_expand in to_include: | |||
if to_expand: | |||
children += match[i].children | |||
else: | |||
children.append(match[i]) | |||
return usermethod(children) | |||
return _build_ast | |||
def create_expand1_tree_builder_function(tree_builder): | |||
def f(children): | |||
if len(children) == 1: | |||
return children[0] | |||
else: | |||
return tree_builder(children) | |||
return f | |||
def create_rule_inline(f): | |||
def _f(children): | |||
return f(*children) | |||
return _f | |||
class LALR: | |||
def build_parser(self, rules, callback): | |||
ga = GrammarAnalyzer(rules) | |||
ga.analyze() | |||
return parser.Parser(ga, callback) | |||
class Earley: | |||
@staticmethod | |||
def _process_expansion(x): | |||
return [{'literal': s} if is_terminal(s) else s for s in x] | |||
def build_parser(self, rules, callback): | |||
rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules] | |||
return EarleyParser(earley.Parser(rules, 'start')) | |||
class EarleyParser: | |||
def __init__(self, parser): | |||
self.parser = parser | |||
def parse(self, text): | |||
res = self.parser.parse(text) | |||
        assert len(res) == 1, 'Ambiguous parse! Not handled yet'
return res[0] | |||
class Lark: | |||
def __init__(self, grammar, **options): | |||
""" | |||
grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax) | |||
options : keyword options controlling various aspects of Lark (see OPTIONS below)
""" | |||
self.options = LarkOptions(options) | |||
# Some, but not all file-like objects have a 'name' attribute | |||
try: | |||
source = grammar.name | |||
except AttributeError: | |||
source = '<string>' | |||
cache_file = "larkcache_%s" % str(hash(grammar)%(2**32)) | |||
else: | |||
cache_file = "larkcache_%s" % os.path.basename(source) | |||
# Drain file-like objects to get their contents | |||
try: | |||
read = grammar.read | |||
except AttributeError: | |||
pass | |||
else: | |||
grammar = read() | |||
assert isinstance(grammar, STRING_TYPE) | |||
if self.options.cache_grammar: | |||
raise NotImplementedError("Not available yet") | |||
self.tokens, self.rules = load_grammar(grammar) | |||
self.lexer = self._build_lexer() | |||
if not self.options.only_lex: | |||
self.parser_engine = { | |||
'lalr': LALR, | |||
'earley': Earley, | |||
}[self.options.parser]() | |||
self.parser = self._build_parser() | |||
def _build_lexer(self): | |||
ignore_tokens = [] | |||
tokens = {} | |||
for name, (value, flags) in self.tokens.items(): | |||
if 'ignore' in flags: | |||
ignore_tokens.append(name) | |||
tokens[name] = value | |||
return Lexer(tokens.items(), {}, ignore=ignore_tokens) | |||
def _build_parser(self): | |||
transformer = self.options.transformer | |||
callback = Callback() | |||
rules = [] | |||
rule_tree_to_text = RuleTreeToText() | |||
for origin, tree in self.rules.items(): | |||
for expansion, alias in rule_tree_to_text.transform(tree): | |||
if alias and origin.startswith('_'): | |||
raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin) | |||
expand1 = origin.startswith('?') | |||
inline_args = origin.startswith('*') or (alias and alias.startswith('*')) | |||
_origin = origin.lstrip('?*') | |||
if alias: | |||
alias = alias.lstrip('*') | |||
_alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion)) | |||
assert not hasattr(callback, _alias) | |||
f = getattr(transformer, alias or _origin, None) | |||
if f is None: | |||
if alias: | |||
f = self._create_tree_builder_function(alias) | |||
else: | |||
f = self._create_tree_builder_function(_origin) | |||
if expand1: | |||
f = create_expand1_tree_builder_function(f) | |||
else: | |||
if inline_args: | |||
f = create_rule_inline(f) | |||
alias_handler = create_rule_handler(expansion, f) | |||
setattr(callback, _alias, alias_handler) | |||
rules.append((_origin, expansion, _alias)) | |||
return self.parser_engine.build_parser(rules, callback) | |||
__init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC | |||
def _create_tree_builder_function(self, name): | |||
tree_class = self.options.tree_class | |||
def f(children): | |||
return tree_class(name, children) | |||
return f | |||
def lex(self, text): | |||
return self.lexer.lex(text) | |||
def parse(self, text): | |||
assert not self.options.only_lex | |||
l = list(self.lex(text)) | |||
return self.parser.parse(l) | |||
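# Usage sketch (an annotation, not part of the original file): the grammar syntax
# follows the calc/json examples; only_lex=True skips parser construction, so this
# exercises just load_grammar and the Lexer.
def _lark_lex_sketch():
    toy_grammar = r"""
    start: WORD+
    WORD: /\w+/
    WS.ignore: /[ \t]+/
    """
    lark_inst = Lark(toy_grammar, only_lex=True)
    return [t.type for t in lark_inst.lex("lark is a parser")]   # 4 WORD tokens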
@@ -0,0 +1,84 @@ | |||
## Lexer Implementation | |||
from utils import Str | |||
class LexError(Exception): | |||
pass | |||
class Token(Str): | |||
def __new__(cls, type, value, pos_in_stream=None): | |||
inst = Str.__new__(cls, value) | |||
inst.type = type | |||
inst.pos_in_stream = pos_in_stream | |||
inst.value = value | |||
return inst | |||
# class Token(object): | |||
# def __init__(self, type, value, lexpos): | |||
# self.type = type | |||
# self.value = value | |||
# self.lexpos = lexpos | |||
def __repr__(self): | |||
return 'Token(%s, %s, %s)' % (self.type, self.value, self.pos_in_stream) | |||
class Regex: | |||
def __init__(self, pattern, flags=()): | |||
self.pattern = pattern | |||
self.flags = flags | |||
import re | |||
LIMIT = 50  # Python's re module caps the number of named groups per pattern, so we compile the token list in chunks
class Lexer(object): | |||
def __init__(self, tokens, callbacks, ignore=()): | |||
self.ignore = ignore | |||
# Sanitization | |||
token_names = {t[0] for t in tokens} | |||
for t in tokens: | |||
try: | |||
re.compile(t[1]) | |||
            except re.error:
                raise LexError("Cannot compile token %s: %s" % t)
assert all(t in token_names for t in ignore) | |||
# Init | |||
self.tokens = tokens | |||
self.callbacks = callbacks | |||
self.tokens.sort(key=lambda x:len(x[1]), reverse=True) | |||
self.mres = [] | |||
self.name_from_index = [] | |||
x = tokens | |||
while x: | |||
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in x[:LIMIT])) | |||
self.mres.append(mre) | |||
self.name_from_index.append( {i:n for n,i in mre.groupindex.items()} ) | |||
x = x[LIMIT:] | |||
def lex(self, stream): | |||
lex_pos = 0 | |||
while True: | |||
i = 0 | |||
for mre in self.mres: | |||
m = mre.match(stream, lex_pos) | |||
if m: | |||
value = m.group(0) | |||
type_ = self.name_from_index[i][m.lastindex] | |||
t = Token(type_, value, lex_pos) | |||
if t.type in self.callbacks: | |||
self.callbacks[t.type](t) | |||
if t.type not in self.ignore: | |||
yield t | |||
lex_pos += len(value) | |||
break | |||
i += 1 | |||
else: | |||
if lex_pos < len(stream): | |||
context = stream[lex_pos:lex_pos+5] | |||
raise LexError("No token defined for: '%s' in %s" % (stream[lex_pos], context)) | |||
break | |||
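# Usage sketch (an annotation, not part of the original file): tokens are
# (name, regexp) pairs; names listed in `ignore` are matched but not yielded.
def _lexer_usage_sketch():
    lexer = Lexer([('NUM', r'\d+'), ('PLUS', r'\+'), ('WS', r'[ ]+')], {}, ignore=['WS'])
    return [(t.type, t.value) for t in lexer.lex('1 + 2')]
    # expected: [('NUM', '1'), ('PLUS', '+'), ('NUM', '2')]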
@@ -0,0 +1,358 @@ | |||
import re | |||
from lexer import Lexer, Token | |||
from grammar_analysis import GrammarAnalyzer | |||
from parser import Parser | |||
from tree import Tree as T, Transformer, Visitor | |||
_TOKEN_NAMES = { | |||
':' : 'COLON', | |||
',' : 'COMMA', | |||
';' : 'SEMICOLON', | |||
'+' : 'PLUS', | |||
'-' : 'MINUS', | |||
'*' : 'STAR', | |||
'/' : 'SLASH', | |||
'|' : 'VBAR', | |||
'!' : 'BANG', | |||
'?' : 'QMARK', | |||
'#' : 'HASH', | |||
'$' : 'DOLLAR', | |||
'&' : 'AMPERSAND', | |||
'<' : 'LESSTHAN', | |||
'>' : 'MORETHAN', | |||
'=' : 'EQUAL', | |||
'.' : 'DOT', | |||
'%' : 'PERCENT', | |||
'`' : 'BACKQUOTE', | |||
'^' : 'CIRCUMFLEX', | |||
'"' : 'DBLQUOTE', | |||
'\'' : 'QUOTE', | |||
'~' : 'TILDE', | |||
'@' : 'AT', | |||
'(' : 'LPAR', | |||
')' : 'RPAR', | |||
'{' : 'LBRACE', | |||
'}' : 'RBRACE', | |||
'[' : 'LSQB', | |||
']' : 'RSQB', | |||
} | |||
# Grammar Parser | |||
TOKENS = { | |||
'LPAR': '\(', | |||
'RPAR': '\)', | |||
'LBRA': '\[', | |||
'RBRA': '\]', | |||
'OP': '[+*?]', | |||
'COLON': ':', | |||
'OR': '\|', | |||
'DOT': '\.', | |||
'RULE': '[_?*]?[a-z][_a-z0-9]*', | |||
'TOKEN': '_?[A-Z][_A-Z0-9]*', | |||
'STRING': r'".*?[^\\]"', | |||
'REGEXP': r"/(.|\n)*?[^\\]/", | |||
'NL': r'(\r?\n)+\s*', | |||
'WS': r'[ \t]+', | |||
'COMMENT': r'#[^\n]*\n', | |||
'TO': '->' | |||
} | |||
RULES = [ | |||
('start', ['list']), | |||
('list', ['item']), | |||
('list', ['list', 'item']), | |||
('item', ['rule']), | |||
('item', ['token']), | |||
('item', ['NL']), | |||
('rule', ['RULE', 'COLON', 'expansions', 'NL']), | |||
('expansions', ['expansion']), | |||
('expansions', ['expansions', 'OR', 'expansion']), | |||
('expansions', ['expansions', 'NL', 'OR', 'expansion']), | |||
('expansion', ['_expansion']), | |||
('expansion', ['_expansion', 'TO', 'RULE']), | |||
('_expansion', ['expr']), | |||
('_expansion', ['_expansion', 'expr']), | |||
('expr', ['atom']), | |||
('expr', ['atom', 'OP']), | |||
('atom', ['LPAR', 'expansions', 'RPAR']), | |||
('atom', ['maybe']), | |||
('atom', ['RULE']), | |||
('atom', ['TOKEN']), | |||
('atom', ['anontoken']), | |||
('anontoken', ['tokenvalue']), | |||
('maybe', ['LBRA', 'expansions', 'RBRA']), | |||
('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']), | |||
('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']), | |||
('tokenvalue', ['REGEXP']), | |||
('tokenvalue', ['STRING']), | |||
('tokenmods', ['DOT', 'RULE']), | |||
('tokenmods', ['tokenmods', 'DOT', 'RULE']), | |||
] | |||
class SaveDefinitions(object): | |||
def __init__(self): | |||
self.rules = {} | |||
self.tokens = {} | |||
self.i = 0 | |||
def atom__3(self, _1, value, _2): | |||
return value | |||
def atom__1(self, value): | |||
return value | |||
def expr__1(self, expr): | |||
return expr | |||
def expr(self, *x): | |||
return T('expr', x) | |||
def expansion__1(self, expansion): | |||
return expansion | |||
def expansion__3(self, expansion, _, alias): | |||
return T('alias', [expansion, alias]) | |||
def _expansion(self, *x): | |||
return T('expansion', x) | |||
def expansions(self, *x): | |||
items = [i for i in x if isinstance(i, T)] | |||
return T('expansions', items) | |||
def maybe(self, _1, expr, _2): | |||
return T('expr', [expr, Token('OP', '?', -1)]) | |||
def rule(self, name, _1, expansion, _2): | |||
name = name.value | |||
if name in self.rules: | |||
raise ValueError("Rule '%s' defined more than once" % name) | |||
self.rules[name] = expansion | |||
def token(self, *x): | |||
name = x[0].value | |||
if name in self.tokens: | |||
raise ValueError("Token '%s' defined more than once" % name) | |||
if len(x) == 4: | |||
self.tokens[name] = x[2][1], [] | |||
else: | |||
self.tokens[name] = x[3][1], x[1].children | |||
def tokenvalue(self, tokenvalue): | |||
value = tokenvalue.value[1:-1] | |||
if tokenvalue.type == 'STRING': | |||
value = re.escape(value) | |||
return tokenvalue, value | |||
def anontoken(self, (token, value)): | |||
if token.type == 'STRING': | |||
try: | |||
token_name = _TOKEN_NAMES[token.value[1:-1]] | |||
except KeyError: | |||
if value.isalnum() and value[0].isalpha(): | |||
token_name = value.upper() | |||
else: | |||
token_name = 'ANONSTR_%d' % self.i | |||
self.i += 1 | |||
token_name = '__' + token_name | |||
elif token.type == 'REGEXP': | |||
token_name = 'ANONRE_%d' % self.i | |||
self.i += 1 | |||
else: | |||
            assert False, token
if token_name not in self.tokens: | |||
self.tokens[token_name] = value, [] | |||
return Token('TOKEN', token_name, -1) | |||
def tokenmods__2(self, _, rule): | |||
return T('tokenmods', [rule.value]) | |||
def tokenmods__3(self, tokenmods, _, rule): | |||
return T('tokenmods', tokenmods.children + [rule.value]) | |||
def start(self, *x): pass | |||
def list(self, *x): pass | |||
def item(self, *x): pass | |||
class EBNF_to_BNF(Transformer): | |||
def __init__(self): | |||
self.new_rules = {} | |||
self.prefix = 'anon' | |||
self.i = 0 | |||
def _add_recurse_rule(self, type_, expr): | |||
new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) | |||
self.i += 1 | |||
t = Token('RULE', new_name, -1) | |||
self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]) | |||
return t | |||
def expr(self, rule, op): | |||
if op.value == '?': | |||
return T('expansions', [rule, T('expansion', [])]) | |||
elif op.value == '+': | |||
# a : b c+ d | |||
# --> | |||
# a : b _c d | |||
# _c : _c c | c; | |||
return self._add_recurse_rule('plus', rule) | |||
elif op.value == '*': | |||
# a : b c* d | |||
# --> | |||
# a : b _c? d | |||
# _c : _c c | c; | |||
new_name = self._add_recurse_rule('star', rule) | |||
return T('expansions', [new_name, T('expansion', [])]) | |||
assert False, op | |||
class SimplifyRule_Visitor(Visitor): | |||
@staticmethod | |||
def _flatten(tree): | |||
while True: | |||
to_expand = [i for i, child in enumerate(tree.children) | |||
if isinstance(child, T) and child.data == tree.data] | |||
if not to_expand: | |||
break | |||
tree.expand_kids_by_index(*to_expand) | |||
def expansion(self, tree): | |||
# rules_list unpacking | |||
# a : b (c|d) e | |||
# --> | |||
# a : b c e | b d e | |||
# | |||
# In AST terms: | |||
# expansion(b, expansions(c, d), e) | |||
# --> | |||
# expansions( expansion(b, c, e), expansion(b, d, e) ) | |||
while True: | |||
self._flatten(tree) | |||
for i, child in enumerate(tree.children): | |||
if isinstance(child, T) and child.data == 'expansions': | |||
tree.data = 'expansions' | |||
tree.children = [self.visit(T('expansion', [option if i==j else other | |||
for j, other in enumerate(tree.children)])) | |||
for option in child.children] | |||
break | |||
else: | |||
break | |||
def alias(self, tree): | |||
rule, alias_name = tree.children | |||
if rule.data == 'expansions': | |||
aliases = [] | |||
for child in tree.children[0].children: | |||
aliases.append(T('alias', [child, alias_name])) | |||
tree.data = 'expansions' | |||
tree.children = aliases | |||
expansions = _flatten | |||
def dict_update_safe(d1, d2): | |||
for k, v in d2.iteritems(): | |||
assert k not in d1 | |||
d1[k] = v | |||
def generate_aliases(): | |||
sd = SaveDefinitions() | |||
for name, expansion in RULES: | |||
try: | |||
f = getattr(sd, "%s__%s" % (name, len(expansion))) | |||
except AttributeError: | |||
f = getattr(sd, name) | |||
yield name, expansion, f.__name__ | |||
def inline_args(f): | |||
def _f(self, args): | |||
return f(*args) | |||
return _f | |||
class GrammarLoader: | |||
def __init__(self): | |||
self.rules = list(generate_aliases()) | |||
self.ga = GrammarAnalyzer(self.rules) | |||
self.ga.analyze() | |||
self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT']) | |||
self.simplify_rule = SimplifyRule_Visitor() | |||
def _generate_parser_callbacks(self, callbacks): | |||
d = {alias: inline_args(getattr(callbacks, alias)) | |||
for _n, _x, alias in self.rules} | |||
return type('Callback', (), d)() | |||
def load_grammar(self, grammar_text): | |||
sd = SaveDefinitions() | |||
c = self._generate_parser_callbacks(sd) | |||
p = Parser(self.ga, c) | |||
p.parse( list(self.lexer.lex(grammar_text+"\n")) ) | |||
ebnf_to_bnf = EBNF_to_BNF() | |||
rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()} | |||
dict_update_safe(rules, ebnf_to_bnf.new_rules) | |||
for r in rules.values(): | |||
self.simplify_rule.visit(r) | |||
return sd.tokens, rules | |||
load_grammar = GrammarLoader().load_grammar | |||
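# Sketch (an annotation, not part of the original file): load_grammar returns
# (tokens, rules), where tokens maps each TOKEN name to (regexp_string, modifiers)
# and rules maps each rule name to an 'expansions' Tree ready for RuleTreeToText
# in lark.py. For example:
#     tokens, rules = load_grammar('start: NUM\nNUM: /\\d+/\nWS.ignore: / +/\n')
#     # tokens == {'NUM': ('\\d+', []), 'WS': (' +', ['ignore'])}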
def test(): | |||
g = """ | |||
start: add | |||
# Rules | |||
add: mul | |||
| add _add_sym mul | |||
mul: _atom | |||
| mul _add_mul _atom | |||
neg: "-" _atom | |||
_atom: neg | |||
| number | |||
| "(" add ")" | |||
# Tokens | |||
number: /[\d.]+/ | |||
_add_sym: "+" | "-" | |||
_add_mul: "*" | "/" | |||
WS.ignore: /\s+/ | |||
""" | |||
g2 = """ | |||
start: a | |||
a: "a" (b*|(c d)+) "b"? | |||
b: "b" | |||
c: "c" | |||
d: "+" | "-" | |||
""" | |||
load_grammar(g) | |||
@@ -0,0 +1,61 @@ | |||
from grammar_analysis import ACTION_SHIFT | |||
class ParseError(Exception): | |||
pass | |||
class Parser(object): | |||
def __init__(self, ga, callback, temp=False): | |||
self.ga = ga | |||
self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) | |||
for rule in ga.rules} | |||
def parse(self, seq): | |||
states_idx = self.ga.states_idx | |||
stack = [(None, self.ga.init_state_idx)] | |||
i = 0 | |||
res = None | |||
def get_action(key): | |||
state = stack[-1][1] | |||
try: | |||
return states_idx[state][key] | |||
except KeyError: | |||
expected = states_idx[state].keys() | |||
context = ' '.join(['%s(%r)' % (t.type, t.value) for t in seq[i:i+5]]) | |||
raise ParseError("Unexpected input %r.\nExpected: %s\nContext: %s" % (key, expected, context)) | |||
def reduce(rule): | |||
s = stack[-len(rule.expansion):] | |||
del stack[-len(rule.expansion):] | |||
res = self.callbacks[rule]([x[0] for x in s]) | |||
if rule.origin == 'start': | |||
return res | |||
_action, new_state = get_action(rule.origin) | |||
assert _action == ACTION_SHIFT | |||
stack.append((res, new_state)) | |||
# Main LALR-parser loop | |||
while i < len(seq): | |||
action, arg = get_action(seq[i].type) | |||
if action == ACTION_SHIFT: | |||
stack.append((seq[i], arg)) | |||
i+= 1 | |||
else: | |||
reduce(arg) | |||
while len(stack) > 1: | |||
_action, rule = get_action('$end') | |||
assert _action == 'reduce' | |||
res = reduce(rule) | |||
if res: | |||
break | |||
assert stack == [(None, self.ga.init_state_idx)], len(stack) | |||
return res | |||
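# Usage sketch (an annotation, not part of the original file): a one-rule LALR
# parse. _DemoToken is a hypothetical stand-in for lexer.Token, and 'add' is the
# rule alias looked up on the callback object.
def _parser_usage_sketch():
    from grammar_analysis import GrammarAnalyzer
    class _DemoToken(object):
        def __init__(self, type_, value):
            self.type = type_
            self.value = value
    class _Callback(object):
        def add(self, children):
            return int(children[0].value) + int(children[2].value)
    ga = GrammarAnalyzer([('start', ['NUM', 'PLUS', 'NUM'], 'add')])
    ga.analyze()
    tokens = [_DemoToken('NUM', '1'), _DemoToken('PLUS', '+'), _DemoToken('NUM', '2')]
    return Parser(ga, _Callback()).parse(tokens)   # expected: 3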
@@ -0,0 +1,83 @@ | |||
class Tree(object): | |||
def __init__(self, data, children): | |||
self.data = data | |||
self.children = list(children) | |||
def __repr__(self): | |||
return 'Tree(%s, %s)' % (self.data, self.children) | |||
def _pretty(self, level, indent_str): | |||
if len(self.children) == 1 and not isinstance(self.children[0], Tree): | |||
return [ indent_str*level, self.data, '\t', '%s' % self.children[0], '\n'] | |||
l = [ indent_str*level, self.data, '\n' ] | |||
for n in self.children: | |||
if isinstance(n, Tree): | |||
l += n._pretty(level+1, indent_str) | |||
else: | |||
l += [ indent_str*(level+1), '%s' % n, '\n' ] | |||
return l | |||
def pretty(self, indent_str=' '): | |||
return ''.join(self._pretty(0, indent_str)) | |||
def expand_kids_by_index(self, *indices): | |||
for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices | |||
kid = self.children[i] | |||
self.children[i:i+1] = kid.children | |||
# def find_path(self, pred): | |||
# if pred(self): | |||
# yield [] | |||
# else: | |||
# for i, c in enumerate(self.children): | |||
# if isinstance(c, Tree): | |||
# for path in c.find_path(pred): | |||
# yield [i] + path | |||
# def follow_path(self, path): | |||
# x = self | |||
# for step in path: | |||
# x = x.children[step] | |||
# return x | |||
# def set_at_path(self, path, value): | |||
# x = self.follow_path(path[:-1]) | |||
# x.children[path[-1]] = value | |||
def clone(self): | |||
return Tree(self.data, [c.clone() if isinstance(c, Tree) else c for c in self.children]) | |||
class Transformer(object): | |||
def transform(self, tree): | |||
items = [self.transform(c) if isinstance(c, Tree) else c for c in tree.children] | |||
try: | |||
f = getattr(self, tree.data) | |||
except AttributeError: | |||
return self.__default__(tree.data, items) | |||
else: | |||
return f(*items) | |||
def __default__(self, data, children): | |||
return Tree(data, children) | |||
class Visitor(object): | |||
def visit(self, tree): | |||
for child in tree.children: | |||
if isinstance(child, Tree): | |||
self.visit(child) | |||
f = getattr(self, tree.data, self.__default__) | |||
f(tree) | |||
return tree | |||
def __default__(self, tree): | |||
pass | |||
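# Usage sketch (an annotation, not part of the original file): a Transformer maps
# bottom-up over a Tree, dispatching on each node's `data` name; unknown names
# fall back to rebuilding the node via __default__.
def _transformer_sketch():
    class DoubleNumbers(Transformer):
        def num(self, n):
            return n * 2
    t = Tree('add', [Tree('num', [3]), Tree('num', [4])])
    return DoubleNumbers().transform(t)   # Tree(add, [6, 8])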
@@ -0,0 +1,51 @@ | |||
from collections import deque | |||
class fzset(frozenset): | |||
def __repr__(self): | |||
return '{%s}' % ', '.join(map(repr, self)) | |||
def classify_bool(seq, pred): | |||
true_elems = [] | |||
false_elems = [] | |||
for elem in seq: | |||
if pred(elem): | |||
true_elems.append(elem) | |||
else: | |||
false_elems.append(elem) | |||
return true_elems, false_elems | |||
def classify(seq, key=None): | |||
d = {} | |||
for item in seq: | |||
k = key(item) if (key is not None) else item | |||
if k in d: | |||
d[k].append(item) | |||
else: | |||
d[k] = [item] | |||
return d | |||
def bfs(initial, expand): | |||
open_q = deque(list(initial)) | |||
visited = set(open_q) | |||
while open_q: | |||
node = open_q.popleft() | |||
yield node | |||
for next_node in expand(node): | |||
if next_node not in visited: | |||
visited.add(next_node) | |||
open_q.append(next_node) | |||
try: | |||
STRING_TYPE = basestring | |||
except NameError: # Python 3 | |||
STRING_TYPE = str | |||
Str = type(u'') | |||
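# Usage sketches (annotations, not part of the original file):
#     classify('abracadabra')                    # {'a': ['a']*5, 'b': ['b']*2, 'r': ['r']*2, 'c': ['c'], 'd': ['d']}
#     classify_bool(range(5), lambda n: n % 2)   # ([1, 3], [0, 2, 4])
#     list(bfs(['a'], lambda s: [s + 'x'] if len(s) < 3 else []))   # ['a', 'ax', 'axx']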