@@ -0,0 +1,148 @@
"My name is Earley"

from .utils import classify

class MatchFailed(object):
    pass

class AbortParseMatch(Exception):
    pass


class Rule(object):
    def __init__(self, name, symbols, postprocess):
        self.name = name
        self.symbols = symbols
        self.postprocess = postprocess


class State(object):
    def __init__(self, rule, expect, reference, data=None):
        self.rule = rule
        self.expect = expect
        self.reference = reference
        self.data = data or []

        self.is_complete = (self.expect == len(self.rule.symbols))
        if not self.is_complete:
            self.expect_symbol = self.rule.symbols[self.expect]
            self.is_literal = isinstance(self.expect_symbol, dict)
            if self.is_literal:
                self.expect_symbol = self.expect_symbol['literal']
                assert isinstance(self.expect_symbol, (str, unicode)), self.expect_symbol

    def next_state(self, data):
        return State(self.rule, self.expect+1, self.reference, self.data + [data])

    def consume_terminal(self, inp):
        if not self.is_complete and self.is_literal:
            # PORT: the original (nearley) tests a regexp here; this port
            # matches against the token type instead
            if self.expect_symbol == inp.type:
                return self.next_state(inp)

    def consume_nonterminal(self, inp):
        if not self.is_complete and not self.is_literal:
            if self.expect_symbol == inp:
                return self.next_state(inp)

    def process(self, location, ind, table, rules, added_rules):
        if self.is_complete:
            # Completed a rule
            if self.rule.postprocess:
                try:
                    self.data = self.rule.postprocess(self.data)
                except AbortParseMatch:
                    self.data = MatchFailed

            if self.data is not MatchFailed:
                for s in table[self.reference]:
                    x = s.consume_nonterminal(self.rule.name)
                    if x:
                        x.data[-1] = self.data
                        x.epsilon_closure(location, ind, table)
        else:
            exp = self.rule.symbols[self.expect]
            if isinstance(exp, dict):
                return

            for r in rules[exp]:
                assert r.name == exp
                if r not in added_rules:
                    if r.symbols:
                        added_rules.add(r)
                        State(r, 0, location).epsilon_closure(location, ind, table)
                    else:
                        # Empty rule
                        new_copy = self.consume_nonterminal(r.name)
                        if r.postprocess:
                            new_copy.data[-1] = r.postprocess([])
                        else:
                            new_copy.data[-1] = []

                        new_copy.epsilon_closure(location, ind, table)

    def epsilon_closure(self, location, ind, table, result=None):
        col = table[location]
        if not result:
            result = col
        result.append(self)

        if not self.is_complete:
            for i in xrange(ind):
                state = col[i]
                if state.is_complete and state.reference == location:
                    x = self.consume_nonterminal(state.rule.name)
                    if x:
                        x.data[-1] = state.data
                        x.epsilon_closure(location, ind, table)


class Parser(object):
    def __init__(self, rules, start=None):
        self.table = [[]]
        self.rules = [Rule(r['name'], r['symbols'], r.get('postprocess', None)) for r in rules]
        self.rules_by_name = classify(self.rules, lambda r: r.name)
        self.start = start or self.rules[0].name

        initial_rules = set(self.rules_by_name[self.start])
        self.table[0] += [State(r, 0, 0) for r in initial_rules]
        self.advance_to(0, initial_rules)

        self.current = 0

    def advance_to(self, n, added_rules):
        for w, s in enumerate(self.table[n]):
            s.process(n, w, self.table, self.rules_by_name, added_rules)

    def parse(self, chunk):
        chunk_pos = -1
        for chunk_pos, chunk_item in enumerate(chunk):
            self.table.append([])

            for s in self.table[self.current + chunk_pos]:
                x = s.consume_terminal(chunk_item)
                if x:
                    self.table[self.current + chunk_pos + 1].append(x)

            added_rules = set()
            self.advance_to(self.current + chunk_pos + 1, added_rules)

            if not self.table[-1]:
                # Tokens only carry a stream position (see lexer.Token), not line/column
                raise Exception('Error at position %s: %r' % (chunk_item.pos_in_stream, chunk_item))

        # Advance past the consumed chunk. chunk_pos holds the *index* of the
        # last item, so add 1 (the -1 start handles an empty chunk).
        self.current += chunk_pos + 1
        return list(self.finish())

    def finish(self):
        for t in self.table[-1]:
            if (t.rule.name == self.start
                and t.expect == len(t.rule.symbols)
                and t.reference == 0
                and t.data != MatchFailed):
                yield t.data
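
# A minimal usage sketch (hypothetical rules; not part of the original file).
# It mirrors what lark.py's Earley engine builds: terminals are
# {'literal': TOKEN_TYPE} dicts, and input items only need a .type attribute
# (plus .pos_in_stream for error reporting), like lexer.Token.
#
#   class Tok(object):   # stand-in for lexer.Token
#       def __init__(self, type_, value, pos=0):
#           self.type, self.value, self.pos_in_stream = type_, value, pos
#
#   rules = [
#       {'name': 'start', 'symbols': ['ab'], 'postprocess': None},
#       {'name': 'ab',
#        'symbols': [{'literal': 'A'}, {'literal': 'B'}],
#        'postprocess': lambda data: ''.join(t.value for t in data)},
#   ]
#   print(Parser(rules, 'start').parse([Tok('A', 'a'), Tok('B', 'b', 1)]))
#   # -> [['ab']]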
@@ -0,0 +1,59 @@
from lark.tree import Transformer
from lark.lark import Lark

calc_grammar = r"""
    ?start: sum
          | NAME "=" sum        -> *assign_var

    ?sum: product
        | sum "+" product       -> *add
        | sum "-" product       -> *sub

    ?product: atom
            | product "*" atom  -> *mul
            | product "/" atom  -> *div

    ?atom: /[\d.]+/             -> *number
         | "-" atom             -> *neg
         | NAME                 -> *var
         | "(" sum ")"

    NAME: /\w+/
    WS.ignore: /\s+/
"""

class CalculateTree(Transformer):
    from operator import add, sub, mul, div, neg
    number = float

    def __init__(self):
        self.vars = {}

    def assign_var(self, name, value):
        self.vars[name] = value
        return value

    def var(self, name):
        return self.vars[name]


calc_parser = Lark(calc_grammar, parser='lalr', transformer=CalculateTree())
calc = calc_parser.parse

def main():
    while True:
        try:
            s = raw_input('> ')
        except EOFError:
            break
        print(calc(s))

def test():
    print(calc("a = 1+2"))
    print(calc("1+a*-3"))

if __name__ == '__main__':
    test()
    # main()
@@ -0,0 +1,62 @@
import sys

from lark.lark import Lark
from lark.tree import Transformer

json_grammar = r"""
    ?start: value

    ?value: object
          | array
          | string
          | number
          | "true"  -> *true
          | "false" -> *false
          | "null"  -> *null

    array : "[" [value ("," value)*] "]"
    object : "{" [pair ("," pair)*] "}"
    pair : string ":" value

    *number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
    *string : /".*?(?<!\\)"/

    WS.ignore.newline: /[ \t\n]+/
"""

class TreeToJson(Transformer):
    def string(self, s):
        return s[1:-1]

    array = list
    pair = tuple
    object = dict
    number = float

    null = lambda self: None
    true = lambda self: True
    false = lambda self: False


json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())

def test():
    test_json = '''
        {
            "empty_object" : {},
            "empty_array"  : [],
            "booleans"     : { "YES" : true, "NO" : false },
            "numbers"      : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
            "strings"      : [ "This", [ "And" , "That" ] ],
            "nothing"      : null
        }
    '''

    j = json_parser.parse(test_json)
    print(j)
    import json
    assert j == json.loads(test_json)

if __name__ == '__main__':
    test()
    if len(sys.argv) > 1:
        # Optionally parse a JSON file given on the command line
        with open(sys.argv[1]) as f:
            print(json_parser.parse(f.read()))
@@ -0,0 +1,207 @@
from collections import defaultdict, deque

from .utils import classify, classify_bool, bfs, fzset

ACTION_SHIFT = 0

class GrammarError(Exception):
    pass

def is_terminal(sym):
    return sym.isupper() or sym[0] == '$'


class Rule(object):
    """
        origin : a symbol
        expansion : a list of symbols
    """
    def __init__(self, origin, expansion, alias=None):
        assert expansion, "No support for empty rules"
        self.origin = origin
        self.expansion = expansion
        self.alias = alias

    def __repr__(self):
        return '<%s : %s>' % (self.origin, ' '.join(self.expansion))

class RulePtr(object):
    def __init__(self, rule, index):
        assert isinstance(rule, Rule)
        assert index <= len(rule.expansion)
        self.rule = rule
        self.index = index

    def __repr__(self):
        before = self.rule.expansion[:self.index]
        after = self.rule.expansion[self.index:]
        return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after))

    @property
    def next(self):
        return self.rule.expansion[self.index]

    def advance(self, sym):
        assert self.next == sym
        return RulePtr(self.rule, self.index+1)

    @property
    def is_satisfied(self):
        return self.index == len(self.rule.expansion)

    def __eq__(self, other):
        return self.rule == other.rule and self.index == other.index
    def __hash__(self):
        return hash((self.rule, self.index))


def pairs(lst):
    return zip(lst[:-1], lst[1:])

def update_set(set1, set2):
    "Update set1 in place with set2; return True if set1 changed"
    copy = set(set1)
    set1 |= set2
    return set1 != copy


class GrammarAnalyzer(object):
    def __init__(self, rule_tuples):
        rule_tuples = list(rule_tuples)
        rule_tuples.append(('$root', ['start', '$end']))
        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]

        self.rules = set()
        self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
        for origin, exp, alias in rule_tuples:
            r = Rule(origin, exp, alias)
            self.rules.add(r)
            self.rules_by_origin[origin].append(r)

        for r in self.rules:
            for sym in r.expansion:
                if not (is_terminal(sym) or sym in self.rules_by_origin):
                    raise GrammarError("Using an undefined rule: %s" % sym)

        self.init_state = self.expand_rule('start')

    def expand_rule(self, rule):
        "Returns all init_ptrs accessible by rule (recursive)"
        init_ptrs = set()
        def _expand_rule(rule):
            assert not is_terminal(rule)

            for r in self.rules_by_origin[rule]:
                init_ptr = RulePtr(r, 0)
                init_ptrs.add(init_ptr)

                new_r = init_ptr.next
                if not is_terminal(new_r):
                    yield new_r

        _ = list(bfs([rule], _expand_rule))

        return fzset(init_ptrs)

    def _first(self, r):
        if is_terminal(r):
            return {r}
        else:
            return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)}

    def _calc(self):
        """Calculate FOLLOW sets.

        Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
        symbols = {sym for rule in self.rules for sym in rule.expansion}
        symbols.add('$root')    # what about other unused rules?

        # foreach grammar rule X ::= Y(1) ... Y(k)
        #   if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
        #     NULLABLE = NULLABLE union {X}
        #   for i = 1 to k
        #     if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
        #       FIRST(X) = FIRST(X) union FIRST(Y(i))
        #     if i=k or {Y(i+1),...,Y(k)} subset of NULLABLE then
        #       FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
        #     for j = i+1 to k
        #       if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
        #         FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
        # until none of NULLABLE,FIRST,FOLLOW changed in last iteration

        NULLABLE = set()
        FIRST = {}
        FOLLOW = {}
        for sym in symbols:
            FIRST[sym] = {sym} if is_terminal(sym) else set()
            FOLLOW[sym] = set()

        changed = True
        while changed:
            changed = False

            for rule in self.rules:
                if set(rule.expansion) <= NULLABLE:
                    if update_set(NULLABLE, {rule.origin}):
                        changed = True

                for i, sym in enumerate(rule.expansion):
                    if set(rule.expansion[:i]) <= NULLABLE:
                        if update_set(FIRST[rule.origin], FIRST[sym]):
                            changed = True
                    # Per the algorithm above, the suffix that must be nullable
                    # starts after Y(i), i.e. at i+1 (not at i)
                    if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE:
                        if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
                            changed = True
                    for j in range(i+1, len(rule.expansion)):
                        if set(rule.expansion[i+1:j]) <= NULLABLE:
                            if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
                                changed = True

        self.FOLLOW = FOLLOW
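
    # A worked example (a sketch): given the augmented rules
    #     $root : start $end
    #     start : start PLUS NUM
    #     start : NUM
    # the fixpoint above yields FIRST(start) = {NUM} and
    # FOLLOW(start) = {PLUS, $end}. NULLABLE always stays empty here,
    # since Rule.__init__ rejects empty expansions.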
    def analyze(self):
        self._calc()
        self.states = {}
        def step(state):
            lookahead = defaultdict(list)
            sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
            for rp in sat:
                for term in self.FOLLOW.get(rp.rule.origin, ()):
                    lookahead[term].append(('reduce', rp.rule))

            d = classify(unsat, lambda rp: rp.next)
            for sym, rps in d.items():
                rps = {rp.advance(sym) for rp in rps}

                for rp in set(rps):
                    if not rp.is_satisfied and not is_terminal(rp.next):
                        rps |= self.expand_rule(rp.next)

                lookahead[sym].append(('shift', fzset(rps)))
                yield fzset(rps)

            for k, v in lookahead.items():
                if len(v) > 1:
                    # XXX resolving shift/reduce conflicts into shift, like PLY does
                    # TODO: emit a proper warning
                    for x in v:
                        if x[0] == 'shift':
                            lookahead[k] = [x]

            for k, v in lookahead.items():
                assert len(v) == 1, ("Collision", k, v)

            self.states[state] = {k:v[0] for k, v in lookahead.items()}

        # Run bfs for its side effect of filling self.states
        x = list(bfs([self.init_state], step))

        # --
        # Enumerate the states and re-key the action tables by state index
        self.enum = list(self.states)
        self.enum_rev = {s:i for i,s in enumerate(self.enum)}
        self.states_idx = {}

        for s, la in self.states.items():
            la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift' else v
                  for k,v in la.items()}
            self.states_idx[self.enum_rev[s]] = la

        self.init_state_idx = self.enum_rev[self.init_state]
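
# Usage sketch (assumed API): rule tuples are (origin, expansion) or
# (origin, expansion, alias); terminals are UPPERCASE or $-prefixed
# (see is_terminal), and a 'start' rule must exist.
#
#   ga = GrammarAnalyzer([('start', ['start', 'PLUS', 'NUM']),
#                         ('start', ['NUM'])])
#   ga.analyze()
#   ga.states_idx   # {state_index: {symbol: action}}, consumed by parser.Parser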
@@ -0,0 +1,217 @@
from __future__ import absolute_import

import os

from .utils import STRING_TYPE
from .load_grammar import load_grammar
from .tree import Tree, Transformer
from .lexer import Lexer
from .grammar_analysis import GrammarAnalyzer, is_terminal
from . import parser, earley

class LarkOptions(object):
    """Specifies the options for Lark

    """
    OPTIONS_DOC = """
        parser - Which parser engine to use ("earley" or "lalr"; default: "earley")
                 Note: Both will use Lark's lexer.
        transformer - Applies the transformer to every parse tree
        debug - Affects verbosity (default: False)
        only_lex - Don't build a parser. Useful for debugging (default: False)
        keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
        tree_class - The class used to build parse trees (default: Tree)
        cache_grammar - Cache the Lark grammar (default: False)
        ignore_postproc - Don't call the post-processing function (default: False)
    """
    __doc__ += OPTIONS_DOC
    def __init__(self, options_dict):
        o = dict(options_dict)

        self.debug = bool(o.pop('debug', False))
        self.only_lex = bool(o.pop('only_lex', False))
        self.keep_all_tokens = bool(o.pop('keep_all_tokens', False))
        self.keep_empty_trees = bool(o.pop('keep_empty_trees', True))
        self.tree_class = o.pop('tree_class', Tree)
        self.cache_grammar = o.pop('cache_grammar', False)
        self.ignore_postproc = bool(o.pop('ignore_postproc', False))
        self.parser = o.pop('parser', 'earley')
        self.transformer = o.pop('transformer', None)

        if o:
            raise ValueError("Unknown options: %s" % o.keys())


class Callback(object):
    pass

class RuleTreeToText(Transformer):
    def expansions(self, *x):
        return x
    def expansion(self, *symbols):
        return [sym.value for sym in symbols], None
    def alias(self, (expansion, _alias), alias):
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value


def create_rule_handler(expansion, usermethod):
    # Skip terminals that start with an underscore ("punctuation");
    # rules that start with an underscore get expanded in place.
    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
                  if not (is_terminal(sym) and sym.startswith('_'))]

    def _build_ast(match):
        children = []
        for i, to_expand in to_include:
            if to_expand:
                children += match[i].children
            else:
                children.append(match[i])

        return usermethod(children)
    return _build_ast

def create_expand1_tree_builder_function(tree_builder):
    def f(children):
        if len(children) == 1:
            return children[0]
        else:
            return tree_builder(children)
    return f

def create_rule_inline(f):
    def _f(children):
        return f(*children)
    return _f


class LALR:
    def build_parser(self, rules, callback):
        ga = GrammarAnalyzer(rules)
        ga.analyze()
        return parser.Parser(ga, callback)

class Earley:
    @staticmethod
    def _process_expansion(x):
        return [{'literal': s} if is_terminal(s) else s for s in x]

    def build_parser(self, rules, callback):
        rules = [{'name': n,
                  'symbols': self._process_expansion(x),
                  'postprocess': getattr(callback, a)}
                 for n, x, a in rules]
        return EarleyParser(earley.Parser(rules, 'start'))

class EarleyParser:
    def __init__(self, parser):
        self.parser = parser

    def parse(self, text):
        res = self.parser.parse(text)
        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
        return res[0]


class Lark:
    def __init__(self, grammar, **options):
        """
            grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
            options : a dictionary controlling various aspects of Lark.
        """
        self.options = LarkOptions(options)

        # Some, but not all file-like objects have a 'name' attribute
        try:
            source = grammar.name
        except AttributeError:
            source = '<string>'
            cache_file = "larkcache_%s" % str(hash(grammar)%(2**32))
        else:
            cache_file = "larkcache_%s" % os.path.basename(source)

        # Drain file-like objects to get their contents
        try:
            read = grammar.read
        except AttributeError:
            pass
        else:
            grammar = read()

        assert isinstance(grammar, STRING_TYPE)

        if self.options.cache_grammar:
            raise NotImplementedError("Not available yet")

        self.tokens, self.rules = load_grammar(grammar)

        self.lexer = self._build_lexer()
        if not self.options.only_lex:
            self.parser_engine = {
                'lalr': LALR,
                'earley': Earley,
            }[self.options.parser]()
            self.parser = self._build_parser()

    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

    def _build_lexer(self):
        ignore_tokens = []
        tokens = {}
        for name, (value, flags) in self.tokens.items():
            if 'ignore' in flags:
                ignore_tokens.append(name)
            tokens[name] = value
        return Lexer(tokens.items(), {}, ignore=ignore_tokens)

    def _build_parser(self):
        transformer = self.options.transformer
        callback = Callback()
        rules = []
        rule_tree_to_text = RuleTreeToText()
        for origin, tree in self.rules.items():
            for expansion, alias in rule_tree_to_text.transform(tree):
                if alias and origin.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)

                expand1 = origin.startswith('?')
                inline_args = origin.startswith('*') or (alias and alias.startswith('*'))
                _origin = origin.lstrip('?*')
                if alias:
                    alias = alias.lstrip('*')
                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))

                assert not hasattr(callback, _alias)

                f = getattr(transformer, alias or _origin, None)
                if f is None:
                    if alias:
                        f = self._create_tree_builder_function(alias)
                    else:
                        f = self._create_tree_builder_function(_origin)
                        if expand1:
                            f = create_expand1_tree_builder_function(f)
                else:
                    if inline_args:
                        f = create_rule_inline(f)

                alias_handler = create_rule_handler(expansion, f)

                setattr(callback, _alias, alias_handler)

                rules.append((_origin, expansion, _alias))

        return self.parser_engine.build_parser(rules, callback)

    def _create_tree_builder_function(self, name):
        tree_class = self.options.tree_class
        def f(children):
            return tree_class(name, children)
        return f

    def lex(self, text):
        return self.lexer.lex(text)

    def parse(self, text):
        assert not self.options.only_lex
        l = list(self.lex(text))
        return self.parser.parse(l)
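
# A minimal end-to-end sketch (hypothetical grammar; the default engine is Earley):
#
#   parser = Lark('start: HELLO\nHELLO: /hello/\n')
#   parser.parse('hello')   # returns Tree('start', [...]) holding the matched HELLO token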
@@ -0,0 +1,84 @@
## Lexer Implementation

import re

from .utils import Str

class LexError(Exception):
    pass

class Token(Str):
    def __new__(cls, type, value, pos_in_stream=None):
        inst = Str.__new__(cls, value)
        inst.type = type
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        return inst

    def __repr__(self):
        return 'Token(%s, %s, %s)' % (self.type, self.value, self.pos_in_stream)

class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags

LIMIT = 50  # Python's re caps named groups at 100 per pattern; stay well under

class Lexer(object):
    def __init__(self, tokens, callbacks, ignore=()):
        self.ignore = ignore

        # Sanitization
        token_names = {t[0] for t in tokens}
        for t in tokens:
            try:
                re.compile(t[1])
            except re.error:
                raise LexError("Cannot compile token: %s: %s" % t)
        assert all(t in token_names for t in ignore)

        # Init
        self.tokens = tokens
        self.callbacks = callbacks

        # Longer patterns first, so they win ties
        self.tokens.sort(key=lambda x: len(x[1]), reverse=True)

        # Compile the tokens into several "master" regexps, staying under
        # the named-group limit
        self.mres = []
        self.name_from_index = []
        x = tokens
        while x:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % t for t in x[:LIMIT]))
            self.mres.append(mre)
            self.name_from_index.append({i: n for n, i in mre.groupindex.items()})
            x = x[LIMIT:]

    def lex(self, stream):
        lex_pos = 0
        while True:
            for i, mre in enumerate(self.mres):
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = self.name_from_index[i][m.lastindex]
                    t = Token(type_, value, lex_pos)
                    if t.type in self.callbacks:
                        self.callbacks[t.type](t)
                    if t.type not in self.ignore:
                        yield t
                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    context = stream[lex_pos:lex_pos+5]
                    raise LexError("No token defined for: '%s' in %r" % (stream[lex_pos], context))
                break
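
# Usage sketch:
#
#   lexer = Lexer([('NUM', r'\d+'), ('PLUS', r'\+'), ('WS', r' +')], {}, ignore=['WS'])
#   [t.type for t in lexer.lex('1 + 2')]   # -> ['NUM', 'PLUS', 'NUM']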
@@ -0,0 +1,358 @@
import re

from .lexer import Lexer, Token
from .grammar_analysis import GrammarAnalyzer
from .parser import Parser
from .tree import Tree as T, Transformer, Visitor

_TOKEN_NAMES = {
    ':' : 'COLON',
    ',' : 'COMMA',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '|' : 'VBAR',
    '!' : 'BANG',
    '?' : 'QMARK',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '&' : 'AMPERSAND',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '.' : 'DOT',
    '%' : 'PERCENT',
    '`' : 'BACKQUOTE',
    '^' : 'CIRCUMFLEX',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '~' : 'TILDE',
    '@' : 'AT',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
}

# Grammar Parser -- the tokens and rules of Lark's own grammar syntax
TOKENS = {
    'LPAR': r'\(',
    'RPAR': r'\)',
    'LBRA': r'\[',
    'RBRA': r'\]',
    'OP': '[+*?]',
    'COLON': ':',
    'OR': r'\|',
    'DOT': r'\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(.|\n)*?[^\\]/",
    'NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'#[^\n]*\n',
    'TO': '->',
}

RULES = [
    ('start', ['list']),
    ('list', ['item']),
    ('list', ['list', 'item']),
    ('item', ['rule']),
    ('item', ['token']),
    ('item', ['NL']),

    ('rule', ['RULE', 'COLON', 'expansions', 'NL']),
    ('expansions', ['expansion']),
    ('expansions', ['expansions', 'OR', 'expansion']),
    ('expansions', ['expansions', 'NL', 'OR', 'expansion']),

    ('expansion', ['_expansion']),
    ('expansion', ['_expansion', 'TO', 'RULE']),

    ('_expansion', ['expr']),
    ('_expansion', ['_expansion', 'expr']),

    ('expr', ['atom']),
    ('expr', ['atom', 'OP']),

    ('atom', ['LPAR', 'expansions', 'RPAR']),
    ('atom', ['maybe']),
    ('atom', ['RULE']),
    ('atom', ['TOKEN']),
    ('atom', ['anontoken']),

    ('anontoken', ['tokenvalue']),

    ('maybe', ['LBRA', 'expansions', 'RBRA']),

    ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']),
    ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']),
    ('tokenvalue', ['REGEXP']),
    ('tokenvalue', ['STRING']),
    ('tokenmods', ['DOT', 'RULE']),
    ('tokenmods', ['tokenmods', 'DOT', 'RULE']),
]

class SaveDefinitions(object):
    # Callbacks for the grammar-parser. A method named "name__N" handles
    # the expansion of rule "name" that has N symbols (see generate_aliases).
    def __init__(self):
        self.rules = {}
        self.tokens = {}
        self.i = 0

    def atom__3(self, _1, value, _2):
        return value
    def atom__1(self, value):
        return value

    def expr__1(self, expr):
        return expr
    def expr(self, *x):
        return T('expr', x)

    def expansion__1(self, expansion):
        return expansion
    def expansion__3(self, expansion, _, alias):
        return T('alias', [expansion, alias])
    def _expansion(self, *x):
        return T('expansion', x)

    def expansions(self, *x):
        items = [i for i in x if isinstance(i, T)]
        return T('expansions', items)

    def maybe(self, _1, expr, _2):
        return T('expr', [expr, Token('OP', '?', -1)])

    def rule(self, name, _1, expansion, _2):
        name = name.value
        if name in self.rules:
            raise ValueError("Rule '%s' defined more than once" % name)
        self.rules[name] = expansion

    def token(self, *x):
        name = x[0].value
        if name in self.tokens:
            raise ValueError("Token '%s' defined more than once" % name)
        if len(x) == 4:
            self.tokens[name] = x[2][1], []
        else:
            self.tokens[name] = x[3][1], x[1].children

    def tokenvalue(self, tokenvalue):
        value = tokenvalue.value[1:-1]
        if tokenvalue.type == 'STRING':
            value = re.escape(value)
        return tokenvalue, value

    def anontoken(self, (token, value)):
        if token.type == 'STRING':
            try:
                token_name = _TOKEN_NAMES[token.value[1:-1]]
            except KeyError:
                if value.isalnum() and value[0].isalpha():
                    token_name = value.upper()
                else:
                    token_name = 'ANONSTR_%d' % self.i
                    self.i += 1
            token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, token

        if token_name not in self.tokens:
            self.tokens[token_name] = value, []

        return Token('TOKEN', token_name, -1)

    def tokenmods__2(self, _, rule):
        return T('tokenmods', [rule.value])
    def tokenmods__3(self, tokenmods, _, rule):
        return T('tokenmods', tokenmods.children + [rule.value])

    def start(self, *x): pass
    def list(self, *x): pass
    def item(self, *x): pass


class EBNF_to_BNF(Transformer):
    def __init__(self):
        self.new_rules = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]),
                                                    T('expansion', [t, expr])])
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op


class SimplifyRule_Visitor(Visitor):
    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #   -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i==j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten


def dict_update_safe(d1, d2):
    for k, v in d2.iteritems():
        assert k not in d1
        d1[k] = v


def generate_aliases():
    # Pair each rule of the grammar-parser with its SaveDefinitions handler
    sd = SaveDefinitions()
    for name, expansion in RULES:
        try:
            f = getattr(sd, "%s__%s" % (name, len(expansion)))
        except AttributeError:
            f = getattr(sd, name)
        yield name, expansion, f.__name__


def inline_args(f):
    def _f(self, args):
        return f(*args)
    return _f


class GrammarLoader:
    def __init__(self):
        self.rules = list(generate_aliases())
        self.ga = GrammarAnalyzer(self.rules)
        self.ga.analyze()
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
        self.simplify_rule = SimplifyRule_Visitor()

    def _generate_parser_callbacks(self, callbacks):
        d = {alias: inline_args(getattr(callbacks, alias))
             for _n, _x, alias in self.rules}
        return type('Callback', (), d)()

    def load_grammar(self, grammar_text):
        sd = SaveDefinitions()
        c = self._generate_parser_callbacks(sd)

        p = Parser(self.ga, c)
        p.parse(list(self.lexer.lex(grammar_text + "\n")))

        # Convert EBNF shortcuts (?, +, *) into plain BNF rules
        ebnf_to_bnf = EBNF_to_BNF()
        rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        return sd.tokens, rules

load_grammar = GrammarLoader().load_grammar


def test():
    g = r"""
    start: add

    # Rules
    add: mul
       | add _add_sym mul

    mul: _atom
       | mul _add_mul _atom

    neg: "-" _atom

    _atom: neg
         | number
         | "(" add ")"

    # Tokens
    number: /[\d.]+/
    _add_sym: "+" | "-"
    _add_mul: "*" | "/"

    WS.ignore: /\s+/
    """

    g2 = r"""
    start: a
    a: "a" (b*|(c d)+) "b"?
    b: "b"
    c: "c"
    d: "+" | "-"
    """

    load_grammar(g)
@@ -0,0 +1,61 @@
from .grammar_analysis import ACTION_SHIFT

class ParseError(Exception):
    pass

class Parser(object):
    def __init__(self, ga, callback):
        self.ga = ga
        self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
                          for rule in ga.rules}

    def parse(self, seq):
        states_idx = self.ga.states_idx

        stack = [(None, self.ga.init_state_idx)]
        i = 0
        res = None

        def get_action(key):
            state = stack[-1][1]
            try:
                return states_idx[state][key]
            except KeyError:
                expected = states_idx[state].keys()
                context = ' '.join(['%s(%r)' % (t.type, t.value) for t in seq[i:i+5]])
                raise ParseError("Unexpected input %r.\nExpected: %s\nContext: %s" % (key, expected, context))

        def reduce(rule):
            # Pop the rule's right-hand side off the stack, run the callback,
            # and shift over the resulting nonterminal
            s = stack[-len(rule.expansion):]
            del stack[-len(rule.expansion):]

            res = self.callbacks[rule]([x[0] for x in s])

            if rule.origin == 'start':
                return res

            _action, new_state = get_action(rule.origin)
            assert _action == ACTION_SHIFT
            stack.append((res, new_state))

        # Main LALR-parser loop
        while i < len(seq):
            action, arg = get_action(seq[i].type)

            if action == ACTION_SHIFT:
                stack.append((seq[i], arg))
                i += 1
            else:
                reduce(arg)

        # Reduce on '$end' until the start rule is completed
        while len(stack) > 1:
            _action, rule = get_action('$end')
            assert _action == 'reduce'
            res = reduce(rule)
            if res:
                break

        assert stack == [(None, self.ga.init_state_idx)], len(stack)
        return res
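
# Usage sketch (this is how LALR.build_parser in lark.py wires it up):
#
#   ga = GrammarAnalyzer(rule_tuples)
#   ga.analyze()
#   Parser(ga, callback).parse(tokens)
#
# where `callback` exposes one method per rule alias (or origin), each taking
# the list of matched children and returning the reduced value.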
@@ -0,0 +1,83 @@
class Tree(object):
    def __init__(self, data, children):
        self.data = data
        self.children = list(children)

    def __repr__(self):
        return 'Tree(%s, %s)' % (self.data, self.children)

    def _pretty(self, level, indent_str):
        if len(self.children) == 1 and not isinstance(self.children[0], Tree):
            return [indent_str*level, self.data, '\t', '%s' % self.children[0], '\n']

        l = [indent_str*level, self.data, '\n']
        for n in self.children:
            if isinstance(n, Tree):
                l += n._pretty(level+1, indent_str)
            else:
                l += [indent_str*(level+1), '%s' % n, '\n']

        return l

    def pretty(self, indent_str=' '):
        return ''.join(self._pretty(0, indent_str))

    def expand_kids_by_index(self, *indices):
        for i in sorted(indices, reverse=True):  # reverse so that changing tail won't affect indices
            kid = self.children[i]
            self.children[i:i+1] = kid.children

    # def find_path(self, pred):
    #     if pred(self):
    #         yield []
    #     else:
    #         for i, c in enumerate(self.children):
    #             if isinstance(c, Tree):
    #                 for path in c.find_path(pred):
    #                     yield [i] + path

    # def follow_path(self, path):
    #     x = self
    #     for step in path:
    #         x = x.children[step]
    #     return x

    # def set_at_path(self, path, value):
    #     x = self.follow_path(path[:-1])
    #     x.children[path[-1]] = value

    def clone(self):
        return Tree(self.data, [c.clone() if isinstance(c, Tree) else c for c in self.children])


class Transformer(object):
    def transform(self, tree):
        items = [self.transform(c) if isinstance(c, Tree) else c for c in tree.children]
        try:
            f = getattr(self, tree.data)
        except AttributeError:
            return self.__default__(tree.data, items)
        else:
            return f(*items)

    def __default__(self, data, children):
        return Tree(data, children)


class Visitor(object):
    def visit(self, tree):
        for child in tree.children:
            if isinstance(child, Tree):
                self.visit(child)

        f = getattr(self, tree.data, self.__default__)
        f(tree)
        return tree

    def __default__(self, tree):
        pass
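
# Usage sketch:
#
#   class EvalAdd(Transformer):
#       def add(self, a, b):
#           return a + b
#
#   EvalAdd().transform(Tree('add', [1, 2]))   # -> 3
#
# Transformer.transform rebuilds the tree bottom-up, dispatching on tree.data;
# Visitor.visit walks the same way but mutates the tree in place.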
@@ -0,0 +1,51 @@
from collections import deque

class fzset(frozenset):
    def __repr__(self):
        return '{%s}' % ', '.join(map(repr, self))


def classify_bool(seq, pred):
    true_elems = []
    false_elems = []

    for elem in seq:
        if pred(elem):
            true_elems.append(elem)
        else:
            false_elems.append(elem)

    return true_elems, false_elems


def classify(seq, key=None):
    d = {}
    for item in seq:
        k = key(item) if (key is not None) else item
        if k in d:
            d[k].append(item)
        else:
            d[k] = [item]

    return d


def bfs(initial, expand):
    open_q = deque(list(initial))
    visited = set(open_q)
    while open_q:
        node = open_q.popleft()
        yield node
        for next_node in expand(node):
            if next_node not in visited:
                visited.add(next_node)
                open_q.append(next_node)


try:
    STRING_TYPE = basestring
except NameError:   # Python 3
    STRING_TYPE = str

Str = type(u'')
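
# Examples:
#
#   classify(['ab', 'ac', 'b'], key=lambda s: s[0])
#   # -> {'a': ['ab', 'ac'], 'b': ['b']}
#
#   classify_bool([1, 2, 3, 4], lambda n: n % 2)
#   # -> ([1, 3], [2, 4])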