| @@ -0,0 +1,148 @@
| "My name is Earley" | |||
| from .utils import classify | |||
| class MatchFailed(object): | |||
| pass | |||
| class AbortParseMatch(Exception): | |||
| pass | |||
| class Rule(object): | |||
| def __init__(self, name, symbols, postprocess): | |||
| self.name = name | |||
| self.symbols = symbols | |||
| self.postprocess = postprocess | |||
| class State(object): | |||
| def __init__(self, rule, expect, reference, data=None): | |||
| self.rule = rule | |||
| self.expect = expect | |||
| self.reference = reference | |||
| self.data = data or [] | |||
| self.is_complete = (self.expect == len(self.rule.symbols)) | |||
| if not self.is_complete: | |||
| self.expect_symbol = self.rule.symbols[self.expect] | |||
| self.is_literal = isinstance(self.expect_symbol, dict) | |||
| if self.is_literal: | |||
| self.expect_symbol = self.expect_symbol['literal'] | |||
| assert isinstance(self.expect_symbol, (str, unicode)), self.expect_symbol | |||
| def next_state(self, data): | |||
| return State(self.rule, self.expect+1, self.reference, self.data + [data]) | |||
| def consume_terminal(self, inp): | |||
| if not self.is_complete and self.is_literal: | |||
| # PORT: the original implementation tests a regexp here; this port compares the token's type instead
| if self.expect_symbol == inp.type: | |||
| return self.next_state(inp) | |||
| def consume_nonterminal(self, inp): | |||
| if not self.is_complete and not self.is_literal: | |||
| if self.expect_symbol == inp: | |||
| return self.next_state(inp) | |||
| def process(self, location, ind, table, rules, added_rules): | |||
| if self.is_complete: | |||
| # Completed a rule | |||
| if self.rule.postprocess: | |||
| try: | |||
| # self.data = self.rule.postprocess(self.data, self.reference) | |||
| self.data = self.rule.postprocess(self.data) | |||
| except AbortParseMatch: | |||
| self.data = MatchFailed | |||
| if self.data is not MatchFailed: | |||
| for s in table[self.reference]: | |||
| x = s.consume_nonterminal(self.rule.name) | |||
| if x: | |||
| x.data[-1] = self.data | |||
| x.epsilon_closure(location, ind, table) | |||
| else: | |||
| exp = self.rule.symbols[self.expect] | |||
| if isinstance(exp, dict): | |||
| return | |||
| for r in rules[exp]: | |||
| assert r.name == exp | |||
| if r not in added_rules: | |||
| if r.symbols: | |||
| added_rules.add(r) | |||
| State(r, 0, location).epsilon_closure(location, ind, table) | |||
| else: | |||
| # Empty rule | |||
| new_copy = self.consume_nonterminal(r.name) | |||
| if r.postprocess: | |||
| new_copy.data[-1] = r.postprocess([]) | |||
| # new_copy.data[-1] = r.postprocess([], self.reference) | |||
| else: | |||
| new_copy.data[-1] = [] | |||
| new_copy.epsilon_closure(location, ind, table) | |||
| def epsilon_closure(self, location, ind, table, result=None): | |||
| col = table[location] | |||
| if not result: | |||
| result = col | |||
| result.append(self) | |||
| if not self.is_complete: | |||
| for i in xrange(ind): | |||
| state = col[i] | |||
| if state.is_complete and state.reference == location: | |||
| x = self.consume_nonterminal(state.rule.name) | |||
| if x: | |||
| x.data[-1] = state.data | |||
| x.epsilon_closure(location, ind, table) | |||
| class Parser(object): | |||
| def __init__(self, rules, start=None): | |||
| self.table = [[]] | |||
| self.rules = [Rule(r['name'], r['symbols'], r.get('postprocess', None)) for r in rules] | |||
| self.rules_by_name = classify(self.rules, lambda r: r.name) | |||
| self.start = start or self.rules[0].name | |||
| initial_rules = set(self.rules_by_name[self.start]) | |||
| self.table[0] += [State(r, 0, 0) for r in initial_rules] | |||
| self.advance_to(0, initial_rules) | |||
| self.current = 0 | |||
| def advance_to(self, n, added_rules): | |||
| for w, s in enumerate(self.table[n]): | |||
| s.process(n, w, self.table, self.rules_by_name, added_rules) | |||
| def parse(self, chunk): | |||
| chunk_pos = 0 | |||
| for chunk_pos, chunk_item in enumerate(chunk): | |||
| self.table.append([]) | |||
| for s in self.table[self.current + chunk_pos]: | |||
| x = s.consume_terminal(chunk_item) | |||
| if x: | |||
| self.table[self.current + chunk_pos + 1].append(x) | |||
| added_rules = set() | |||
| self.advance_to(self.current + chunk_pos + 1, added_rules) | |||
| if not self.table[-1]: | |||
| raise Exception('Error at position %s: no valid parse for token %r' % (chunk[chunk_pos].pos_in_stream, chunk[chunk_pos]))
| self.current += chunk_pos | |||
| return list(self.finish()) | |||
| def finish(self): | |||
| for t in self.table[-1]: | |||
| if (t.rule.name == self.start | |||
| and t.expect == len(t.rule.symbols) | |||
| and t.reference == 0 | |||
| and t.data != MatchFailed): | |||
| yield t.data | |||
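| # Usage sketch (illustrative only; the rule below is not part of the library):
| # rules are plain dicts, terminals are written as {'literal': TOKEN_TYPE} and are
| # matched against each input token's .type attribute, so the input should be a
| # sequence of lexer.Token-like objects.
| #
| #   rules = [
| #       {'name': 'start', 'symbols': [{'literal': 'WORD'}],
| #        'postprocess': lambda data: data[0]},
| #   ]
| #   results = Parser(rules, 'start').parse(token_stream)   # list of completed parses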
| @@ -0,0 +1,59 @@
| from lark.tree import Transformer | |||
| from lark.lark import Lark | |||
| calc_grammar = """ | |||
| ?start: sum | |||
| | NAME "=" sum -> *assign_var | |||
| ?sum: product | |||
| | sum "+" product -> *add | |||
| | sum "-" product -> *sub | |||
| ?product: atom | |||
| | product "*" atom -> *mul | |||
| | product "/" atom -> *div | |||
| ?atom: /[\d.]+/ -> *number | |||
| | "-" atom -> *neg | |||
| | NAME -> *var | |||
| | "(" sum ")" | |||
| NAME: /\w+/ | |||
| WS.ignore: /\s+/ | |||
| """ | |||
| class CalculateTree(Transformer): | |||
| from operator import add, sub, mul, div, neg | |||
| number = float | |||
| def __init__(self): | |||
| self.vars = {} | |||
| def assign_var(self, name, value): | |||
| self.vars[name] = value | |||
| return value | |||
| def var(self, name): | |||
| return self.vars[name] | |||
| calc_parser = Lark(calc_grammar, parser='lalr', transformer=CalculateTree()) | |||
| calc = calc_parser.parse | |||
| def main(): | |||
| while True: | |||
| try: | |||
| s = raw_input('> ') | |||
| except EOFError: | |||
| break | |||
| print(calc(s)) | |||
| def test(): | |||
| print calc("a = 1+2") | |||
| print calc("1+a*-3") | |||
| if __name__ == '__main__': | |||
| test() | |||
| # main() | |||
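| # Expected behaviour (sketch): the transformer runs during parsing, so calc()
| # should return plain floats, e.g.
| #   calc("a = 1+2")   -> 3.0
| #   calc("1+a*-3")    -> -8.0   (uses the previously assigned a = 3.0)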
| @@ -0,0 +1,62 @@
| import sys | |||
| from lark.lark import Lark | |||
| from lark.tree import Transformer | |||
| json_grammar = r""" | |||
| ?start: value | |||
| ?value: object | |||
| | array | |||
| | string | |||
| | number | |||
| | "true" -> *true | |||
| | "false" -> *false | |||
| | "null" -> *null | |||
| array : "[" [value ("," value)*] "]" | |||
| object : "{" [pair ("," pair)*] "}" | |||
| pair : string ":" value | |||
| *number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/ | |||
| *string : /".*?(?<!\\)"/ | |||
| WS.ignore.newline: /[ \t\n]+/ | |||
| """ | |||
| class TreeToJson(Transformer): | |||
| def string(self, s): | |||
| return s[1:-1] | |||
| array = list | |||
| pair = tuple | |||
| object = dict | |||
| number = float | |||
| null = lambda self: None | |||
| true = lambda self: True | |||
| false = lambda self: False | |||
| json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson()) | |||
| def test(): | |||
| test_json = ''' | |||
| { | |||
| "empty_object" : {}, | |||
| "empty_array" : [], | |||
| "booleans" : { "YES" : true, "NO" : false }, | |||
| "numbers" : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ], | |||
| "strings" : [ "This", [ "And" , "That" ] ], | |||
| "nothing" : null | |||
| } | |||
| ''' | |||
| j = json_parser.parse(test_json) | |||
| print j | |||
| import json | |||
| assert j == json.loads(test_json) | |||
| if __name__ == '__main__': | |||
| test() | |||
| if len(sys.argv) > 1:
| with open(sys.argv[1]) as f:
| print json_parser.parse(f.read())
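| # Usage sketch: the transformer runs during parsing, so parse() returns plain
| # Python objects, e.g. json_parser.parse('[1, {"a": "b"}]') -> [1.0, {'a': 'b'}]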
| @@ -0,0 +1,207 @@
| from collections import defaultdict, deque | |||
| from utils import classify, classify_bool, bfs, fzset | |||
| ACTION_SHIFT = 0 | |||
| class GrammarError(Exception): | |||
| pass | |||
| def is_terminal(sym): | |||
| return sym.isupper() or sym[0] == '$' | |||
| class Rule(object): | |||
| """ | |||
| origin : a symbol | |||
| expansion : a list of symbols | |||
| """ | |||
| def __init__(self, origin, expansion, alias=None): | |||
| assert expansion, "No support for empty rules" | |||
| self.origin = origin | |||
| self.expansion = expansion | |||
| self.alias = alias | |||
| def __repr__(self): | |||
| return '<%s : %s>' % (self.origin, ' '.join(self.expansion)) | |||
| class RulePtr(object): | |||
| def __init__(self, rule, index): | |||
| assert isinstance(rule, Rule) | |||
| assert index <= len(rule.expansion) | |||
| self.rule = rule | |||
| self.index = index | |||
| def __repr__(self): | |||
| before = self.rule.expansion[:self.index] | |||
| after = self.rule.expansion[self.index:] | |||
| return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after)) | |||
| @property | |||
| def next(self): | |||
| return self.rule.expansion[self.index] | |||
| def advance(self, sym): | |||
| assert self.next == sym | |||
| return RulePtr(self.rule, self.index+1) | |||
| @property | |||
| def is_satisfied(self): | |||
| return self.index == len(self.rule.expansion) | |||
| def __eq__(self, other): | |||
| return self.rule == other.rule and self.index == other.index | |||
| def __hash__(self): | |||
| return hash((self.rule, self.index)) | |||
| def pairs(lst): | |||
| return zip(lst[:-1], lst[1:]) | |||
| def update_set(set1, set2): | |||
| copy = set(set1) | |||
| set1 |= set2 | |||
| return set1 != copy | |||
| class GrammarAnalyzer(object): | |||
| def __init__(self, rule_tuples): | |||
| rule_tuples = list(rule_tuples) | |||
| rule_tuples.append(('$root', ['start', '$end'])) | |||
| rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples] | |||
| self.rules = set() | |||
| self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples} | |||
| for origin, exp, alias in rule_tuples: | |||
| r = Rule( origin, exp, alias ) | |||
| self.rules.add(r) | |||
| self.rules_by_origin[origin].append(r) | |||
| for r in self.rules: | |||
| for sym in r.expansion: | |||
| if not (is_terminal(sym) or sym in self.rules_by_origin): | |||
| raise GrammarError("Using an undefined rule: %s" % sym) | |||
| self.init_state = self.expand_rule('start') | |||
| def expand_rule(self, rule): | |||
| "Returns all init_ptrs accessible by rule (recursive)" | |||
| init_ptrs = set() | |||
| def _expand_rule(rule): | |||
| assert not is_terminal(rule) | |||
| for r in self.rules_by_origin[rule]: | |||
| init_ptr = RulePtr(r, 0) | |||
| init_ptrs.add(init_ptr) | |||
| new_r = init_ptr.next | |||
| if not is_terminal(new_r): | |||
| yield new_r | |||
| _ = list(bfs([rule], _expand_rule)) | |||
| return fzset(init_ptrs) | |||
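| # Illustration: for "start: item" and "item: A item | A", expand_rule('start')
| # should return the closure of all rule pointers at index 0, roughly
| # {<start : * item>, <item : * A item>, <item : * A>}.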
| def _first(self, r): | |||
| if is_terminal(r): | |||
| return {r} | |||
| else: | |||
| return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)} | |||
| def _calc(self): | |||
| """Calculate FOLLOW sets. | |||
| Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets""" | |||
| symbols = {sym for rule in self.rules for sym in rule.expansion} | |||
| symbols.add('$root') # what about other unused rules? | |||
| # foreach grammar rule X ::= Y(1) ... Y(k) | |||
| # if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then | |||
| # NULLABLE = NULLABLE union {X} | |||
| # for i = 1 to k | |||
| # if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then | |||
| # FIRST(X) = FIRST(X) union FIRST(Y(i)) | |||
| # for j = i+1 to k | |||
| # if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then | |||
| # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X) | |||
| # if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then | |||
| # FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j)) | |||
| # until none of NULLABLE,FIRST,FOLLOW changed in last iteration | |||
| NULLABLE = set() | |||
| FIRST = {} | |||
| FOLLOW = {} | |||
| for sym in symbols: | |||
| FIRST[sym]={sym} if is_terminal(sym) else set() | |||
| FOLLOW[sym]=set() | |||
| changed = True | |||
| while changed: | |||
| changed = False | |||
| for rule in self.rules: | |||
| if set(rule.expansion) <= NULLABLE: | |||
| if update_set(NULLABLE, {rule.origin}): | |||
| changed = True | |||
| for i, sym in enumerate(rule.expansion): | |||
| if set(rule.expansion[:i]) <= NULLABLE: | |||
| if update_set(FIRST[rule.origin], FIRST[sym]): | |||
| changed = True | |||
| if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE:
| if update_set(FOLLOW[sym], FOLLOW[rule.origin]): | |||
| changed = True | |||
| for j in range(i+1, len(rule.expansion)): | |||
| if set(rule.expansion[i+1:j]) <= NULLABLE: | |||
| if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]): | |||
| changed = True | |||
| self.FOLLOW = FOLLOW | |||
| def analyze(self): | |||
| self._calc() | |||
| self.states = {} | |||
| def step(state): | |||
| lookahead = defaultdict(list) | |||
| sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied) | |||
| for rp in sat: | |||
| for term in self.FOLLOW.get(rp.rule.origin, ()): | |||
| lookahead[term].append(('reduce', rp.rule)) | |||
| d = classify(unsat, lambda rp: rp.next) | |||
| for sym, rps in d.items(): | |||
| rps = {rp.advance(sym) for rp in rps} | |||
| for rp in set(rps): | |||
| if not rp.is_satisfied and not is_terminal(rp.next): | |||
| rps |= self.expand_rule(rp.next) | |||
| lookahead[sym].append(('shift', fzset(rps))) | |||
| yield fzset(rps) | |||
| for k, v in lookahead.items(): | |||
| if len(v) > 1: | |||
| for x in v: | |||
| # XXX shift/reduce conflicts are silently resolved in favour of shift, like PLY does.
| # TODO: emit a proper warning when this happens.
| if x[0] == 'shift': | |||
| lookahead[k] = [x] | |||
| for k, v in lookahead.items(): | |||
| assert len(v) == 1, ("Collision", k, v) | |||
| self.states[state] = {k:v[0] for k, v in lookahead.items()} | |||
| x = list(bfs([self.init_state], step)) | |||
| # -- | |||
| self.enum = list(self.states) | |||
| self.enum_rev = {s:i for i,s in enumerate(self.enum)} | |||
| self.states_idx = {} | |||
| for s, la in self.states.items(): | |||
| la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift' else v for k,v in la.items()} | |||
| self.states_idx[ self.enum_rev[s] ] = la | |||
| self.init_state_idx = self.enum_rev[self.init_state] | |||
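| # Usage sketch (illustrative grammar): rule tuples are (origin, expansion[, alias]);
| # a 'start' rule must exist, and '$root'/'$end' are added automatically.
| #
| #   ga = GrammarAnalyzer([('start', ['item']),
| #                         ('item', ['item', 'A']),
| #                         ('item', ['A'])])
| #   ga.analyze()
| #   ga.FOLLOW['item']   # should contain '$end' and 'A'
| #   ga.states_idx       # parse table: {state_index: {symbol: action}}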
| @@ -0,0 +1,217 @@
| from __future__ import absolute_import
| import os
| from .utils import STRING_TYPE | |||
| from .load_grammar import load_grammar | |||
| from .tree import Tree, Transformer | |||
| from .lexer import Lexer | |||
| from .grammar_analysis import GrammarAnalyzer, is_terminal | |||
| from . import parser, earley | |||
| class LarkOptions(object): | |||
| """Specifies the options for Lark | |||
| """ | |||
| OPTIONS_DOC = """ | |||
| parser - Which parser engine to use ("earley" or "lalr"; default: "earley")
| Note: both engines use Lark's lexer.
| transformer - Apply this transformer to every parse tree (default: None)
| debug - Affects verbosity (default: False)
| only_lex - Don't build a parser. Useful for debugging (default: False)
| keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
| keep_empty_trees - Keep trees that have no children (default: True)
| tree_class - The class used to build parse trees (default: Tree)
| cache_grammar - Cache the Lark grammar (default: False)
| ignore_postproc - Don't call the post-processing function (default: False)
| """ | |||
| __doc__ += OPTIONS_DOC | |||
| def __init__(self, options_dict): | |||
| o = dict(options_dict) | |||
| self.debug = bool(o.pop('debug', False)) | |||
| self.only_lex = bool(o.pop('only_lex', False)) | |||
| self.keep_all_tokens = bool(o.pop('keep_all_tokens', False)) | |||
| self.keep_empty_trees = bool(o.pop('keep_empty_trees', True)) | |||
| self.tree_class = o.pop('tree_class', Tree) | |||
| self.cache_grammar = o.pop('cache_grammar', False) | |||
| self.ignore_postproc = bool(o.pop('ignore_postproc', False)) | |||
| self.parser = o.pop('parser', 'earley') | |||
| self.transformer = o.pop('transformer', None) | |||
| if o: | |||
| raise ValueError("Unknown options: %s" % o.keys()) | |||
| class Callback(object): | |||
| pass | |||
| class RuleTreeToText(Transformer): | |||
| def expansions(self, *x): | |||
| return x | |||
| def expansion(self, *symbols): | |||
| return [sym.value for sym in symbols], None | |||
| def alias(self, (expansion, _alias), alias): | |||
| assert _alias is None, (alias, expansion, '-', _alias) | |||
| return expansion, alias.value | |||
| def create_rule_handler(expansion, usermethod): | |||
| to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion) | |||
| if not (is_terminal(sym) and sym.startswith('_'))] | |||
| def _build_ast(match): | |||
| children = [] | |||
| for i, to_expand in to_include: | |||
| if to_expand: | |||
| children += match[i].children | |||
| else: | |||
| children.append(match[i]) | |||
| return usermethod(children) | |||
| return _build_ast | |||
| def create_expand1_tree_builder_function(tree_builder): | |||
| def f(children): | |||
| if len(children) == 1: | |||
| return children[0] | |||
| else: | |||
| return tree_builder(children) | |||
| return f | |||
| def create_rule_inline(f): | |||
| def _f(children): | |||
| return f(*children) | |||
| return _f | |||
| class LALR: | |||
| def build_parser(self, rules, callback): | |||
| ga = GrammarAnalyzer(rules) | |||
| ga.analyze() | |||
| return parser.Parser(ga, callback) | |||
| class Earley: | |||
| @staticmethod | |||
| def _process_expansion(x): | |||
| return [{'literal': s} if is_terminal(s) else s for s in x] | |||
| def build_parser(self, rules, callback): | |||
| rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules] | |||
| return EarleyParser(earley.Parser(rules, 'start')) | |||
| class EarleyParser: | |||
| def __init__(self, parser): | |||
| self.parser = parser | |||
| def parse(self, text): | |||
| res = self.parser.parse(text) | |||
| assert len(res) == 1, 'Ambiguous parse! Not handled yet'
| return res[0] | |||
| class Lark: | |||
| def __init__(self, grammar, **options): | |||
| """ | |||
| grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax) | |||
| options : a dictionary controlling various aspects of Lark. | |||
| """ | |||
| self.options = LarkOptions(options) | |||
| # Some, but not all file-like objects have a 'name' attribute | |||
| try: | |||
| source = grammar.name | |||
| except AttributeError: | |||
| source = '<string>' | |||
| cache_file = "larkcache_%s" % str(hash(grammar)%(2**32)) | |||
| else: | |||
| cache_file = "larkcache_%s" % os.path.basename(source) | |||
| # Drain file-like objects to get their contents | |||
| try: | |||
| read = grammar.read | |||
| except AttributeError: | |||
| pass | |||
| else: | |||
| grammar = read() | |||
| assert isinstance(grammar, STRING_TYPE) | |||
| if self.options.cache_grammar: | |||
| raise NotImplementedError("Not available yet") | |||
| self.tokens, self.rules = load_grammar(grammar) | |||
| self.lexer = self._build_lexer() | |||
| if not self.options.only_lex: | |||
| self.parser_engine = { | |||
| 'lalr': LALR, | |||
| 'earley': Earley, | |||
| }[self.options.parser]() | |||
| self.parser = self._build_parser() | |||
| def _build_lexer(self): | |||
| ignore_tokens = [] | |||
| tokens = {} | |||
| for name, (value, flags) in self.tokens.items(): | |||
| if 'ignore' in flags: | |||
| ignore_tokens.append(name) | |||
| tokens[name] = value | |||
| return Lexer(tokens.items(), {}, ignore=ignore_tokens) | |||
| def _build_parser(self): | |||
| transformer = self.options.transformer | |||
| callback = Callback() | |||
| rules = [] | |||
| rule_tree_to_text = RuleTreeToText() | |||
| for origin, tree in self.rules.items(): | |||
| for expansion, alias in rule_tree_to_text.transform(tree): | |||
| if alias and origin.startswith('_'): | |||
| raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin) | |||
| expand1 = origin.startswith('?') | |||
| inline_args = origin.startswith('*') or (alias and alias.startswith('*')) | |||
| _origin = origin.lstrip('?*') | |||
| if alias: | |||
| alias = alias.lstrip('*') | |||
| _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion)) | |||
| assert not hasattr(callback, _alias) | |||
| f = getattr(transformer, alias or _origin, None) | |||
| if f is None: | |||
| if alias: | |||
| f = self._create_tree_builder_function(alias) | |||
| else: | |||
| f = self._create_tree_builder_function(_origin) | |||
| if expand1: | |||
| f = create_expand1_tree_builder_function(f) | |||
| else: | |||
| if inline_args: | |||
| f = create_rule_inline(f) | |||
| alias_handler = create_rule_handler(expansion, f) | |||
| setattr(callback, _alias, alias_handler) | |||
| rules.append((_origin, expansion, _alias)) | |||
| return self.parser_engine.build_parser(rules, callback) | |||
| __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC | |||
| def _create_tree_builder_function(self, name): | |||
| tree_class = self.options.tree_class | |||
| def f(children): | |||
| return tree_class(name, children) | |||
| return f | |||
| def lex(self, text): | |||
| return self.lexer.lex(text) | |||
| def parse(self, text): | |||
| assert not self.options.only_lex | |||
| l = list(self.lex(text)) | |||
| return self.parser.parse(l) | |||
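| # Typical usage (sketch; grammar_string, MyTransformer and text are placeholders):
| #
| #   parser = Lark(grammar_string, parser='lalr', transformer=MyTransformer())
| #   result = parser.parse(text)
| #
| # Without a transformer, parse() returns a parse tree built with options.tree_class
| # (Tree by default); with one, it returns whatever the transformer callbacks produce.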
| @@ -0,0 +1,84 @@
| ## Lexer Implementation | |||
| from utils import Str | |||
| class LexError(Exception): | |||
| pass | |||
| class Token(Str): | |||
| def __new__(cls, type, value, pos_in_stream=None): | |||
| inst = Str.__new__(cls, value) | |||
| inst.type = type | |||
| inst.pos_in_stream = pos_in_stream | |||
| inst.value = value | |||
| return inst | |||
| def __repr__(self): | |||
| return 'Token(%s, %s, %s)' % (self.type, self.value, self.pos_in_stream) | |||
| class Regex: | |||
| def __init__(self, pattern, flags=()): | |||
| self.pattern = pattern | |||
| self.flags = flags | |||
| import re | |||
| LIMIT = 50 # Python's re module limits the number of named groups per pattern, so compile the tokens in chunks
| class Lexer(object): | |||
| def __init__(self, tokens, callbacks, ignore=()): | |||
| self.ignore = ignore | |||
| # Sanitization | |||
| token_names = {t[0] for t in tokens} | |||
| for t in tokens: | |||
| try: | |||
| re.compile(t[1]) | |||
| except re.error:
| raise LexError("Cannot compile token %s: %s" % t)
| assert all(t in token_names for t in ignore) | |||
| # Init | |||
| self.tokens = tokens | |||
| self.callbacks = callbacks | |||
| self.tokens.sort(key=lambda x:len(x[1]), reverse=True) | |||
| self.mres = [] | |||
| self.name_from_index = [] | |||
| x = tokens | |||
| while x: | |||
| mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in x[:LIMIT])) | |||
| self.mres.append(mre) | |||
| self.name_from_index.append( {i:n for n,i in mre.groupindex.items()} ) | |||
| x = x[LIMIT:] | |||
| def lex(self, stream): | |||
| lex_pos = 0 | |||
| while True: | |||
| i = 0 | |||
| for mre in self.mres: | |||
| m = mre.match(stream, lex_pos) | |||
| if m: | |||
| value = m.group(0) | |||
| type_ = self.name_from_index[i][m.lastindex] | |||
| t = Token(type_, value, lex_pos) | |||
| if t.type in self.callbacks: | |||
| self.callbacks[t.type](t) | |||
| if t.type not in self.ignore: | |||
| yield t | |||
| lex_pos += len(value) | |||
| break | |||
| i += 1 | |||
| else: | |||
| if lex_pos < len(stream): | |||
| context = stream[lex_pos:lex_pos+5] | |||
| raise LexError("No token defined for: '%s' in %s" % (stream[lex_pos], context)) | |||
| break | |||
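| # Usage sketch (the token definitions below are made up for illustration):
| #
| #   tokens = [('NUMBER', r'\d+'), ('PLUS', r'\+'), ('WS', r'\s+')]
| #   for tok in Lexer(tokens, callbacks={}, ignore=['WS']).lex('1 + 2'):
| #       print tok.type, tok.value, tok.pos_in_stream   # WS tokens are skipped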
| @@ -0,0 +1,358 @@
| import re | |||
| from lexer import Lexer, Token | |||
| from grammar_analysis import GrammarAnalyzer | |||
| from parser import Parser | |||
| from tree import Tree as T, Transformer, Visitor | |||
| _TOKEN_NAMES = { | |||
| ':' : 'COLON', | |||
| ',' : 'COMMA', | |||
| ';' : 'SEMICOLON', | |||
| '+' : 'PLUS', | |||
| '-' : 'MINUS', | |||
| '*' : 'STAR', | |||
| '/' : 'SLASH', | |||
| '|' : 'VBAR', | |||
| '!' : 'BANG', | |||
| '?' : 'QMARK', | |||
| '#' : 'HASH', | |||
| '$' : 'DOLLAR', | |||
| '&' : 'AMPERSAND', | |||
| '<' : 'LESSTHAN', | |||
| '>' : 'MORETHAN', | |||
| '=' : 'EQUAL', | |||
| '.' : 'DOT', | |||
| '%' : 'PERCENT', | |||
| '`' : 'BACKQUOTE', | |||
| '^' : 'CIRCUMFLEX', | |||
| '"' : 'DBLQUOTE', | |||
| '\'' : 'QUOTE', | |||
| '~' : 'TILDE', | |||
| '@' : 'AT', | |||
| '(' : 'LPAR', | |||
| ')' : 'RPAR', | |||
| '{' : 'LBRACE', | |||
| '}' : 'RBRACE', | |||
| '[' : 'LSQB', | |||
| ']' : 'RSQB', | |||
| } | |||
| # Grammar Parser | |||
| TOKENS = { | |||
| 'LPAR': '\(', | |||
| 'RPAR': '\)', | |||
| 'LBRA': '\[', | |||
| 'RBRA': '\]', | |||
| 'OP': '[+*?]', | |||
| 'COLON': ':', | |||
| 'OR': '\|', | |||
| 'DOT': '\.', | |||
| 'RULE': '[_?*]?[a-z][_a-z0-9]*', | |||
| 'TOKEN': '_?[A-Z][_A-Z0-9]*', | |||
| 'STRING': r'".*?[^\\]"', | |||
| 'REGEXP': r"/(.|\n)*?[^\\]/", | |||
| 'NL': r'(\r?\n)+\s*', | |||
| 'WS': r'[ \t]+', | |||
| 'COMMENT': r'#[^\n]*\n', | |||
| 'TO': '->' | |||
| } | |||
| RULES = [ | |||
| ('start', ['list']), | |||
| ('list', ['item']), | |||
| ('list', ['list', 'item']), | |||
| ('item', ['rule']), | |||
| ('item', ['token']), | |||
| ('item', ['NL']), | |||
| ('rule', ['RULE', 'COLON', 'expansions', 'NL']), | |||
| ('expansions', ['expansion']), | |||
| ('expansions', ['expansions', 'OR', 'expansion']), | |||
| ('expansions', ['expansions', 'NL', 'OR', 'expansion']), | |||
| ('expansion', ['_expansion']), | |||
| ('expansion', ['_expansion', 'TO', 'RULE']), | |||
| ('_expansion', ['expr']), | |||
| ('_expansion', ['_expansion', 'expr']), | |||
| ('expr', ['atom']), | |||
| ('expr', ['atom', 'OP']), | |||
| ('atom', ['LPAR', 'expansions', 'RPAR']), | |||
| ('atom', ['maybe']), | |||
| ('atom', ['RULE']), | |||
| ('atom', ['TOKEN']), | |||
| ('atom', ['anontoken']), | |||
| ('anontoken', ['tokenvalue']), | |||
| ('maybe', ['LBRA', 'expansions', 'RBRA']), | |||
| ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']), | |||
| ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']), | |||
| ('tokenvalue', ['REGEXP']), | |||
| ('tokenvalue', ['STRING']), | |||
| ('tokenmods', ['DOT', 'RULE']), | |||
| ('tokenmods', ['tokenmods', 'DOT', 'RULE']), | |||
| ] | |||
| class SaveDefinitions(object): | |||
| def __init__(self): | |||
| self.rules = {} | |||
| self.tokens = {} | |||
| self.i = 0 | |||
| def atom__3(self, _1, value, _2): | |||
| return value | |||
| def atom__1(self, value): | |||
| return value | |||
| def expr__1(self, expr): | |||
| return expr | |||
| def expr(self, *x): | |||
| return T('expr', x) | |||
| def expansion__1(self, expansion): | |||
| return expansion | |||
| def expansion__3(self, expansion, _, alias): | |||
| return T('alias', [expansion, alias]) | |||
| def _expansion(self, *x): | |||
| return T('expansion', x) | |||
| def expansions(self, *x): | |||
| items = [i for i in x if isinstance(i, T)] | |||
| return T('expansions', items) | |||
| def maybe(self, _1, expr, _2): | |||
| return T('expr', [expr, Token('OP', '?', -1)]) | |||
| def rule(self, name, _1, expansion, _2): | |||
| name = name.value | |||
| if name in self.rules: | |||
| raise ValueError("Rule '%s' defined more than once" % name) | |||
| self.rules[name] = expansion | |||
| def token(self, *x): | |||
| name = x[0].value | |||
| if name in self.tokens: | |||
| raise ValueError("Token '%s' defined more than once" % name) | |||
| if len(x) == 4: | |||
| self.tokens[name] = x[2][1], [] | |||
| else: | |||
| self.tokens[name] = x[3][1], x[1].children | |||
| def tokenvalue(self, tokenvalue): | |||
| value = tokenvalue.value[1:-1] | |||
| if tokenvalue.type == 'STRING': | |||
| value = re.escape(value) | |||
| return tokenvalue, value | |||
| def anontoken(self, (token, value)): | |||
| if token.type == 'STRING': | |||
| try: | |||
| token_name = _TOKEN_NAMES[token.value[1:-1]] | |||
| except KeyError: | |||
| if value.isalnum() and value[0].isalpha(): | |||
| token_name = value.upper() | |||
| else: | |||
| token_name = 'ANONSTR_%d' % self.i | |||
| self.i += 1 | |||
| token_name = '__' + token_name | |||
| elif token.type == 'REGEXP': | |||
| token_name = 'ANONRE_%d' % self.i | |||
| self.i += 1 | |||
| else: | |||
| assert False, token
| if token_name not in self.tokens: | |||
| self.tokens[token_name] = value, [] | |||
| return Token('TOKEN', token_name, -1) | |||
| def tokenmods__2(self, _, rule): | |||
| return T('tokenmods', [rule.value]) | |||
| def tokenmods__3(self, tokenmods, _, rule): | |||
| return T('tokenmods', tokenmods.children + [rule.value]) | |||
| def start(self, *x): pass | |||
| def list(self, *x): pass | |||
| def item(self, *x): pass | |||
| class EBNF_to_BNF(Transformer): | |||
| def __init__(self): | |||
| self.new_rules = {} | |||
| self.prefix = 'anon' | |||
| self.i = 0 | |||
| def _add_recurse_rule(self, type_, expr): | |||
| new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) | |||
| self.i += 1 | |||
| t = Token('RULE', new_name, -1) | |||
| self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]) | |||
| return t | |||
| def expr(self, rule, op): | |||
| if op.value == '?': | |||
| return T('expansions', [rule, T('expansion', [])]) | |||
| elif op.value == '+': | |||
| # a : b c+ d | |||
| # --> | |||
| # a : b _c d | |||
| # _c : _c c | c; | |||
| return self._add_recurse_rule('plus', rule) | |||
| elif op.value == '*': | |||
| # a : b c* d | |||
| # --> | |||
| # a : b _c? d | |||
| # _c : _c c | c; | |||
| new_name = self._add_recurse_rule('star', rule) | |||
| return T('expansions', [new_name, T('expansion', [])]) | |||
| assert False, op | |||
| class SimplifyRule_Visitor(Visitor): | |||
| @staticmethod | |||
| def _flatten(tree): | |||
| while True: | |||
| to_expand = [i for i, child in enumerate(tree.children) | |||
| if isinstance(child, T) and child.data == tree.data] | |||
| if not to_expand: | |||
| break | |||
| tree.expand_kids_by_index(*to_expand) | |||
| def expansion(self, tree): | |||
| # rules_list unpacking | |||
| # a : b (c|d) e | |||
| # --> | |||
| # a : b c e | b d e | |||
| # | |||
| # In AST terms: | |||
| # expansion(b, expansions(c, d), e) | |||
| # --> | |||
| # expansions( expansion(b, c, e), expansion(b, d, e) ) | |||
| while True: | |||
| self._flatten(tree) | |||
| for i, child in enumerate(tree.children): | |||
| if isinstance(child, T) and child.data == 'expansions': | |||
| tree.data = 'expansions' | |||
| tree.children = [self.visit(T('expansion', [option if i==j else other | |||
| for j, other in enumerate(tree.children)])) | |||
| for option in child.children] | |||
| break | |||
| else: | |||
| break | |||
| def alias(self, tree): | |||
| rule, alias_name = tree.children | |||
| if rule.data == 'expansions': | |||
| aliases = [] | |||
| for child in tree.children[0].children: | |||
| aliases.append(T('alias', [child, alias_name])) | |||
| tree.data = 'expansions' | |||
| tree.children = aliases | |||
| expansions = _flatten | |||
| def dict_update_safe(d1, d2): | |||
| for k, v in d2.iteritems(): | |||
| assert k not in d1 | |||
| d1[k] = v | |||
| def generate_aliases(): | |||
| sd = SaveDefinitions() | |||
| for name, expansion in RULES: | |||
| try: | |||
| f = getattr(sd, "%s__%s" % (name, len(expansion))) | |||
| except AttributeError: | |||
| f = getattr(sd, name) | |||
| yield name, expansion, f.__name__ | |||
| def inline_args(f): | |||
| def _f(self, args): | |||
| return f(*args) | |||
| return _f | |||
| class GrammarLoader: | |||
| def __init__(self): | |||
| self.rules = list(generate_aliases()) | |||
| self.ga = GrammarAnalyzer(self.rules) | |||
| self.ga.analyze() | |||
| self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT']) | |||
| self.simplify_rule = SimplifyRule_Visitor() | |||
| def _generate_parser_callbacks(self, callbacks): | |||
| d = {alias: inline_args(getattr(callbacks, alias)) | |||
| for _n, _x, alias in self.rules} | |||
| return type('Callback', (), d)() | |||
| def load_grammar(self, grammar_text): | |||
| sd = SaveDefinitions() | |||
| c = self._generate_parser_callbacks(sd) | |||
| p = Parser(self.ga, c) | |||
| p.parse( list(self.lexer.lex(grammar_text+"\n")) ) | |||
| ebnf_to_bnf = EBNF_to_BNF() | |||
| rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()} | |||
| dict_update_safe(rules, ebnf_to_bnf.new_rules) | |||
| for r in rules.values(): | |||
| self.simplify_rule.visit(r) | |||
| return sd.tokens, rules | |||
| load_grammar = GrammarLoader().load_grammar | |||
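| # load_grammar(text) returns a (tokens, rules) pair:
| #   tokens : dict of TOKEN_NAME -> (regexp_string, modifier_list), e.g. {'WS': ('\s+', ['ignore'])}
| #   rules  : dict of rule_name -> Tree('expansions', ...), with the EBNF operators already expanded away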
| def test(): | |||
| g = """ | |||
| start: add | |||
| # Rules | |||
| add: mul | |||
| | add _add_sym mul | |||
| mul: _atom | |||
| | mul _add_mul _atom | |||
| neg: "-" _atom | |||
| _atom: neg | |||
| | number | |||
| | "(" add ")" | |||
| # Tokens | |||
| number: /[\d.]+/ | |||
| _add_sym: "+" | "-" | |||
| _add_mul: "*" | "/" | |||
| WS.ignore: /\s+/ | |||
| """ | |||
| g2 = """ | |||
| start: a | |||
| a: "a" (b*|(c d)+) "b"? | |||
| b: "b" | |||
| c: "c" | |||
| d: "+" | "-" | |||
| """ | |||
| load_grammar(g) | |||
| @@ -0,0 +1,61 @@
| from grammar_analysis import ACTION_SHIFT | |||
| class ParseError(Exception): | |||
| pass | |||
| class Parser(object): | |||
| def __init__(self, ga, callback, temp=False): | |||
| self.ga = ga | |||
| self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) | |||
| for rule in ga.rules} | |||
| def parse(self, seq): | |||
| states_idx = self.ga.states_idx | |||
| stack = [(None, self.ga.init_state_idx)] | |||
| i = 0 | |||
| res = None | |||
| def get_action(key): | |||
| state = stack[-1][1] | |||
| try: | |||
| return states_idx[state][key] | |||
| except KeyError: | |||
| expected = states_idx[state].keys() | |||
| context = ' '.join(['%s(%r)' % (t.type, t.value) for t in seq[i:i+5]]) | |||
| raise ParseError("Unexpected input %r.\nExpected: %s\nContext: %s" % (key, expected, context)) | |||
| def reduce(rule): | |||
| s = stack[-len(rule.expansion):] | |||
| del stack[-len(rule.expansion):] | |||
| res = self.callbacks[rule]([x[0] for x in s]) | |||
| if rule.origin == 'start': | |||
| return res | |||
| _action, new_state = get_action(rule.origin) | |||
| assert _action == ACTION_SHIFT | |||
| stack.append((res, new_state)) | |||
| # Main LALR-parser loop | |||
| while i < len(seq): | |||
| action, arg = get_action(seq[i].type) | |||
| if action == ACTION_SHIFT: | |||
| stack.append((seq[i], arg)) | |||
| i+= 1 | |||
| else: | |||
| reduce(arg) | |||
| while len(stack) > 1: | |||
| _action, rule = get_action('$end') | |||
| assert _action == 'reduce' | |||
| res = reduce(rule) | |||
| if res: | |||
| break | |||
| assert stack == [(None, self.ga.init_state_idx)], len(stack) | |||
| return res | |||
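| # Usage sketch: this parser is normally driven from lark.py, roughly:
| #
| #   ga = GrammarAnalyzer(rule_tuples)
| #   ga.analyze()
| #   result = Parser(ga, callback_object).parse(token_list)   # tokens need a .type attribute
| #
| # callback_object must expose one method per rule alias (or origin), each taking
| # the list of matched children as its single argument.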
| @@ -0,0 +1,83 @@
| class Tree(object): | |||
| def __init__(self, data, children): | |||
| self.data = data | |||
| self.children = list(children) | |||
| def __repr__(self): | |||
| return 'Tree(%s, %s)' % (self.data, self.children) | |||
| def _pretty(self, level, indent_str): | |||
| if len(self.children) == 1 and not isinstance(self.children[0], Tree): | |||
| return [ indent_str*level, self.data, '\t', '%s' % self.children[0], '\n'] | |||
| l = [ indent_str*level, self.data, '\n' ] | |||
| for n in self.children: | |||
| if isinstance(n, Tree): | |||
| l += n._pretty(level+1, indent_str) | |||
| else: | |||
| l += [ indent_str*(level+1), '%s' % n, '\n' ] | |||
| return l | |||
| def pretty(self, indent_str=' '): | |||
| return ''.join(self._pretty(0, indent_str)) | |||
| def expand_kids_by_index(self, *indices): | |||
| for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices | |||
| kid = self.children[i] | |||
| self.children[i:i+1] = kid.children | |||
| # def find_path(self, pred): | |||
| # if pred(self): | |||
| # yield [] | |||
| # else: | |||
| # for i, c in enumerate(self.children): | |||
| # if isinstance(c, Tree): | |||
| # for path in c.find_path(pred): | |||
| # yield [i] + path | |||
| # def follow_path(self, path): | |||
| # x = self | |||
| # for step in path: | |||
| # x = x.children[step] | |||
| # return x | |||
| # def set_at_path(self, path, value): | |||
| # x = self.follow_path(path[:-1]) | |||
| # x.children[path[-1]] = value | |||
| def clone(self): | |||
| return Tree(self.data, [c.clone() if isinstance(c, Tree) else c for c in self.children]) | |||
| class Transformer(object): | |||
| def transform(self, tree): | |||
| items = [self.transform(c) if isinstance(c, Tree) else c for c in tree.children] | |||
| try: | |||
| f = getattr(self, tree.data) | |||
| except AttributeError: | |||
| return self.__default__(tree.data, items) | |||
| else: | |||
| return f(*items) | |||
| def __default__(self, data, children): | |||
| return Tree(data, children) | |||
| class Visitor(object): | |||
| def visit(self, tree): | |||
| for child in tree.children: | |||
| if isinstance(child, Tree): | |||
| self.visit(child) | |||
| f = getattr(self, tree.data, self.__default__) | |||
| f(tree) | |||
| return tree | |||
| def __default__(self, tree): | |||
| pass | |||
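| # Example (sketch): a Transformer method is called with the already-transformed
| # children of every node whose data matches the method name.
| #
| #   class EvalAdd(Transformer):
| #       def add(self, left, right):
| #           return left + right
| #
| #   EvalAdd().transform(Tree('add', [1, Tree('add', [2, 3])]))   # -> 6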
| @@ -0,0 +1,51 @@
| from collections import deque | |||
| class fzset(frozenset): | |||
| def __repr__(self): | |||
| return '{%s}' % ', '.join(map(repr, self)) | |||
| def classify_bool(seq, pred): | |||
| true_elems = [] | |||
| false_elems = [] | |||
| for elem in seq: | |||
| if pred(elem): | |||
| true_elems.append(elem) | |||
| else: | |||
| false_elems.append(elem) | |||
| return true_elems, false_elems | |||
| def classify(seq, key=None): | |||
| d = {} | |||
| for item in seq: | |||
| k = key(item) if (key is not None) else item | |||
| if k in d: | |||
| d[k].append(item) | |||
| else: | |||
| d[k] = [item] | |||
| return d | |||
| def bfs(initial, expand): | |||
| open_q = deque(list(initial)) | |||
| visited = set(open_q) | |||
| while open_q: | |||
| node = open_q.popleft() | |||
| yield node | |||
| for next_node in expand(node): | |||
| if next_node not in visited: | |||
| visited.add(next_node) | |||
| open_q.append(next_node) | |||
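| # Examples (sketch):
| #   classify([1, 2, 1, 3])                               -> {1: [1, 1], 2: [2], 3: [3]}
| #   classify(['ant', 'bee', 'ape'], key=lambda w: w[0])  -> {'a': ['ant', 'ape'], 'b': ['bee']}
| #   list(bfs([1], lambda n: [n*2] if n < 8 else []))     -> [1, 2, 4, 8]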
| try: | |||
| STRING_TYPE = basestring | |||
| except NameError: # Python 3 | |||
| STRING_TYPE = str | |||
| Str = type(u'') | |||