@@ -0,0 +1,148 @@
"My name is Earley"

from .utils import classify

class MatchFailed(object):
    pass

class AbortParseMatch(Exception):
    pass


class Rule(object):
    def __init__(self, name, symbols, postprocess):
        self.name = name
        self.symbols = symbols
        self.postprocess = postprocess


class State(object):
    def __init__(self, rule, expect, reference, data=None):
        self.rule = rule
        self.expect = expect
        self.reference = reference
        self.data = data or []

        self.is_complete = (self.expect == len(self.rule.symbols))
        if not self.is_complete:
            self.expect_symbol = self.rule.symbols[self.expect]
            self.is_literal = isinstance(self.expect_symbol, dict)
            if self.is_literal:
                self.expect_symbol = self.expect_symbol['literal']
                assert isinstance(self.expect_symbol, (str, unicode)), self.expect_symbol

    def next_state(self, data):
        return State(self.rule, self.expect+1, self.reference, self.data + [data])

    def consume_terminal(self, inp):
        if not self.is_complete and self.is_literal:
            # PORT: originally tests regexp
            if self.expect_symbol == inp.type:
                return self.next_state(inp)

    def consume_nonterminal(self, inp):
        if not self.is_complete and not self.is_literal:
            if self.expect_symbol == inp:
                return self.next_state(inp)
    def process(self, location, ind, table, rules, added_rules):
        if self.is_complete:
            # Completer: this rule is done; advance every state that was waiting for it
            if self.rule.postprocess:
                try:
                    # self.data = self.rule.postprocess(self.data, self.reference)
                    self.data = self.rule.postprocess(self.data)
                except AbortParseMatch:
                    self.data = MatchFailed

            if self.data is not MatchFailed:
                for s in table[self.reference]:
                    x = s.consume_nonterminal(self.rule.name)
                    if x:
                        x.data[-1] = self.data
                        x.epsilon_closure(location, ind, table)
        else:
            # Predictor: expand the expected nonterminal into this column
            exp = self.rule.symbols[self.expect]
            if isinstance(exp, dict):   # a literal; nothing to predict
                return

            for r in rules[exp]:
                assert r.name == exp
                if r not in added_rules:
                    if r.symbols:
                        added_rules.add(r)
                        State(r, 0, location).epsilon_closure(location, ind, table)
                    else:
                        # Empty rule: complete it immediately
                        new_copy = self.consume_nonterminal(r.name)
                        if r.postprocess:
                            new_copy.data[-1] = r.postprocess([])
                            # new_copy.data[-1] = r.postprocess([], self.reference)
                        else:
                            new_copy.data[-1] = []

                        new_copy.epsilon_closure(location, ind, table)
    def epsilon_closure(self, location, ind, table, result=None):
        col = table[location]
        if not result:
            result = col

        result.append(self)

        if not self.is_complete:
            for i in xrange(ind):
                state = col[i]
                if state.is_complete and state.reference == location:
                    x = self.consume_nonterminal(state.rule.name)
                    if x:
                        x.data[-1] = state.data
                        x.epsilon_closure(location, ind, table)

class Parser(object):
    def __init__(self, rules, start=None):
        self.table = [[]]
        self.rules = [Rule(r['name'], r['symbols'], r.get('postprocess', None)) for r in rules]
        self.rules_by_name = classify(self.rules, lambda r: r.name)
        self.start = start or self.rules[0].name

        initial_rules = set(self.rules_by_name[self.start])
        self.table[0] += [State(r, 0, 0) for r in initial_rules]
        self.advance_to(0, initial_rules)

        self.current = 0

    def advance_to(self, n, added_rules):
        for w, s in enumerate(self.table[n]):
            s.process(n, w, self.table, self.rules_by_name, added_rules)
    def parse(self, chunk):
        chunk_pos = -1
        for chunk_pos, chunk_item in enumerate(chunk):
            self.table.append([])

            for s in self.table[self.current + chunk_pos]:
                x = s.consume_terminal(chunk_item)
                if x:
                    self.table[self.current + chunk_pos + 1].append(x)

            added_rules = set()
            self.advance_to(self.current + chunk_pos + 1, added_rules)

            if not self.table[-1]:
                # Tokens from this package's lexer carry pos_in_stream, not line/column
                raise Exception('Error at token %r (stream pos %s)' % (chunk_item, getattr(chunk_item, 'pos_in_stream', '?')))

        self.current += chunk_pos + 1   # one column was added per consumed token
        return list(self.finish())
    def finish(self):
        for t in self.table[-1]:
            if (t.rule.name == self.start
                    and t.expect == len(t.rule.symbols)
                    and t.reference == 0
                    and t.data is not MatchFailed):
                yield t.data
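
# A minimal usage sketch (illustration only; the names below are hypothetical).
# Rules are plain dicts, terminals are written {'literal': TOKEN_TYPE}, and the
# input must be a sequence of token-like objects exposing a .type attribute:
#
#     from collections import namedtuple
#     Tok = namedtuple('Tok', 'type pos_in_stream')
#
#     rules = [
#         {'name': 'start', 'symbols': ['ab']},
#         {'name': 'ab', 'symbols': [{'literal': 'A'}, {'literal': 'B'}],
#          'postprocess': lambda data: ('ab', data)},
#     ]
#     results = Parser(rules, 'start').parse([Tok('A', 0), Tok('B', 1)])
#     # -> [[('ab', [Tok(type='A', pos_in_stream=0), Tok(type='B', pos_in_stream=1)])]]
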
@@ -0,0 +1,59 @@
from lark.tree import Transformer
from lark.lark import Lark

calc_grammar = r"""
    ?start: sum
          | NAME "=" sum        -> *assign_var

    ?sum: product
        | sum "+" product       -> *add
        | sum "-" product       -> *sub

    ?product: atom
            | product "*" atom  -> *mul
            | product "/" atom  -> *div

    ?atom: /[\d.]+/             -> *number
         | "-" atom             -> *neg
         | NAME                 -> *var
         | "(" sum ")"

    NAME: /\w+/
    WS.ignore: /\s+/
"""

class CalculateTree(Transformer):
    from operator import add, sub, mul, div, neg
    number = float

    def __init__(self):
        self.vars = {}

    def assign_var(self, name, value):
        self.vars[name] = value
        return value

    def var(self, name):
        return self.vars[name]


calc_parser = Lark(calc_grammar, parser='lalr', transformer=CalculateTree())
calc = calc_parser.parse

def main():
    while True:
        try:
            s = raw_input('> ')
        except EOFError:
            break
        print(calc(s))

def test():
    print calc("a = 1+2")
    print calc("1+a*-3")

if __name__ == '__main__':
    test()
    # main()
@@ -0,0 +1,62 @@
import sys

from lark.lark import Lark
from lark.tree import Transformer

json_grammar = r"""
    ?start: value

    ?value: object
          | array
          | string
          | number
          | "true"  -> *true
          | "false" -> *false
          | "null"  -> *null

    array : "[" [value ("," value)*] "]"
    object : "{" [pair ("," pair)*] "}"
    pair : string ":" value

    *number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
    *string : /".*?(?<!\\)"/

    WS.ignore.newline: /[ \t\n]+/
"""

class TreeToJson(Transformer):
    def string(self, s):
        return s[1:-1]

    array = list
    pair = tuple
    object = dict
    number = float

    null = lambda self: None
    true = lambda self: True
    false = lambda self: False


json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())

def test():
    test_json = '''
        {
            "empty_object" : {},
            "empty_array"  : [],
            "booleans"     : { "YES" : true, "NO" : false },
            "numbers"      : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
            "strings"      : [ "This", [ "And" , "That" ] ],
            "nothing"      : null
        }
    '''

    j = json_parser.parse(test_json)
    print j

    import json
    assert j == json.loads(test_json)
if __name__ == '__main__':
    test()

    # Parse a file given on the command line, if one was provided
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as f:
            print json_parser.parse(f.read())
@@ -0,0 +1,207 @@
from collections import defaultdict, deque

from utils import classify, classify_bool, bfs, fzset

ACTION_SHIFT = 0

class GrammarError(Exception):
    pass

def is_terminal(sym):
    return sym.isupper() or sym[0] == '$'

class Rule(object):
    """
        origin : a symbol
        expansion : a list of symbols
    """
    def __init__(self, origin, expansion, alias=None):
        assert expansion, "No support for empty rules"
        self.origin = origin
        self.expansion = expansion
        self.alias = alias

    def __repr__(self):
        return '<%s : %s>' % (self.origin, ' '.join(self.expansion))

class RulePtr(object):
    def __init__(self, rule, index):
        assert isinstance(rule, Rule)
        assert index <= len(rule.expansion)
        self.rule = rule
        self.index = index

    def __repr__(self):
        before = self.rule.expansion[:self.index]
        after = self.rule.expansion[self.index:]
        return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after))

    @property
    def next(self):
        return self.rule.expansion[self.index]

    def advance(self, sym):
        assert self.next == sym
        return RulePtr(self.rule, self.index+1)

    @property
    def is_satisfied(self):
        return self.index == len(self.rule.expansion)

    def __eq__(self, other):
        return self.rule == other.rule and self.index == other.index
    def __hash__(self):
        return hash((self.rule, self.index))


def pairs(lst):
    return zip(lst[:-1], lst[1:])

def update_set(set1, set2):
    copy = set(set1)
    set1 |= set2
    return set1 != copy

class GrammarAnalyzer(object):
    def __init__(self, rule_tuples):
        rule_tuples = list(rule_tuples)
        rule_tuples.append(('$root', ['start', '$end']))
        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]

        self.rules = set()
        self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
        for origin, exp, alias in rule_tuples:
            r = Rule( origin, exp, alias )
            self.rules.add(r)
            self.rules_by_origin[origin].append(r)

        for r in self.rules:
            for sym in r.expansion:
                if not (is_terminal(sym) or sym in self.rules_by_origin):
                    raise GrammarError("Using an undefined rule: %s" % sym)

        self.init_state = self.expand_rule('start')

    def expand_rule(self, rule):
        "Returns all init_ptrs accessible by rule (recursive)"
        init_ptrs = set()
        def _expand_rule(rule):
            assert not is_terminal(rule)

            for r in self.rules_by_origin[rule]:
                init_ptr = RulePtr(r, 0)
                init_ptrs.add(init_ptr)

                new_r = init_ptr.next
                if not is_terminal(new_r):
                    yield new_r

        _ = list(bfs([rule], _expand_rule))

        return fzset(init_ptrs)

    def _first(self, r):
        if is_terminal(r):
            return {r}
        else:
            return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)}
    def _calc(self):
        """Calculate FOLLOW sets.

        Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
        symbols = {sym for rule in self.rules for sym in rule.expansion}
        symbols.add('$root')    # what about other unused rules?

        # foreach grammar rule X ::= Y(1) ... Y(k)
        # if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
        #   NULLABLE = NULLABLE union {X}
        # for i = 1 to k
        #   if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
        #     FIRST(X) = FIRST(X) union FIRST(Y(i))
        #   for j = i+1 to k
        #     if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
        #       FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
        #     if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
        #       FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
        # until none of NULLABLE,FIRST,FOLLOW changed in last iteration

        NULLABLE = set()
        FIRST = {}
        FOLLOW = {}
        for sym in symbols:
            FIRST[sym] = {sym} if is_terminal(sym) else set()
            FOLLOW[sym] = set()

        changed = True
        while changed:
            changed = False

            for rule in self.rules:
                if set(rule.expansion) <= NULLABLE:
                    if update_set(NULLABLE, {rule.origin}):
                        changed = True
                for i, sym in enumerate(rule.expansion):
                    if set(rule.expansion[:i]) <= NULLABLE:
                        if update_set(FIRST[rule.origin], FIRST[sym]):
                            changed = True

                    # {Y(i+1),...,Y(k)} subset of NULLABLE  =>  FOLLOW(Y(i)) |= FOLLOW(X)
                    if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE:
                        if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
                            changed = True

                    for j in range(i+1, len(rule.expansion)):
                        if set(rule.expansion[i+1:j]) <= NULLABLE:
                            if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
                                changed = True

        self.FOLLOW = FOLLOW
    def analyze(self):
        self._calc()

        self.states = {}
        def step(state):
            lookahead = defaultdict(list)
            sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
            for rp in sat:
                for term in self.FOLLOW.get(rp.rule.origin, ()):
                    lookahead[term].append(('reduce', rp.rule))

            d = classify(unsat, lambda rp: rp.next)
            for sym, rps in d.items():
                rps = {rp.advance(sym) for rp in rps}

                for rp in set(rps):
                    if not rp.is_satisfied and not is_terminal(rp.next):
                        rps |= self.expand_rule(rp.next)

                lookahead[sym].append(('shift', fzset(rps)))
                yield fzset(rps)

            for k, v in lookahead.items():
                if len(v) > 1:
                    for x in v:
                        # XXX resolving shift/reduce conflicts in favor of shift, like PLY
                        #     TODO: emit a proper warning
                        if x[0] == 'shift':
                            lookahead[k] = [x]

            for k, v in lookahead.items():
                assert len(v) == 1, ("Collision", k, v)

            self.states[state] = {k:v[0] for k, v in lookahead.items()}

        x = list(bfs([self.init_state], step))

        # --
        # Re-index the states numerically; numbers are cheaper to handle than frozensets
        self.enum = list(self.states)
        self.enum_rev = {s:i for i,s in enumerate(self.enum)}
        self.states_idx = {}

        for s, la in self.states.items():
            la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift' else v for k,v in la.items()}
            self.states_idx[ self.enum_rev[s] ] = la

        self.init_state_idx = self.enum_rev[self.init_state]
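
# A minimal usage sketch (illustration only): rule tuples are (origin, expansion[, alias]);
# terminals are UPPERCASE or $-prefixed names (see is_terminal):
#
#     ga = GrammarAnalyzer([
#         ('start', ['sum']),
#         ('sum', ['sum', 'PLUS', 'NUM']),
#         ('sum', ['NUM']),
#     ])
#     ga.analyze()
#     # ga.init_state_idx and ga.states_idx (state -> {lookahead: (action, arg)})
#     # are what parser.Parser consumes.
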
@@ -0,0 +1,217 @@
from __future__ import absolute_import

import os

from .utils import STRING_TYPE
from .load_grammar import load_grammar
from .tree import Tree, Transformer
from .lexer import Lexer
from .grammar_analysis import GrammarAnalyzer, is_terminal
from . import parser, earley

class LarkOptions(object):
    """Specifies the options for Lark

    """
    OPTIONS_DOC = """
        parser - Which parser engine to use ("earley" or "lalr". Default: "earley")
                 Note: Both will use Lark's lexer.
        transformer - Applies the transformer to every parse tree
        debug - Affects verbosity (default: False)
        only_lex - Don't build a parser. Useful for debugging (default: False)
        keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
        cache_grammar - Cache the Lark grammar (Default: False)
        ignore_postproc - Don't call the post-processing function (default: False)
    """
    __doc__ += OPTIONS_DOC

    def __init__(self, options_dict):
        o = dict(options_dict)

        self.debug = bool(o.pop('debug', False))
        self.only_lex = bool(o.pop('only_lex', False))
        self.keep_all_tokens = bool(o.pop('keep_all_tokens', False))
        self.keep_empty_trees = bool(o.pop('keep_empty_trees', True))
        self.tree_class = o.pop('tree_class', Tree)
        self.cache_grammar = o.pop('cache_grammar', False)
        self.ignore_postproc = bool(o.pop('ignore_postproc', False))
        self.parser = o.pop('parser', 'earley')
        self.transformer = o.pop('transformer', None)

        if o:
            raise ValueError("Unknown options: %s" % o.keys())

class Callback(object):
    pass


class RuleTreeToText(Transformer):
    def expansions(self, *x):
        return x
    def expansion(self, *symbols):
        return [sym.value for sym in symbols], None
    def alias(self, (expansion, _alias), alias):
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value


def create_rule_handler(expansion, usermethod):
    # Drop "punctuation" tokens (terminals named _FOO) from the match, and splice
    # in the children of rules marked for expansion (rules named _foo)
    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
                  if not (is_terminal(sym) and sym.startswith('_'))]

    def _build_ast(match):
        children = []
        for i, to_expand in to_include:
            if to_expand:
                children += match[i].children
            else:
                children.append(match[i])
        return usermethod(children)
    return _build_ast

def create_expand1_tree_builder_function(tree_builder):
    def f(children):
        if len(children) == 1:
            return children[0]
        else:
            return tree_builder(children)
    return f

def create_rule_inline(f):
    def _f(children):
        return f(*children)
    return _f
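
# A small illustration of how these helpers compose (hypothetical values; in the real
# pipeline the expansion comes from the loaded grammar and '__PLUS' is the auto-generated
# name of an anonymous "+" token). create_rule_handler filters the match before calling
# the user method:
#
#     handler = create_rule_handler(['sum', '__PLUS', 'product'], lambda children: children)
#     handler(['a', plus_token, 'b'])    # -> ['a', 'b']
#     # '__PLUS' is a terminal starting with '_', so it is dropped from the children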

class LALR:
    def build_parser(self, rules, callback):
        ga = GrammarAnalyzer(rules)
        ga.analyze()
        return parser.Parser(ga, callback)

class Earley:
    @staticmethod
    def _process_expansion(x):
        return [{'literal': s} if is_terminal(s) else s for s in x]

    def build_parser(self, rules, callback):
        rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules]
        return EarleyParser(earley.Parser(rules, 'start'))

class EarleyParser:
    def __init__(self, parser):
        self.parser = parser

    def parse(self, text):
        res = self.parser.parse(text)
        assert len(res) == 1, 'Ambiguous parse! Not handled yet'
        return res[0]

class Lark:
    def __init__(self, grammar, **options):
        """
            grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
            options : a dictionary controlling various aspects of Lark.
        """
        self.options = LarkOptions(options)

        # Some, but not all file-like objects have a 'name' attribute
        try:
            source = grammar.name
        except AttributeError:
            source = '<string>'
            cache_file = "larkcache_%s" % str(hash(grammar)%(2**32))
        else:
            cache_file = "larkcache_%s" % os.path.basename(source)

        # Drain file-like objects to get their contents
        try:
            read = grammar.read
        except AttributeError:
            pass
        else:
            grammar = read()

        assert isinstance(grammar, STRING_TYPE)

        if self.options.cache_grammar:
            raise NotImplementedError("Not available yet")

        self.tokens, self.rules = load_grammar(grammar)

        self.lexer = self._build_lexer()

        if not self.options.only_lex:
            self.parser_engine = {
                'lalr': LALR,
                'earley': Earley,
            }[self.options.parser]()
            self.parser = self._build_parser()
    def _build_lexer(self):
        ignore_tokens = []
        tokens = {}
        for name, (value, flags) in self.tokens.items():
            if 'ignore' in flags:
                ignore_tokens.append(name)
            tokens[name] = value
        return Lexer(tokens.items(), {}, ignore=ignore_tokens)

    def _build_parser(self):
        transformer = self.options.transformer
        callback = Callback()
        rules = []
        rule_tree_to_text = RuleTreeToText()
        for origin, tree in self.rules.items():
            for expansion, alias in rule_tree_to_text.transform(tree):
                if alias and origin.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)

                expand1 = origin.startswith('?')
                inline_args = origin.startswith('*') or (alias and alias.startswith('*'))
                _origin = origin.lstrip('?*')
                if alias:
                    alias = alias.lstrip('*')
                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))

                assert not hasattr(callback, _alias)

                f = getattr(transformer, alias or _origin, None)
                if f is None:
                    if alias:
                        f = self._create_tree_builder_function(alias)
                    else:
                        f = self._create_tree_builder_function(_origin)
                        if expand1:
                            f = create_expand1_tree_builder_function(f)
                else:
                    if inline_args:
                        f = create_rule_inline(f)

                alias_handler = create_rule_handler(expansion, f)

                setattr(callback, _alias, alias_handler)

                rules.append((_origin, expansion, _alias))

        return self.parser_engine.build_parser(rules, callback)
    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

    def _create_tree_builder_function(self, name):
        tree_class = self.options.tree_class
        def f(children):
            return tree_class(name, children)
        return f

    def lex(self, text):
        return self.lexer.lex(text)

    def parse(self, text):
        assert not self.options.only_lex
        l = list(self.lex(text))
        return self.parser.parse(l)
@@ -0,0 +1,84 @@
## Lexer Implementation

import re

from utils import Str

class LexError(Exception):
    pass

class Token(Str):
    def __new__(cls, type, value, pos_in_stream=None):
        inst = Str.__new__(cls, value)
        inst.type = type
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        return inst
    def __repr__(self):
        return 'Token(%s, %s, %s)' % (self.type, self.value, self.pos_in_stream)

class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags


LIMIT = 50  # Python 2's re allows at most 100 named groups per pattern, so tokens are compiled in chunks
class Lexer(object):
    def __init__(self, tokens, callbacks, ignore=()):
        self.ignore = ignore

        # Sanitization
        token_names = {t[0] for t in tokens}
        for t in tokens:
            try:
                re.compile(t[1])
            except re.error:
                raise LexError("Cannot compile token: %s: %s" % t)
        assert all(t in token_names for t in ignore)

        # Init
        self.tokens = tokens
        self.callbacks = callbacks

        self.tokens.sort(key=lambda x:len(x[1]), reverse=True)

        self.mres = []
        self.name_from_index = []
        x = tokens
        while x:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in x[:LIMIT]))
            self.mres.append(mre)
            self.name_from_index.append( {i:n for n,i in mre.groupindex.items()} )
            x = x[LIMIT:]

    def lex(self, stream):
        lex_pos = 0
        while True:
            for i, mre in enumerate(self.mres):
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = self.name_from_index[i][m.lastindex]
                    t = Token(type_, value, lex_pos)
                    if t.type in self.callbacks:
                        self.callbacks[t.type](t)
                    if t.type not in self.ignore:
                        yield t
                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    context = stream[lex_pos:lex_pos+5]
                    raise LexError("No token defined for: '%s' in %s" % (stream[lex_pos], context))
                break
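
# A minimal standalone usage sketch (illustration only):
#
#     lexer = Lexer([('NUM', r'[0-9]+'), ('PLUS', r'\+'), ('WS', r' +')], {}, ignore=['WS'])
#     for tok in lexer.lex('1 + 22'):
#         print tok
#     # Token(NUM, 1, 0)
#     # Token(PLUS, +, 2)
#     # Token(NUM, 22, 4)
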
@@ -0,0 +1,358 @@
import re

from lexer import Lexer, Token
from grammar_analysis import GrammarAnalyzer
from parser import Parser
from tree import Tree as T, Transformer, Visitor

_TOKEN_NAMES = {
    ':' : 'COLON',
    ',' : 'COMMA',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '|' : 'VBAR',
    '!' : 'BANG',
    '?' : 'QMARK',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '&' : 'AMPERSAND',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '.' : 'DOT',
    '%' : 'PERCENT',
    '`' : 'BACKQUOTE',
    '^' : 'CIRCUMFLEX',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '~' : 'TILDE',
    '@' : 'AT',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
}

# Grammar Parser
TOKENS = {
    'LPAR': r'\(',
    'RPAR': r'\)',
    'LBRA': r'\[',
    'RBRA': r'\]',
    'OP': '[+*?]',
    'COLON': ':',
    'OR': r'\|',
    'DOT': r'\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(.|\n)*?[^\\]/",
    'NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'#[^\n]*\n',
    'TO': '->',
}

RULES = [
    ('start', ['list']),
    ('list', ['item']),
    ('list', ['list', 'item']),
    ('item', ['rule']),
    ('item', ['token']),
    ('item', ['NL']),

    ('rule', ['RULE', 'COLON', 'expansions', 'NL']),
    ('expansions', ['expansion']),
    ('expansions', ['expansions', 'OR', 'expansion']),
    ('expansions', ['expansions', 'NL', 'OR', 'expansion']),

    ('expansion', ['_expansion']),
    ('expansion', ['_expansion', 'TO', 'RULE']),

    ('_expansion', ['expr']),
    ('_expansion', ['_expansion', 'expr']),

    ('expr', ['atom']),
    ('expr', ['atom', 'OP']),

    ('atom', ['LPAR', 'expansions', 'RPAR']),
    ('atom', ['maybe']),
    ('atom', ['RULE']),
    ('atom', ['TOKEN']),
    ('atom', ['anontoken']),

    ('anontoken', ['tokenvalue']),

    ('maybe', ['LBRA', 'expansions', 'RBRA']),

    ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']),
    ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']),

    ('tokenvalue', ['REGEXP']),
    ('tokenvalue', ['STRING']),

    ('tokenmods', ['DOT', 'RULE']),
    ('tokenmods', ['tokenmods', 'DOT', 'RULE']),
]

class SaveDefinitions(object):
    def __init__(self):
        self.rules = {}
        self.tokens = {}
        self.i = 0

    def atom__3(self, _1, value, _2):
        return value
    def atom__1(self, value):
        return value

    def expr__1(self, expr):
        return expr
    def expr(self, *x):
        return T('expr', x)

    def expansion__1(self, expansion):
        return expansion
    def expansion__3(self, expansion, _, alias):
        return T('alias', [expansion, alias])
    def _expansion(self, *x):
        return T('expansion', x)

    def expansions(self, *x):
        items = [i for i in x if isinstance(i, T)]
        return T('expansions', items)

    def maybe(self, _1, expr, _2):
        return T('expr', [expr, Token('OP', '?', -1)])

    def rule(self, name, _1, expansion, _2):
        name = name.value
        if name in self.rules:
            raise ValueError("Rule '%s' defined more than once" % name)

        self.rules[name] = expansion

    def token(self, *x):
        name = x[0].value
        if name in self.tokens:
            raise ValueError("Token '%s' defined more than once" % name)

        if len(x) == 4:
            self.tokens[name] = x[2][1], []
        else:
            self.tokens[name] = x[3][1], x[1].children

    def tokenvalue(self, tokenvalue):
        value = tokenvalue.value[1:-1]
        if tokenvalue.type == 'STRING':
            value = re.escape(value)
        return tokenvalue, value
    def anontoken(self, (token, value)):
        if token.type == 'STRING':
            try:
                token_name = _TOKEN_NAMES[token.value[1:-1]]
            except KeyError:
                if value.isalnum() and value[0].isalpha():
                    token_name = value.upper()
                else:
                    token_name = 'ANONSTR_%d' % self.i
                    self.i += 1
            token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, token

        if token_name not in self.tokens:
            self.tokens[token_name] = value, []

        return Token('TOKEN', token_name, -1)

    def tokenmods__2(self, _, rule):
        return T('tokenmods', [rule.value])
    def tokenmods__3(self, tokenmods, _, rule):
        return T('tokenmods', tokenmods.children + [rule.value])

    def start(self, *x): pass
    def list(self, *x): pass
    def item(self, *x): pass

class EBNF_to_BNF(Transformer):
    def __init__(self):
        self.new_rules = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op

class SimplifyRule_Visitor(Visitor):
    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #   -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i==j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten

def dict_update_safe(d1, d2):
    for k, v in d2.iteritems():
        assert k not in d1
        d1[k] = v


def generate_aliases():
    sd = SaveDefinitions()
    for name, expansion in RULES:
        try:
            f = getattr(sd, "%s__%s" % (name, len(expansion)))
        except AttributeError:
            f = getattr(sd, name)
        yield name, expansion, f.__name__

def inline_args(f):
    def _f(self, args):
        return f(*args)
    return _f


class GrammarLoader:
    def __init__(self):
        self.rules = list(generate_aliases())
        self.ga = GrammarAnalyzer(self.rules)
        self.ga.analyze()
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
        self.simplify_rule = SimplifyRule_Visitor()

    def _generate_parser_callbacks(self, callbacks):
        d = {alias: inline_args(getattr(callbacks, alias))
             for _n, _x, alias in self.rules}
        return type('Callback', (), d)()

    def load_grammar(self, grammar_text):
        sd = SaveDefinitions()
        c = self._generate_parser_callbacks(sd)

        p = Parser(self.ga, c)
        p.parse( list(self.lexer.lex(grammar_text+"\n")) )

        ebnf_to_bnf = EBNF_to_BNF()
        rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        return sd.tokens, rules

load_grammar = GrammarLoader().load_grammar

def test():
    g = r"""
        start: add

        # Rules
        add: mul
           | add _add_sym mul

        mul: _atom
           | mul _add_mul _atom

        neg: "-" _atom

        _atom: neg
             | number
             | "(" add ")"

        # Tokens
        number: /[\d.]+/
        _add_sym: "+" | "-"
        _add_mul: "*" | "/"

        WS.ignore: /\s+/
    """

    g2 = r"""
        start: a
        a: "a" (b*|(c d)+) "b"?
        b: "b"
        c: "c"
        d: "+" | "-"
    """

    load_grammar(g)
@@ -0,0 +1,61 @@
from grammar_analysis import ACTION_SHIFT

class ParseError(Exception):
    pass

class Parser(object):
    def __init__(self, ga, callback, temp=False):
        self.ga = ga
        self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
                          for rule in ga.rules}

    def parse(self, seq):
        states_idx = self.ga.states_idx

        stack = [(None, self.ga.init_state_idx)]
        i = 0
        res = None

        def get_action(key):
            state = stack[-1][1]
            try:
                return states_idx[state][key]
            except KeyError:
                expected = states_idx[state].keys()
                context = ' '.join(['%s(%r)' % (t.type, t.value) for t in seq[i:i+5]])
                raise ParseError("Unexpected input %r.\nExpected: %s\nContext: %s" % (key, expected, context))

        def reduce(rule):
            s = stack[-len(rule.expansion):]
            del stack[-len(rule.expansion):]

            res = self.callbacks[rule]([x[0] for x in s])

            if rule.origin == 'start':
                return res

            _action, new_state = get_action(rule.origin)
            assert _action == ACTION_SHIFT
            stack.append((res, new_state))

        # Main LALR-parser loop
        while i < len(seq):
            action, arg = get_action(seq[i].type)

            if action == ACTION_SHIFT:
                stack.append((seq[i], arg))
                i += 1
            else:
                reduce(arg)

        while len(stack) > 1:
            _action, rule = get_action('$end')
            assert _action == 'reduce'
            res = reduce(rule)
            if res:
                break

        assert stack == [(None, self.ga.init_state_idx)], len(stack)
        return res
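
# A minimal usage sketch (illustration only), wiring this parser to GrammarAnalyzer.
# The callback object needs one method per rule, looked up by alias or origin, each
# taking the list of matched values:
#
#     from grammar_analysis import GrammarAnalyzer
#
#     class Callbacks(object):
#         def start(self, args): return args[0]
#         def sum(self, args): return args
#
#     ga = GrammarAnalyzer([('start', ['sum']), ('sum', ['NUM'])])
#     ga.analyze()
#     # Parser(ga, Callbacks()).parse(tokens) -- tokens need .type
#     # (and .value, for error messages)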
@@ -0,0 +1,83 @@
class Tree(object):
    def __init__(self, data, children):
        self.data = data
        self.children = list(children)

    def __repr__(self):
        return 'Tree(%s, %s)' % (self.data, self.children)

    def _pretty(self, level, indent_str):
        if len(self.children) == 1 and not isinstance(self.children[0], Tree):
            return [ indent_str*level, self.data, '\t', '%s' % self.children[0], '\n']

        l = [ indent_str*level, self.data, '\n' ]
        for n in self.children:
            if isinstance(n, Tree):
                l += n._pretty(level+1, indent_str)
            else:
                l += [ indent_str*(level+1), '%s' % n, '\n' ]
        return l

    def pretty(self, indent_str='  '):
        return ''.join(self._pretty(0, indent_str))

    def expand_kids_by_index(self, *indices):
        for i in sorted(indices, reverse=True):  # reverse so that changing tail won't affect indices
            kid = self.children[i]
            self.children[i:i+1] = kid.children

    # def find_path(self, pred):
    #     if pred(self):
    #         yield []
    #     else:
    #         for i, c in enumerate(self.children):
    #             if isinstance(c, Tree):
    #                 for path in c.find_path(pred):
    #                     yield [i] + path

    # def follow_path(self, path):
    #     x = self
    #     for step in path:
    #         x = x.children[step]
    #     return x

    # def set_at_path(self, path, value):
    #     x = self.follow_path(path[:-1])
    #     x.children[path[-1]] = value

    def clone(self):
        return Tree(self.data, [c.clone() if isinstance(c, Tree) else c for c in self.children])


class Transformer(object):
    def transform(self, tree):
        items = [self.transform(c) if isinstance(c, Tree) else c for c in tree.children]
        try:
            f = getattr(self, tree.data)
        except AttributeError:
            return self.__default__(tree.data, items)
        else:
            return f(*items)

    def __default__(self, data, children):
        return Tree(data, children)


class Visitor(object):
    def visit(self, tree):
        for child in tree.children:
            if isinstance(child, Tree):
                self.visit(child)

        f = getattr(self, tree.data, self.__default__)
        f(tree)
        return tree

    def __default__(self, tree):
        pass
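
# A minimal usage sketch (illustration only): Transformer dispatches on tree.data
# and rebuilds bottom-up, so a tree of additions evaluates to a number:
#
#     class EvalAdd(Transformer):
#         def add(self, a, b):
#             return a + b
#
#     EvalAdd().transform(Tree('add', [1, Tree('add', [2, 3])]))   # -> 6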
@@ -0,0 +1,51 @@
from collections import deque

class fzset(frozenset):
    def __repr__(self):
        return '{%s}' % ', '.join(map(repr, self))


def classify_bool(seq, pred):
    true_elems = []
    false_elems = []

    for elem in seq:
        if pred(elem):
            true_elems.append(elem)
        else:
            false_elems.append(elem)

    return true_elems, false_elems

def classify(seq, key=None):
    d = {}
    for item in seq:
        k = key(item) if (key is not None) else item
        if k in d:
            d[k].append(item)
        else:
            d[k] = [item]
    return d

def bfs(initial, expand):
    open_q = deque(list(initial))
    visited = set(open_q)
    while open_q:
        node = open_q.popleft()
        yield node
        for next_node in expand(node):
            if next_node not in visited:
                visited.add(next_node)
                open_q.append(next_node)


try:
    STRING_TYPE = basestring
except NameError:   # Python 3
    STRING_TYPE = str

Str = type(u'')
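
# Quick illustrations (not part of the module):
#
#     classify('abca')                                  # -> {'a': ['a', 'a'], 'b': ['b'], 'c': ['c']}
#     classify_bool([1, 2, 3], lambda n: n % 2)         # -> ([1, 3], [2])
#     list(bfs([1], lambda n: [n*2] if n < 8 else []))  # -> [1, 2, 4, 8]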