Browse Source

Lark big first commit. Examples working.

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan 7 years ago
parent
commit
73178d6ae0
12 changed files with 1330 additions and 0 deletions
  1. +0   -0    lark/__init__.py
  2. +148 -0    lark/earley.py
  3. +0   -0    lark/examples/__init__.py
  4. +59  -0    lark/examples/calc.py
  5. +62  -0    lark/examples/json_example.py
  6. +207 -0    lark/grammar_analysis.py
  7. +217 -0    lark/lark.py
  8. +84  -0    lark/lexer.py
  9. +358 -0    lark/load_grammar.py
  10. +61 -0    lark/parser.py
  11. +83 -0    lark/tree.py
  12. +51 -0    lark/utils.py

+ 0
- 0
lark/__init__.py View File


+ 148
- 0
lark/earley.py View File

@@ -0,0 +1,148 @@
"My name is Earley"

from .utils import classify

class MatchFailed(object):
pass

class AbortParseMatch(Exception):
pass

class Rule(object):
def __init__(self, name, symbols, postprocess):
self.name = name
self.symbols = symbols
self.postprocess = postprocess

class State(object):
def __init__(self, rule, expect, reference, data=None):
self.rule = rule
self.expect = expect
self.reference = reference
self.data = data or []

self.is_complete = (self.expect == len(self.rule.symbols))
if not self.is_complete:
self.expect_symbol = self.rule.symbols[self.expect]
self.is_literal = isinstance(self.expect_symbol, dict)
if self.is_literal:
self.expect_symbol = self.expect_symbol['literal']
assert isinstance(self.expect_symbol, (str, unicode)), self.expect_symbol

def next_state(self, data):
return State(self.rule, self.expect+1, self.reference, self.data + [data])

def consume_terminal(self, inp):
if not self.is_complete and self.is_literal:
# PORT: originally tests regexp

if self.expect_symbol == inp.type:
return self.next_state(inp)

def consume_nonterminal(self, inp):
if not self.is_complete and not self.is_literal:

if self.expect_symbol == inp:
return self.next_state(inp)

def process(self, location, ind, table, rules, added_rules):
if self.is_complete:
# Completed a rule
if self.rule.postprocess:
try:
# self.data = self.rule.postprocess(self.data, self.reference)
# import pdb
# pdb.set_trace()
self.data = self.rule.postprocess(self.data)
except AbortParseMatch:
self.data = MatchFailed

if self.data is not MatchFailed:
for s in table[self.reference]:
x = s.consume_nonterminal(self.rule.name)
if x:
x.data[-1] = self.data
x.epsilon_closure(location, ind, table)

else:
exp = self.rule.symbols[self.expect]
if isinstance(exp, dict):
return

for r in rules[exp]:
assert r.name == exp
if r not in added_rules:
if r.symbols:
added_rules.add(r)
State(r, 0, location).epsilon_closure(location, ind, table)
else:
# Empty rule
new_copy = self.consume_nonterminal(r.name)
if r.postprocess:
new_copy.data[-1] = r.postprocess([])
# new_copy.data[-1] = r.postprocess([], self.reference)
else:
new_copy.data[-1] = []

new_copy.epsilon_closure(location, ind, table)

def epsilon_closure(self, location, ind, table, result=None):
col = table[location]
if not result:
result = col

result.append(self)

if not self.is_complete:
for i in xrange(ind):
state = col[i]
if state.is_complete and state.reference == location:
x = self.consume_nonterminal(state.rule.name)
if x:
x.data[-1] = state.data
x.epsilon_closure(location, ind, table)


class Parser(object):
def __init__(self, rules, start=None):
self.table = [[]]
self.rules = [Rule(r['name'], r['symbols'], r.get('postprocess', None)) for r in rules]
self.rules_by_name = classify(self.rules, lambda r: r.name)
self.start = start or self.rules[0].name
initial_rules = set(self.rules_by_name[self.start])
self.table[0] += [State(r, 0, 0) for r in initial_rules]
self.advance_to(0, initial_rules)
self.current = 0

def advance_to(self, n, added_rules):
for w, s in enumerate(self.table[n]):
s.process(n, w, self.table, self.rules_by_name, added_rules)

def parse(self, chunk):
chunk_pos = 0
for chunk_pos, chunk_item in enumerate(chunk):
self.table.append([])

for s in self.table[self.current + chunk_pos]:
x = s.consume_terminal(chunk_item)
if x:
self.table[self.current + chunk_pos + 1].append(x)


added_rules = set()
self.advance_to(self.current + chunk_pos + 1, added_rules)

if not self.table[-1]:
raise Exception('Error at line {t.line}:{t.column}'.format(t=chunk[chunk_pos]))

self.current += chunk_pos
return list(self.finish())

def finish(self):
for t in self.table[-1]:
if (t.rule.name == self.start
and t.expect == len(t.rule.symbols)
and t.reference == 0
and t.data != MatchFailed):
yield t.data
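
For orientation, here is a minimal, hypothetical driver for this module (not part of the commit; Python 2, matching the code above). Terminals are written as {'literal': TOKEN_TYPE} dicts and matched against each input token's .type attribute, while nonterminals are plain strings:

# Hypothetical usage sketch -- names here are illustrative, not from the commit.
from collections import namedtuple
from lark.earley import Parser

Tok = namedtuple('Tok', 'type value line column')   # .line/.column only used on error

rules = [
    {'name': 'start', 'symbols': ['ab'], 'postprocess': lambda d: d[0]},
    {'name': 'ab', 'symbols': [{'literal': 'A'}, {'literal': 'B'}]},
]
p = Parser(rules, 'start')
print p.parse([Tok('A', 'a', 1, 1), Tok('B', 'b', 1, 2)])   # one result per valid derivation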


+ 0
- 0
lark/examples/__init__.py View File


+ 59
- 0
lark/examples/calc.py View File

@@ -0,0 +1,59 @@
from lark.tree import Transformer
from lark.lark import Lark

calc_grammar = """
    ?start: sum
          | NAME "=" sum      -> *assign_var

    ?sum: product
        | sum "+" product     -> *add
        | sum "-" product     -> *sub

    ?product: atom
        | product "*" atom    -> *mul
        | product "/" atom    -> *div

    ?atom: /[\d.]+/           -> *number
         | "-" atom           -> *neg
         | NAME               -> *var
         | "(" sum ")"

    NAME: /\w+/
    WS.ignore: /\s+/
"""

class CalculateTree(Transformer):
    from operator import add, sub, mul, div, neg
    number = float

    def __init__(self):
        self.vars = {}

    def assign_var(self, name, value):
        self.vars[name] = value
        return value

    def var(self, name):
        return self.vars[name]


calc_parser = Lark(calc_grammar, parser='lalr', transformer=CalculateTree())
calc = calc_parser.parse

def main():
    while True:
        try:
            s = raw_input('> ')
        except EOFError:
            break
        print(calc(s))

def test():
    print calc("a = 1+2")
    print calc("1+a*-3")

if __name__ == '__main__':
    test()
    # main()
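
As a side note, the same grammar can be loaded without a transformer, in which case parse() returns a Tree rather than a computed value. A hypothetical snippet (not part of the example file):

# Hypothetical: build the parse tree instead of evaluating.
tree_parser = Lark(calc_grammar, parser='lalr')
print tree_parser.parse("1+2*3").pretty()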


+ 62
- 0
lark/examples/json_example.py View File

@@ -0,0 +1,62 @@
import sys
from lark.lark import Lark
from lark.tree import Transformer

json_grammar = r"""
    ?start: value

    ?value: object
          | array
          | string
          | number
          | "true"    -> *true
          | "false"   -> *false
          | "null"    -> *null

    array : "[" [value ("," value)*] "]"
    object : "{" [pair ("," pair)*] "}"
    pair : string ":" value

    *number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
    *string : /".*?(?<!\\)"/

    WS.ignore.newline: /[ \t\n]+/
"""

class TreeToJson(Transformer):
    def string(self, s):
        return s[1:-1]

    array = list
    pair = tuple
    object = dict
    number = float

    null = lambda self: None
    true = lambda self: True
    false = lambda self: False

json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())

def test():
    test_json = '''
        {
            "empty_object" : {},
            "empty_array"  : [],
            "booleans"     : { "YES" : true, "NO" : false },
            "numbers"      : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
            "strings"      : [ "This", [ "And" , "That" ] ],
            "nothing"      : null
        }
    '''

    j = json_parser.parse(test_json)
    print j
    import json
    assert j == json.loads(test_json)

if __name__ == '__main__':
    test()
    with open(sys.argv[1]) as f:
        print json_parser.parse(f.read())


+ 207
- 0
lark/grammar_analysis.py View File

@@ -0,0 +1,207 @@
from collections import defaultdict, deque
from utils import classify, classify_bool, bfs, fzset

ACTION_SHIFT = 0

class GrammarError(Exception):
    pass

def is_terminal(sym):
    return sym.isupper() or sym[0] == '$'

class Rule(object):
    """
    origin : a symbol
    expansion : a list of symbols
    """
    def __init__(self, origin, expansion, alias=None):
        assert expansion, "No support for empty rules"
        self.origin = origin
        self.expansion = expansion
        self.alias = alias

    def __repr__(self):
        return '<%s : %s>' % (self.origin, ' '.join(self.expansion))

class RulePtr(object):
    def __init__(self, rule, index):
        assert isinstance(rule, Rule)
        assert index <= len(rule.expansion)
        self.rule = rule
        self.index = index

    def __repr__(self):
        before = self.rule.expansion[:self.index]
        after = self.rule.expansion[self.index:]
        return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after))

    @property
    def next(self):
        return self.rule.expansion[self.index]

    def advance(self, sym):
        assert self.next == sym
        return RulePtr(self.rule, self.index+1)

    @property
    def is_satisfied(self):
        return self.index == len(self.rule.expansion)

    def __eq__(self, other):
        return self.rule == other.rule and self.index == other.index

    def __hash__(self):
        return hash((self.rule, self.index))


def pairs(lst):
    return zip(lst[:-1], lst[1:])

def update_set(set1, set2):
    copy = set(set1)
    set1 |= set2
    return set1 != copy

class GrammarAnalyzer(object):
    def __init__(self, rule_tuples):
        rule_tuples = list(rule_tuples)
        rule_tuples.append(('$root', ['start', '$end']))
        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]

        self.rules = set()
        self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
        for origin, exp, alias in rule_tuples:
            r = Rule(origin, exp, alias)
            self.rules.add(r)
            self.rules_by_origin[origin].append(r)

        for r in self.rules:
            for sym in r.expansion:
                if not (is_terminal(sym) or sym in self.rules_by_origin):
                    raise GrammarError("Using an undefined rule: %s" % sym)

        self.init_state = self.expand_rule('start')

    def expand_rule(self, rule):
        "Returns all init_ptrs accessible by rule (recursive)"
        init_ptrs = set()
        def _expand_rule(rule):
            assert not is_terminal(rule)

            for r in self.rules_by_origin[rule]:
                init_ptr = RulePtr(r, 0)
                init_ptrs.add(init_ptr)

                new_r = init_ptr.next
                if not is_terminal(new_r):
                    yield new_r

        _ = list(bfs([rule], _expand_rule))

        return fzset(init_ptrs)

    def _first(self, r):
        if is_terminal(r):
            return {r}
        else:
            return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)}

    def _calc(self):
        """Calculate FOLLOW sets.

        Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
        symbols = {sym for rule in self.rules for sym in rule.expansion}
        symbols.add('$root')    # what about other unused rules?

        # foreach grammar rule X ::= Y(1) ... Y(k)
        #     if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
        #         NULLABLE = NULLABLE union {X}
        #     for i = 1 to k
        #         if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
        #             FIRST(X) = FIRST(X) union FIRST(Y(i))
        #         for j = i+1 to k
        #             if i=k or {Y(i+1),...Y(k)} subset of NULLABLE then
        #                 FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
        #             if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
        #                 FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
        # until none of NULLABLE,FIRST,FOLLOW changed in last iteration

        NULLABLE = set()
        FIRST = {}
        FOLLOW = {}
        for sym in symbols:
            FIRST[sym] = {sym} if is_terminal(sym) else set()
            FOLLOW[sym] = set()

        changed = True
        while changed:
            changed = False

            for rule in self.rules:
                if set(rule.expansion) <= NULLABLE:
                    if update_set(NULLABLE, {rule.origin}):
                        changed = True

                for i, sym in enumerate(rule.expansion):
                    if set(rule.expansion[:i]) <= NULLABLE:
                        if update_set(FIRST[rule.origin], FIRST[sym]):
                            changed = True
                    if i==len(rule.expansion)-1 or set(rule.expansion[i:]) <= NULLABLE:
                        if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
                            changed = True

                    for j in range(i+1, len(rule.expansion)):
                        if set(rule.expansion[i+1:j]) <= NULLABLE:
                            if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
                                changed = True

        self.FOLLOW = FOLLOW

    def analyze(self):
        self._calc()

        self.states = {}
        def step(state):
            lookahead = defaultdict(list)
            sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
            for rp in sat:
                for term in self.FOLLOW.get(rp.rule.origin, ()):
                    lookahead[term].append(('reduce', rp.rule))

            d = classify(unsat, lambda rp: rp.next)
            for sym, rps in d.items():
                rps = {rp.advance(sym) for rp in rps}

                for rp in set(rps):
                    if not rp.is_satisfied and not is_terminal(rp.next):
                        rps |= self.expand_rule(rp.next)

                lookahead[sym].append(('shift', fzset(rps)))
                yield fzset(rps)

            for k, v in lookahead.items():
                if len(v) > 1:
                    for x in v:
                        # XXX resolving shift/reduce into shift, like PLY
                        # Give a proper warning
                        if x[0] == 'shift':
                            lookahead[k] = [x]

            for k, v in lookahead.items():
                assert len(v) == 1, ("Collision", k, v)

            self.states[state] = {k: v[0] for k, v in lookahead.items()}

        x = list(bfs([self.init_state], step))

        # --
        self.enum = list(self.states)
        self.enum_rev = {s: i for i, s in enumerate(self.enum)}
        self.states_idx = {}

        for s, la in self.states.items():
            la = {k: (ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift' else v
                  for k, v in la.items()}
            self.states_idx[self.enum_rev[s]] = la

        self.init_state_idx = self.enum_rev[self.init_state]
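
A small, hypothetical probe of this module (not in the commit) shows the computed FOLLOW sets and the indexed LALR tables. Rule tuples are (origin, expansion) or (origin, expansion, alias), and uppercase symbols are terminals per is_terminal above:

# Hypothetical sketch; the grammar here is illustrative.
from lark.grammar_analysis import GrammarAnalyzer

ga = GrammarAnalyzer([('start', ['list']),
                      ('list', ['list', 'ITEM']),
                      ('list', ['ITEM'])])
ga.analyze()
print ga.FOLLOW['list']                          # set(['ITEM', '$end'])
print ga.states_idx[ga.init_state_idx].keys()    # lookaheads of the initial state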


+ 217
- 0
lark/lark.py View File

@@ -0,0 +1,217 @@
from __future__ import absolute_import

import os

from .utils import STRING_TYPE
from .load_grammar import load_grammar
from .tree import Tree, Transformer

from .lexer import Lexer
from .grammar_analysis import GrammarAnalyzer, is_terminal
from . import parser, earley

class LarkOptions(object):
    """Specifies the options for Lark"""

    OPTIONS_DOC = """
        parser - Which parser engine to use ("earley" or "lalr". Default: "earley")
                 Note: Both will use Lark's lexer.
        transformer - Applies the transformer to every parse tree
        debug - Affects verbosity (default: False)
        only_lex - Don't build a parser. Useful for debugging (default: False)
        keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
        cache_grammar - Cache the Lark grammar (Default: False)
        ignore_postproc - Don't call the post-processing function (default: False)
    """
    __doc__ += OPTIONS_DOC

    def __init__(self, options_dict):
        o = dict(options_dict)

        self.debug = bool(o.pop('debug', False))
        self.only_lex = bool(o.pop('only_lex', False))
        self.keep_all_tokens = bool(o.pop('keep_all_tokens', False))
        self.keep_empty_trees = bool(o.pop('keep_empty_trees', True))
        self.tree_class = o.pop('tree_class', Tree)
        self.cache_grammar = o.pop('cache_grammar', False)
        self.ignore_postproc = bool(o.pop('ignore_postproc', False))
        self.parser = o.pop('parser', 'earley')
        self.transformer = o.pop('transformer', None)

        if o:
            raise ValueError("Unknown options: %s" % o.keys())


class Callback(object):
    pass


class RuleTreeToText(Transformer):
    def expansions(self, *x):
        return x

    def expansion(self, *symbols):
        return [sym.value for sym in symbols], None

    def alias(self, (expansion, _alias), alias):    # Python 2 tuple parameter
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value


def create_rule_handler(expansion, usermethod):
    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
                  if not (is_terminal(sym) and sym.startswith('_'))]

    def _build_ast(match):
        children = []
        for i, to_expand in to_include:
            if to_expand:
                children += match[i].children
            else:
                children.append(match[i])

        return usermethod(children)
    return _build_ast

def create_expand1_tree_builder_function(tree_builder):
    def f(children):
        if len(children) == 1:
            return children[0]
        else:
            return tree_builder(children)
    return f

def create_rule_inline(f):
    def _f(children):
        return f(*children)
    return _f


class LALR:
    def build_parser(self, rules, callback):
        ga = GrammarAnalyzer(rules)
        ga.analyze()
        return parser.Parser(ga, callback)

class Earley:
    @staticmethod
    def _process_expansion(x):
        return [{'literal': s} if is_terminal(s) else s for s in x]

    def build_parser(self, rules, callback):
        rules = [{'name': n,
                  'symbols': self._process_expansion(x),
                  'postprocess': getattr(callback, a)}
                 for n, x, a in rules]
        return EarleyParser(earley.Parser(rules, 'start'))

class EarleyParser:
    def __init__(self, parser):
        self.parser = parser

    def parse(self, text):
        res = self.parser.parse(text)
        assert len(res) == 1, 'Ambiguous parse! Not handled yet'
        return res[0]


class Lark:
    def __init__(self, grammar, **options):
        """
        grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
        options : a dictionary controlling various aspects of Lark.
        """
        self.options = LarkOptions(options)

        # Some, but not all file-like objects have a 'name' attribute
        try:
            source = grammar.name
        except AttributeError:
            source = '<string>'
            cache_file = "larkcache_%s" % str(hash(grammar)%(2**32))
        else:
            cache_file = "larkcache_%s" % os.path.basename(source)

        # Drain file-like objects to get their contents
        try:
            read = grammar.read
        except AttributeError:
            pass
        else:
            grammar = read()

        assert isinstance(grammar, STRING_TYPE)

        if self.options.cache_grammar:
            raise NotImplementedError("Not available yet")

        self.tokens, self.rules = load_grammar(grammar)

        self.lexer = self._build_lexer()
        if not self.options.only_lex:
            self.parser_engine = {
                'lalr': LALR,
                'earley': Earley,
            }[self.options.parser]()
            self.parser = self._build_parser()

    def _build_lexer(self):
        ignore_tokens = []
        tokens = {}
        for name, (value, flags) in self.tokens.items():
            if 'ignore' in flags:
                ignore_tokens.append(name)
            tokens[name] = value
        return Lexer(tokens.items(), {}, ignore=ignore_tokens)

    def _build_parser(self):
        transformer = self.options.transformer
        callback = Callback()
        rules = []
        rule_tree_to_text = RuleTreeToText()
        for origin, tree in self.rules.items():
            for expansion, alias in rule_tree_to_text.transform(tree):
                if alias and origin.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)

                expand1 = origin.startswith('?')
                inline_args = origin.startswith('*') or (alias and alias.startswith('*'))
                _origin = origin.lstrip('?*')
                if alias:
                    alias = alias.lstrip('*')
                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))

                assert not hasattr(callback, _alias)
                f = getattr(transformer, alias or _origin, None)
                if f is None:
                    if alias:
                        f = self._create_tree_builder_function(alias)
                    else:
                        f = self._create_tree_builder_function(_origin)
                        if expand1:
                            f = create_expand1_tree_builder_function(f)
                else:
                    if inline_args:
                        f = create_rule_inline(f)

                alias_handler = create_rule_handler(expansion, f)

                setattr(callback, _alias, alias_handler)

                rules.append((_origin, expansion, _alias))

        return self.parser_engine.build_parser(rules, callback)

    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

    def _create_tree_builder_function(self, name):
        tree_class = self.options.tree_class
        def f(children):
            return tree_class(name, children)
        return f

    def lex(self, text):
        return self.lexer.lex(text)

    def parse(self, text):
        assert not self.options.only_lex
        l = list(self.lex(text))
        return self.parser.parse(l)
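
A quick, hypothetical check of the only_lex option (not part of the commit), useful for inspecting a grammar's tokens before any parser is built:

# Hypothetical sketch: lex only; no parser is constructed.
from lark.lark import Lark

lexer_only = Lark('''
start: WORD+
WORD: /\w+/
WS.ignore: /\s+/
''', only_lex=True)

for tok in lexer_only.lex('hello world'):
    print tok.type, tok.value    # WORD hello, then WORD world (WS is ignored)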


+ 84
- 0
lark/lexer.py View File

@@ -0,0 +1,84 @@
## Lexer Implementation

import re

from utils import Str

class LexError(Exception):
    pass

class Token(Str):
    def __new__(cls, type, value, pos_in_stream=None):
        inst = Str.__new__(cls, value)
        inst.type = type
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        return inst

    def __repr__(self):
        return 'Token(%s, %s, %s)' % (self.type, self.value, self.pos_in_stream)

class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
        self.flags = flags


LIMIT = 50  # Stupid named-groups limit in Python's re module

class Lexer(object):
    def __init__(self, tokens, callbacks, ignore=()):
        self.ignore = ignore

        # Sanitization
        token_names = {t[0] for t in tokens}
        for t in tokens:
            try:
                re.compile(t[1])
            except re.error:
                raise LexError("Cannot compile token: %s: %s" % t)
        assert all(t in token_names for t in ignore)

        # Init
        self.tokens = tokens
        self.callbacks = callbacks

        # Longer patterns first, so the combined regex prefers them
        self.tokens.sort(key=lambda x: len(x[1]), reverse=True)

        self.mres = []
        self.name_from_index = []
        x = tokens
        while x:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % t for t in x[:LIMIT]))
            self.mres.append(mre)
            self.name_from_index.append({i: n for n, i in mre.groupindex.items()})
            x = x[LIMIT:]

    def lex(self, stream):
        lex_pos = 0
        while True:
            i = 0
            for mre in self.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = self.name_from_index[i][m.lastindex]
                    t = Token(type_, value, lex_pos)
                    if t.type in self.callbacks:
                        self.callbacks[t.type](t)
                    if t.type not in self.ignore:
                        yield t
                    lex_pos += len(value)
                    break
                i += 1
            else:
                if lex_pos < len(stream):
                    context = stream[lex_pos:lex_pos+5]
                    raise LexError("No token defined for: '%s' in %s" % (stream[lex_pos], context))
                break
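
Driving the Lexer directly, in a hypothetical snippet (not in the commit): tokens are (name, regexp) pairs, and names listed in ignore are matched but not yielded:

# Hypothetical sketch of standalone lexing.
from lark.lexer import Lexer

lexer = Lexer([('NUMBER', r'\d+'), ('PLUS', r'\+'), ('WS', r' +')], {}, ignore=['WS'])
for tok in lexer.lex('1 + 22'):
    print repr(tok)   # Token(NUMBER, 1, 0), Token(PLUS, +, 2), Token(NUMBER, 22, 4)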



+ 358
- 0
lark/load_grammar.py View File

@@ -0,0 +1,358 @@
import re

from lexer import Lexer, Token
from grammar_analysis import GrammarAnalyzer
from parser import Parser

from tree import Tree as T, Transformer, Visitor

_TOKEN_NAMES = {
    ':' : 'COLON',
    ',' : 'COMMA',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
    '-' : 'MINUS',
    '*' : 'STAR',
    '/' : 'SLASH',
    '|' : 'VBAR',
    '!' : 'BANG',
    '?' : 'QMARK',
    '#' : 'HASH',
    '$' : 'DOLLAR',
    '&' : 'AMPERSAND',
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '.' : 'DOT',
    '%' : 'PERCENT',
    '`' : 'BACKQUOTE',
    '^' : 'CIRCUMFLEX',
    '"' : 'DBLQUOTE',
    '\'' : 'QUOTE',
    '~' : 'TILDE',
    '@' : 'AT',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
    ']' : 'RSQB',
}

# Grammar Parser
TOKENS = {
    'LPAR': '\(',
    'RPAR': '\)',
    'LBRA': '\[',
    'RBRA': '\]',
    'OP': '[+*?]',
    'COLON': ':',
    'OR': '\|',
    'DOT': '\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(.|\n)*?[^\\]/",
    'NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'#[^\n]*\n',
    'TO': '->',
}

RULES = [
    ('start', ['list']),
    ('list', ['item']),
    ('list', ['list', 'item']),
    ('item', ['rule']),
    ('item', ['token']),
    ('item', ['NL']),

    ('rule', ['RULE', 'COLON', 'expansions', 'NL']),
    ('expansions', ['expansion']),
    ('expansions', ['expansions', 'OR', 'expansion']),
    ('expansions', ['expansions', 'NL', 'OR', 'expansion']),

    ('expansion', ['_expansion']),
    ('expansion', ['_expansion', 'TO', 'RULE']),

    ('_expansion', ['expr']),
    ('_expansion', ['_expansion', 'expr']),

    ('expr', ['atom']),
    ('expr', ['atom', 'OP']),

    ('atom', ['LPAR', 'expansions', 'RPAR']),
    ('atom', ['maybe']),

    ('atom', ['RULE']),
    ('atom', ['TOKEN']),
    ('atom', ['anontoken']),

    ('anontoken', ['tokenvalue']),

    ('maybe', ['LBRA', 'expansions', 'RBRA']),

    ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']),
    ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']),
    ('tokenvalue', ['REGEXP']),
    ('tokenvalue', ['STRING']),
    ('tokenmods', ['DOT', 'RULE']),
    ('tokenmods', ['tokenmods', 'DOT', 'RULE']),
]

class SaveDefinitions(object):
    def __init__(self):
        self.rules = {}
        self.tokens = {}
        self.i = 0

    def atom__3(self, _1, value, _2):
        return value

    def atom__1(self, value):
        return value

    def expr__1(self, expr):
        return expr

    def expr(self, *x):
        return T('expr', x)

    def expansion__1(self, expansion):
        return expansion

    def expansion__3(self, expansion, _, alias):
        return T('alias', [expansion, alias])

    def _expansion(self, *x):
        return T('expansion', x)

    def expansions(self, *x):
        items = [i for i in x if isinstance(i, T)]
        return T('expansions', items)

    def maybe(self, _1, expr, _2):
        return T('expr', [expr, Token('OP', '?', -1)])

    def rule(self, name, _1, expansion, _2):
        name = name.value
        if name in self.rules:
            raise ValueError("Rule '%s' defined more than once" % name)

        self.rules[name] = expansion

    def token(self, *x):
        name = x[0].value
        if name in self.tokens:
            raise ValueError("Token '%s' defined more than once" % name)

        if len(x) == 4:
            self.tokens[name] = x[2][1], []
        else:
            self.tokens[name] = x[3][1], x[1].children

    def tokenvalue(self, tokenvalue):
        value = tokenvalue.value[1:-1]
        if tokenvalue.type == 'STRING':
            value = re.escape(value)
        return tokenvalue, value

    def anontoken(self, (token, value)):    # Python 2 tuple parameter
        if token.type == 'STRING':
            try:
                token_name = _TOKEN_NAMES[token.value[1:-1]]
            except KeyError:
                if value.isalnum() and value[0].isalpha():
                    token_name = value.upper()
                else:
                    token_name = 'ANONSTR_%d' % self.i
                    self.i += 1
            token_name = '__' + token_name

        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, token

        if token_name not in self.tokens:
            self.tokens[token_name] = value, []

        return Token('TOKEN', token_name, -1)

    def tokenmods__2(self, _, rule):
        return T('tokenmods', [rule.value])

    def tokenmods__3(self, tokenmods, _, rule):
        return T('tokenmods', tokenmods.children + [rule.value])

    def start(self, *x): pass
    def list(self, *x): pass
    def item(self, *x): pass


class EBNF_to_BNF(Transformer):
    def __init__(self):
        self.new_rules = {}
        self.prefix = 'anon'
        self.i = 0

    def _add_recurse_rule(self, type_, expr):
        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
        self.i += 1
        t = Token('RULE', new_name, -1)
        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
        return t

    def expr(self, rule, op):
        if op.value == '?':
            return T('expansions', [rule, T('expansion', [])])
        elif op.value == '+':
            # a : b c+ d
            #   -->
            # a : b _c d
            # _c : _c c | c;
            return self._add_recurse_rule('plus', rule)
        elif op.value == '*':
            # a : b c* d
            #   -->
            # a : b _c? d
            # _c : _c c | c;
            new_name = self._add_recurse_rule('star', rule)
            return T('expansions', [new_name, T('expansion', [])])
        assert False, op


class SimplifyRule_Visitor(Visitor):

    @staticmethod
    def _flatten(tree):
        while True:
            to_expand = [i for i, child in enumerate(tree.children)
                         if isinstance(child, T) and child.data == tree.data]
            if not to_expand:
                break
            tree.expand_kids_by_index(*to_expand)

    def expansion(self, tree):
        # rules_list unpacking
        # a : b (c|d) e
        #   -->
        # a : b c e | b d e
        #
        # In AST terms:
        # expansion(b, expansions(c, d), e)
        #   -->
        # expansions( expansion(b, c, e), expansion(b, d, e) )

        while True:
            self._flatten(tree)

            for i, child in enumerate(tree.children):
                if isinstance(child, T) and child.data == 'expansions':
                    tree.data = 'expansions'
                    tree.children = [self.visit(T('expansion', [option if i==j else other
                                                                for j, other in enumerate(tree.children)]))
                                     for option in child.children]
                    break
            else:
                break

    def alias(self, tree):
        rule, alias_name = tree.children
        if rule.data == 'expansions':
            aliases = []
            for child in tree.children[0].children:
                aliases.append(T('alias', [child, alias_name]))
            tree.data = 'expansions'
            tree.children = aliases

    expansions = _flatten

def dict_update_safe(d1, d2):
    for k, v in d2.iteritems():
        assert k not in d1
        d1[k] = v


def generate_aliases():
    sd = SaveDefinitions()
    for name, expansion in RULES:
        try:
            f = getattr(sd, "%s__%s" % (name, len(expansion)))
        except AttributeError:
            f = getattr(sd, name)
        yield name, expansion, f.__name__


def inline_args(f):
    def _f(self, args):
        return f(*args)
    return _f


class GrammarLoader:
    def __init__(self):
        self.rules = list(generate_aliases())
        self.ga = GrammarAnalyzer(self.rules)
        self.ga.analyze()
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
        self.simplify_rule = SimplifyRule_Visitor()

    def _generate_parser_callbacks(self, callbacks):
        d = {alias: inline_args(getattr(callbacks, alias))
             for _n, _x, alias in self.rules}
        return type('Callback', (), d)()

    def load_grammar(self, grammar_text):
        sd = SaveDefinitions()
        c = self._generate_parser_callbacks(sd)

        p = Parser(self.ga, c)
        p.parse(list(self.lexer.lex(grammar_text+"\n")))

        ebnf_to_bnf = EBNF_to_BNF()

        rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        return sd.tokens, rules

load_grammar = GrammarLoader().load_grammar
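
To see the EBNF-to-BNF expansion at work, a hypothetical probe (not in the commit); '+' is rewritten into a left-recursive helper rule:

# Hypothetical sketch; key order of the printed dicts may vary.
from lark.load_grammar import load_grammar

tokens, rules = load_grammar("""
start: NUMBER+
NUMBER: /\d+/
WS.ignore: /\s+/
""")
print tokens.keys()     # ['NUMBER', 'WS']
print rules.keys()      # ['start', '__anon_plus_0']
print rules['__anon_plus_0'].pretty()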


def test():
    g = """
        start: add

        # Rules
        add: mul
           | add _add_sym mul

        mul: _atom
           | mul _add_mul _atom

        neg: "-" _atom

        _atom: neg
             | number
             | "(" add ")"

        # Tokens
        number: /[\d.]+/
        _add_sym: "+" | "-"
        _add_mul: "*" | "/"

        WS.ignore: /\s+/
    """

    g2 = """
        start: a
        a: "a" (b*|(c d)+) "b"?
        b: "b"
        c: "c"
        d: "+" | "-"
    """
    load_grammar(g)




+ 61
- 0
lark/parser.py View File

@@ -0,0 +1,61 @@
from grammar_analysis import ACTION_SHIFT

class ParseError(Exception):
    pass

class Parser(object):
    def __init__(self, ga, callback, temp=False):
        self.ga = ga
        self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
                          for rule in ga.rules}

    def parse(self, seq):
        states_idx = self.ga.states_idx

        stack = [(None, self.ga.init_state_idx)]
        i = 0
        res = None

        def get_action(key):
            state = stack[-1][1]
            try:
                return states_idx[state][key]
            except KeyError:
                expected = states_idx[state].keys()
                context = ' '.join(['%s(%r)' % (t.type, t.value) for t in seq[i:i+5]])
                raise ParseError("Unexpected input %r.\nExpected: %s\nContext: %s" % (key, expected, context))

        def reduce(rule):
            s = stack[-len(rule.expansion):]
            del stack[-len(rule.expansion):]

            res = self.callbacks[rule]([x[0] for x in s])

            if rule.origin == 'start':
                return res

            _action, new_state = get_action(rule.origin)
            assert _action == ACTION_SHIFT
            stack.append((res, new_state))

        # Main LALR parser loop
        while i < len(seq):
            action, arg = get_action(seq[i].type)

            if action == ACTION_SHIFT:
                stack.append((seq[i], arg))
                i += 1
            else:
                reduce(arg)

        while len(stack) > 1:
            _action, rule = get_action('$end')
            assert _action == 'reduce'
            res = reduce(rule)
            if res:
                break

        assert stack == [(None, self.ga.init_state_idx)], len(stack)
        return res
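
Wiring GrammarAnalyzer into this Parser by hand, in a hypothetical sketch (normally lark.py does this). The callback object provides one method per rule origin (or alias), each receiving the list of matched children:

# Hypothetical sketch; grammar and callback names are illustrative.
from collections import namedtuple
from lark.grammar_analysis import GrammarAnalyzer
from lark.parser import Parser

Tok = namedtuple('Tok', 'type value')

class Callbacks(object):
    def start(self, args):
        return args[0]
    def pair(self, args):
        return (args[0].value, args[1].value)

ga = GrammarAnalyzer([('start', ['pair']), ('pair', ['NAME', 'NAME'])])
ga.analyze()
print Parser(ga, Callbacks()).parse([Tok('NAME', 'a'), Tok('NAME', 'b')])   # ('a', 'b')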



+ 83
- 0
lark/tree.py View File

@@ -0,0 +1,83 @@

class Tree(object):
    def __init__(self, data, children):
        self.data = data
        self.children = list(children)

    def __repr__(self):
        return 'Tree(%s, %s)' % (self.data, self.children)

    def _pretty(self, level, indent_str):
        if len(self.children) == 1 and not isinstance(self.children[0], Tree):
            return [indent_str*level, self.data, '\t', '%s' % self.children[0], '\n']

        l = [indent_str*level, self.data, '\n']
        for n in self.children:
            if isinstance(n, Tree):
                l += n._pretty(level+1, indent_str)
            else:
                l += [indent_str*(level+1), '%s' % n, '\n']

        return l

    def pretty(self, indent_str=' '):
        return ''.join(self._pretty(0, indent_str))

    def expand_kids_by_index(self, *indices):
        for i in sorted(indices, reverse=True):   # reverse so that changing tail won't affect indices
            kid = self.children[i]
            self.children[i:i+1] = kid.children

    # def find_path(self, pred):
    #     if pred(self):
    #         yield []
    #     else:
    #         for i, c in enumerate(self.children):
    #             if isinstance(c, Tree):
    #                 for path in c.find_path(pred):
    #                     yield [i] + path

    # def follow_path(self, path):
    #     x = self
    #     for step in path:
    #         x = x.children[step]
    #     return x

    # def set_at_path(self, path, value):
    #     x = self.follow_path(path[:-1])
    #     x.children[path[-1]] = value

    def clone(self):
        return Tree(self.data, [c.clone() if isinstance(c, Tree) else c for c in self.children])


class Transformer(object):
    def transform(self, tree):
        items = [self.transform(c) if isinstance(c, Tree) else c for c in tree.children]
        try:
            f = getattr(self, tree.data)
        except AttributeError:
            return self.__default__(tree.data, items)
        else:
            return f(*items)

    def __default__(self, data, children):
        return Tree(data, children)


class Visitor(object):
    def visit(self, tree):
        for child in tree.children:
            if isinstance(child, Tree):
                self.visit(child)

        f = getattr(self, tree.data, self.__default__)
        f(tree)
        return tree

    def __default__(self, tree):
        pass
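
A tiny, hypothetical example of the Transformer contract (not in the commit): children are transformed bottom-up and passed to the matching method as positional arguments:

# Hypothetical sketch: evaluate a small tree bottom-up.
from lark.tree import Tree, Transformer

class Eval(Transformer):
    def add(self, a, b):
        return a + b

print Eval().transform(Tree('add', [1, Tree('add', [2, 3])]))   # 6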



+ 51
- 0
lark/utils.py View File

@@ -0,0 +1,51 @@
from collections import deque

class fzset(frozenset):
    def __repr__(self):
        return '{%s}' % ', '.join(map(repr, self))


def classify_bool(seq, pred):
    true_elems = []
    false_elems = []

    for elem in seq:
        if pred(elem):
            true_elems.append(elem)
        else:
            false_elems.append(elem)

    return true_elems, false_elems

def classify(seq, key=None):
    d = {}
    for item in seq:
        k = key(item) if (key is not None) else item
        if k in d:
            d[k].append(item)
        else:
            d[k] = [item]
    return d

def bfs(initial, expand):
    open_q = deque(list(initial))
    visited = set(open_q)
    while open_q:
        node = open_q.popleft()
        yield node
        for next_node in expand(node):
            if next_node not in visited:
                visited.add(next_node)
                open_q.append(next_node)


try:
    STRING_TYPE = basestring
except NameError:   # Python 3
    STRING_TYPE = str

Str = type(u'')
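
The two helpers the grammar analyzer leans on, probed hypothetically (not in the commit): classify groups a sequence by key, and bfs yields each reachable node exactly once, expanding lazily:

# Hypothetical sketch; dict key order may vary.
from lark.utils import classify, bfs

print classify(['ant', 'bee', 'cow', 'bat'], key=lambda w: w[0])
# {'a': ['ant'], 'c': ['cow'], 'b': ['bee', 'bat']}

graph = {1: [2, 3], 2: [4], 3: [4], 4: []}
print list(bfs([1], lambda n: graph[n]))    # [1, 2, 3, 4]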


