@@ -0,0 +1,64 @@
import sys

from lark.lark import Lark, inline_args
from lark.tree import Transformer

json_grammar = r"""
    ?start: value

    ?value: object
          | array
          | string
          | number
          | "true" -> true
          | "false" -> false
          | "null" -> null

    array : "[" [value ("," value)*] "]"
    object : "{" [pair ("," pair)*] "}"
    pair : string ":" value

    number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
    string : /".*?(?<!\\)"/

    WS.ignore.newline: /[ \t\n]+/
"""

class TreeToJson(Transformer):
    @inline_args
    def string(self, s):
        return s[1:-1]

    array = list
    pair = tuple
    object = dict
    number = inline_args(float)

    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False

json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
parse = json_parser.parse

def test():
    test_json = '''
        {
            "empty_object" : {},
            "empty_array" : [],
            "booleans" : { "YES" : true, "NO" : false },
            "numbers" : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
            "strings" : [ "This", [ "And" , "That" ] ],
            "nothing" : null
        }
    '''

    j = parse(test_json)
    print j

    import json
    assert j == json.loads(test_json)

if __name__ == '__main__':
    test()
    with open(sys.argv[1]) as f:
        print parse(f.read())
@@ -0,0 +1,7 @@
class GrammarError(Exception):
    pass

def is_terminal(sym):
    return sym.isupper() or sym[0] == '$'
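# Usage sketch (illustrative only): is_terminal() encodes the grammar's naming
# convention -- uppercase names and "$"-prefixed internal symbols are terminals,
# lowercase names are rules.
#
#     is_terminal('NUMBER')     # -> True
#     is_terminal('$end')       # -> True
#     is_terminal('expansion')  # -> False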
@@ -1,14 +1,9 @@
from collections import defaultdict, deque

from utils import classify, classify_bool, bfs, fzset
from common import GrammarError, is_terminal

ACTION_SHIFT = 0

class GrammarError(Exception):
    pass

def is_terminal(sym):
    return sym.isupper() or sym[0] == '$'

class Rule(object):
    """
        origin : a symbol
@@ -61,9 +56,10 @@ def update_set(set1, set2):
    return set1 != copy

class GrammarAnalyzer(object):
    def __init__(self, rule_tuples):
    def __init__(self, rule_tuples, start_symbol):
        self.start_symbol = start_symbol
        rule_tuples = list(rule_tuples)
        rule_tuples.append(('$root', ['start', '$end']))
        rule_tuples.append(('$root', [start_symbol, '$end']))
        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]

        self.rules = set()
@@ -78,7 +74,7 @@ class GrammarAnalyzer(object):
            if not (is_terminal(sym) or sym in self.rules_by_origin):
                raise GrammarError("Using an undefined rule: %s" % sym)

        self.init_state = self.expand_rule('start')
        self.init_state = self.expand_rule(start_symbol)

    def expand_rule(self, rule):
        "Returns all init_ptrs accessible by rule (recursive)"
@@ -7,8 +7,8 @@ from .load_grammar import load_grammar
from .tree import Tree, Transformer
from .lexer import Lexer
from .grammar_analysis import GrammarAnalyzer, is_terminal
from . import parser, earley
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import ENGINE_DICT

class LarkOptions(object):
    """Specifies the options for Lark
@@ -23,6 +23,7 @@ class LarkOptions(object):
        keep_all_tokens - Don't automagically remove "punctuation" tokens (default: True)
        cache_grammar - Cache the Lark grammar (Default: False)
        postlex - Lexer post-processing (Default: None)
        start - The start symbol (Default: start)
    """
    __doc__ += OPTIONS_DOC

    def __init__(self, options_dict):
@@ -36,6 +37,7 @@ class LarkOptions(object):
        self.postlex = o.pop('postlex', None)
        self.parser = o.pop('parser', 'earley')
        self.transformer = o.pop('transformer', None)
        self.start = o.pop('start', 'start')

        assert self.parser in ENGINE_DICT
        if self.parser == 'earley' and self.transformer:
@@ -47,71 +49,8 @@ class LarkOptions(object):
            raise ValueError("Unknown options: %s" % o.keys())

class Callback(object):
    pass

class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x
    def expansion(self, symbols):
        return [sym.value for sym in symbols], None
    def alias(self, ((expansion, _alias), alias)):
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value

def create_rule_handler(expansion, usermethod):
    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
                  if not (is_terminal(sym) and sym.startswith('_'))]

    def _build_ast(match):
        children = []
        for i, to_expand in to_include:
            if to_expand:
                children += match[i].children
            else:
                children.append(match[i])
        return usermethod(children)

    return _build_ast

def create_expand1_tree_builder_function(tree_builder):
    def f(children):
        if len(children) == 1:
            return children[0]
        else:
            return tree_builder(children)
    return f

class LALR:
    def build_parser(self, rules, callback):
        ga = GrammarAnalyzer(rules)
        ga.analyze()
        return parser.Parser(ga, callback)

class Earley:
    @staticmethod
    def _process_expansion(x):
        return [{'literal': s} if is_terminal(s) else s for s in x]

    def build_parser(self, rules, callback):
        rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules]
        return EarleyParser(earley.Parser(rules, 'start'))

class EarleyParser:
    def __init__(self, parser):
        self.parser = parser
    def parse(self, text):
        res = self.parser.parse(text)
        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
        return res[0]

ENGINE_DICT = { 'lalr': LALR, 'earley': Earley }

class Lark:
    def __init__(self, grammar, **options):
        """
@@ -147,6 +86,7 @@ class Lark:
        self.lexer = self._build_lexer()
        if not self.options.only_lex:
            self.parser_engine = ENGINE_DICT[self.options.parser]()
            self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
            self.parser = self._build_parser()

    def _build_lexer(self):
@@ -160,50 +100,12 @@ class Lark:

    def _build_parser(self):
        transformer = self.options.transformer
        callback = Callback()
        rules = []
        rule_tree_to_text = RuleTreeToText()
        for origin, tree in self.rules.items():
            for expansion, alias in rule_tree_to_text.transform(tree):
                if alias and origin.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)
                expand1 = origin.startswith('?')
                _origin = origin.lstrip('?*')
                if alias:
                    alias = alias.lstrip('*')
                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))

                try:
                    f = transformer._get_func(alias or _origin)
                    # f = getattr(transformer, alias or _origin)
                except AttributeError:
                    if alias:
                        f = self._create_tree_builder_function(alias)
                    else:
                        f = self._create_tree_builder_function(_origin)

                if expand1:
                    f = create_expand1_tree_builder_function(f)

                alias_handler = create_rule_handler(expansion, f)

                assert not hasattr(callback, _alias)
                setattr(callback, _alias, alias_handler)

                rules.append((_origin, expansion, _alias))

        return self.parser_engine.build_parser(rules, callback)
        rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
        return self.parser_engine.build_parser(rules, callback, self.options.start)

    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

    def _create_tree_builder_function(self, name):
        tree_class = self.options.tree_class
        def f(children):
            return tree_class(name, children)
        return f

    def lex(self, text):
        stream = self.lexer.lex(text)
        if self.options.postlex:
@@ -1,16 +1,18 @@
import re
import codecs

from lexer import Lexer, Token
from grammar_analysis import GrammarAnalyzer
from parser import Parser
from .lexer import Lexer, Token
from tree import Tree as T, Transformer, InlineTransformer, Visitor
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .common import is_terminal, GrammarError
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

_TOKEN_NAMES = {
    ':' : 'COLON',
    ':' : '_COLON',
    ',' : 'COMMA',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
@@ -26,7 +28,7 @@ _TOKEN_NAMES = {
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '.' : 'DOT',
    '.' : '_DOT',
    '%' : 'PERCENT',
    '`' : 'BACKQUOTE',
    '^' : 'CIRCUMFLEX',
@@ -34,8 +36,8 @@ _TOKEN_NAMES = {
    '\'' : 'QUOTE',
    '~' : 'TILDE',
    '@' : 'AT',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '(' : '_LPAR',
    ')' : '_RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
@@ -44,151 +46,58 @@ _TOKEN_NAMES = {
# Grammar Parser
TOKENS = {
    'LPAR': '\(',
    'RPAR': '\)',
    'LBRA': '\[',
    'RBRA': '\]',
    '_LPAR': '\(',
    '_RPAR': '\)',
    '_LBRA': '\[',
    '_RBRA': '\]',
    'OP': '[+*?]',
    'COLON': ':',
    'OR': '\|',
    'DOT': '\.',
    '_COLON': ':',
    '_OR': '\|',
    '_DOT': '\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(.|\n)*?[^\\]/",
    'NL': r'(\r?\n)+\s*',
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*\n',
    'TO': '->'
    '_TO': '->'
}

RULES = [
    ('start', ['list']),
    ('list', ['item']),
    ('list', ['list', 'item']),
    ('item', ['rule']),
    ('item', ['token']),
    ('item', ['NL']),
    ('rule', ['RULE', 'COLON', 'expansions', 'NL']),
    ('expansions', ['expansion']),
    ('expansions', ['expansions', 'OR', 'expansion']),
    ('expansions', ['expansions', 'NL', 'OR', 'expansion']),
    ('expansion', ['_expansion']),
    ('expansion', ['_expansion', 'TO', 'RULE']),
RULES = {
    'start': ['list'],
    'list': ['item', 'list item'],
    'item': ['rule', 'token', '_NL'],
    ('_expansion', []),
    ('_expansion', ['_expansion', 'expr']),
    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['expansion',
                   'expansions _OR expansion',
                   'expansions _NL _OR expansion'],
    ('expr', ['atom']),
    ('expr', ['atom', 'OP']),
    'expansion': ['_expansion',
                  '_expansion _TO RULE'],
    ('atom', ['LPAR', 'expansions', 'RPAR']),
    ('atom', ['maybe']),
    '_expansion': ['', '_expansion expr'],
    ('atom', ['RULE']),
    ('atom', ['TOKEN']),
    ('atom', ['anontoken']),
    '?expr': ['atom',
              'atom OP'],
    ('anontoken', ['tokenvalue']),
    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'RULE',
              'TOKEN',
              'anontoken'],
    ('maybe', ['LBRA', 'expansions', 'RBRA']),
    'anontoken': ['tokenvalue'],
    ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']),
    ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']),
    ('tokenvalue', ['REGEXP']),
    ('tokenvalue', ['STRING']),
    ('tokenmods', ['DOT', 'RULE']),
    ('tokenmods', ['tokenmods', 'DOT', 'RULE']),
]
class SaveDefinitions(object):
    def __init__(self):
        self.rules = {}
        self.token_set = set()
        self.tokens = []
        self.i = 0
    'maybe': ['_LBRA expansions _RBRA'],
    'token': ['TOKEN _COLON tokenvalue _NL',
              'TOKEN tokenmods _COLON tokenvalue _NL'],

    def atom__3(self, _1, value, _2):
        return value
    def atom__1(self, value):
        return value
    def expr__1(self, expr):
        return expr
    def expr(self, *x):
        return T('expr', x)
    def expansion__1(self, expansion):
        return expansion
    def expansion__3(self, expansion, _, alias):
        return T('alias', [expansion, alias])
    def _expansion(self, *x):
        return T('expansion', x)
    def expansions(self, *x):
        items = [i for i in x if isinstance(i, T)]
        return T('expansions', items)
    def maybe(self, _1, expr, _2):
        return T('expr', [expr, Token('OP', '?', -1)])

    def rule(self, name, _1, expansion, _2):
        name = name.value
        if name in self.rules:
            raise ValueError("Rule '%s' defined more than once" % name)
        self.rules[name] = expansion

    def token(self, *x):
        name = x[0].value
        if name in self.token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        self.token_set.add(name)
        if len(x) == 4:
            self.tokens.append((name, x[2], []))
        else:
            self.tokens.append((name, x[3], x[1].children))

    def tokenvalue(self, tokenvalue):
        return tokenvalue

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                token_name = _TOKEN_NAMES[value]
            except KeyError:
                if value.isalnum() and value[0].isalpha():
                    token_name = value.upper()
                else:
                    token_name = 'ANONSTR_%d' % self.i
                    self.i += 1
            token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, x

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))
        return Token('TOKEN', token_name, -1)

    def tokenmods__2(self, _, rule):
        return T('tokenmods', [rule.value])
    def tokenmods__3(self, tokenmods, _, rule):
        return T('tokenmods', tokenmods.children + [rule.value])

    def start(self, *x): pass
    def list(self, *x): pass
    def item(self, *x): pass
    '?tokenvalue': ['REGEXP', 'STRING'],
    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
}
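# Note: each right-hand side in the new RULES dict above is a whitespace-separated
# string of symbols; GrammarLoader later splits it into symbol lists before handing
# it to ParseTreeBuilder, roughly:
#
#     {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}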
class EBNF_to_BNF(InlineTransformer):
@@ -281,46 +190,110 @@ def dict_update_safe(d1, d2):
        d1[k] = v
def generate_aliases():
    sd = SaveDefinitions()
    for name, expansion in RULES:
        try:
            f = getattr(sd, "%s__%s" % (name, len(expansion)))
        except AttributeError:
            f = getattr(sd, name)
        yield name, expansion, f.__name__

class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x
    def expansion(self, symbols):
        return [sym.value for sym in symbols], None
    def alias(self, ((expansion, _alias), alias)):
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value

class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]

def get_tokens(tree, token_set):
    tokens = []
    for t in tree.find_data('token'):
        x = t.children
        name = x[0].value
        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
        if name in token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        token_set.add(name)

        if len(x) == 2:
            yield name, x[1], []
        else:
            assert len(x) == 3
            yield name, x[2], x[1]

class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens, token_set):
        self.tokens = tokens
        self.token_set = token_set
        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha():
                        token_name = value.upper()
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                        self.i += 1
                token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, x

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))

        return Token('TOKEN', token_name, -1)

def inline_args(f):
    def _f(self, args):
        return f(*args)
    return _f
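# Sketch of what inline_args does (illustrative): it adapts a callback written for
# positional arguments into the (self, children_list) signature the parser calls.
#
#     class Calc(object):
#         add = inline_args(lambda a, b: a + b)
#
#     Calc().add([1, 2])   # -> 3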
class GrammarLoader:
    def __init__(self):
        self.rules = list(generate_aliases())
        self.ga = GrammarAnalyzer(self.rules)
        self.ga.analyze()
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
        self.simplify_rule = SimplifyRule_Visitor()

    def _generate_parser_callbacks(self, callbacks):
        d = {alias: inline_args(getattr(callbacks, alias))
             for _n, _x, alias in self.rules}
        return type('Callback', (), d)()
        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        self.parser = LALR().build_parser(rules, callback, 'start')

        self.simplify_tree = SimplifyTree()
        self.simplify_rule = SimplifyRule_Visitor()
        self.rule_tree_to_text = RuleTreeToText()

    def load_grammar(self, grammar_text):
        sd = SaveDefinitions()
        c = self._generate_parser_callbacks(sd)
        p = Parser(self.ga, c)
        p.parse( list(self.lexer.lex(grammar_text+"\n")) )
        token_stream = list(self.lexer.lex(grammar_text+"\n"))
        tree = self.simplify_tree.transform( self.parser.parse(token_stream) )

        # =================
        # Process Tokens
        # =================

        token_set = set()
        tokens = list(get_tokens(tree, token_set))
        extract_anon = ExtractAnonTokens(tokens, token_set)
        tree = extract_anon.transform(tree)    # Adds to tokens

        # Tokens
        token_ref = {}
        re_tokens = []
        str_tokens = []
        for name, token, flags in sd.tokens:
        for name, token, flags in tokens:
            value = token.value[1:-1]
            if '\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
@@ -343,43 +316,70 @@ class GrammarLoader:
        re_tokens.sort(key=lambda x:len(x[1]), reverse=True)

        tokens = str_tokens + re_tokens    # Order is important!

        # Rules
        # =================
        # Process Rules
        # =================

        ebnf_to_bnf = EBNF_to_BNF()

        rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
        rules = {}
        for rule in tree.find_data('rule'):
            name, ebnf_tree = rule.children
            name = name.value
            if name in rules:
                raise ValueError("Rule '%s' defined more than once" % name)
            rules[name] = ebnf_to_bnf.transform(ebnf_tree)

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

        # ====================
        # Verify correctness
        # ====================
        used_symbols = {symbol for expansions in rules.values()
                        for expansion, _alias in expansions
                        for symbol in expansion}
        rule_set = {r.lstrip('?') for r in rules}
        for sym in used_symbols:
            if is_terminal(sym):
                if sym not in token_set:
                    raise GrammarError("Token '%s' used but not defined" % sym)
            else:
                if sym not in rule_set:
                    raise GrammarError("Rule '%s' used but not defined" % sym)

        return tokens, rules

load_grammar = GrammarLoader().load_grammar
def test():
    g = """
    start: add

    # Rules
    // Rules
    add: mul
       | add _add_sym mul

    mul: _atom
       | mul _add_mul _atom
    mul: [mul _add_mul] _atom

    neg: "-" _atom

    _atom: neg
         | number
    _atom: "-" _atom -> neg
         | NUMBER
         | "(" add ")"

    # Tokens
    number: /[\d.]+/
    // Tokens
    NUMBER: /[\d.]+/
    _add_sym: "+" | "-"
    _add_mul: "*" | "/"
    WS.ignore: /\s+/
    WS.ignore.newline: /\s+/
    """

    g2 = """
@@ -389,7 +389,9 @@ def test():
    c: "c"
    d: "+" | "-"
    """

    load_grammar(g)
    # print load_grammar(g)
    print GrammarLoader().load_grammar2(g)

if __name__ == '__main__':
    test()
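# Rough shape of what load_grammar() returns (a sketch; the exact token tuple
# layout is defined in the "Process Tokens" section above):
#
#     tokens, rules = load_grammar('start: "a"+')
#     # rules -> {origin: [(expansion, alias), ...]}, as produced by RuleTreeToText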
@@ -0,0 +1,76 @@
from .grammar_analysis import is_terminal

class Callback(object):
    pass

def create_expand1_tree_builder_function(tree_builder):
    def f(children):
        if len(children) == 1:
            return children[0]
        else:
            return tree_builder(children)
    return f

def create_rule_handler(expansion, usermethod):
    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
                  if not (is_terminal(sym) and sym.startswith('_'))]

    def _build_ast(match):
        children = []
        for i, to_expand in to_include:
            if to_expand:
                children += match[i].children
            else:
                children.append(match[i])
        return usermethod(children)

    return _build_ast

class ParseTreeBuilder:
    def __init__(self, tree_class):
        self.tree_class = tree_class

    def _create_tree_builder_function(self, name):
        tree_class = self.tree_class
        def f(children):
            return tree_class(name, children)
        return f

    def create_tree_builder(self, rules, transformer):
        callback = Callback()
        new_rules = []
        for origin, expansions in rules.items():
            expand1 = origin.startswith('?')
            _origin = origin.lstrip('?*')

            for expansion, alias in expansions:
                if alias and origin.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)
                if alias:
                    alias = alias.lstrip('*')
                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))

                try:
                    f = transformer._get_func(alias or _origin)
                except AttributeError:
                    if alias:
                        f = self._create_tree_builder_function(alias)
                    else:
                        f = self._create_tree_builder_function(_origin)

                if expand1:
                    f = create_expand1_tree_builder_function(f)

                alias_handler = create_rule_handler(expansion, f)

                assert not hasattr(callback, _alias)
                setattr(callback, _alias, alias_handler)

                new_rules.append(( _origin, expansion, _alias ))

        return new_rules, callback
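# Usage sketch (illustrative): create_tree_builder() flattens a dict of
# {origin: [(expansion, alias), ...]} into (origin, expansion, callback_name)
# tuples, plus a Callback object whose generated handlers build the tree nodes.
#
#     rules, callback = ParseTreeBuilder(Tree).create_tree_builder(rule_dict, None)
#     # e.g. ('start', ['list'], 'autoalias_start_list') paired with
#     # callback.autoalias_start_list, which returns Tree('start', children)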
@@ -34,7 +34,7 @@ class Parser(object):
            res = self.callbacks[rule]([x[0] for x in s])

            if rule.origin == 'start':
            if rule.origin == self.ga.start_symbol and len(stack) == 1:
                return res

            _action, new_state = get_action(rule.origin)
@@ -0,0 +1,31 @@
from .grammar_analysis import GrammarAnalyzer
from common import is_terminal
from . import parser, earley

class LALR:
    def build_parser(self, rules, callback, start):
        ga = GrammarAnalyzer(rules, start)
        ga.analyze()
        return parser.Parser(ga, callback)

class Earley:
    @staticmethod
    def _process_expansion(x):
        return [{'literal': s} if is_terminal(s) else s for s in x]

    def build_parser(self, rules, callback, start):
        rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules]
        return EarleyParser(earley.Parser(rules, start))

class EarleyParser:
    def __init__(self, parser):
        self.parser = parser

    def parse(self, text):
        res = self.parser.parse(text)
        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
        return res[0]

ENGINE_DICT = { 'lalr': LALR, 'earley': Earley }
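# How Lark wires a frontend together (sketch, mirroring Lark._build_parser above):
#
#     engine = ENGINE_DICT[options.parser]()      # 'lalr' or 'earley'
#     rules, callback = parse_tree_builder.create_tree_builder(grammar_rules, transformer)
#     parser = engine.build_parser(rules, callback, options.start)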
@@ -0,0 +1,14 @@
from __future__ import absolute_import, print_function

import unittest
import logging

from .test_trees import TestTrees
# from .test_selectors import TestSelectors
from .test_parser import TestLalr
# from .test_grammars import TestPythonG, TestConfigG

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,326 @@
from __future__ import absolute_import

import unittest
import logging
import os
import sys
try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # Available only in Python 2.x, 3.x only has io.StringIO from below
    cStringIO = None
from io import (
        StringIO as uStringIO,
        open,
    )

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.grammar_analysis import GrammarError
from lark.parser import ParseError

__path__ = os.path.dirname(__file__)
def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()
class TestLalr(unittest.TestCase):
    def test_basic1(self):
        g = Lark("""start: a+ b a* "b" a*
                    b: "b"
                    a: "a"
                 """, parser='lalr')

        r = g.parse('aaabaab')
        self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
        r = g.parse('aaabaaba')
        self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

        self.assertRaises(ParseError, g.parse, 'aaabaa')

    def test_basic2(self):
        # Multiple parsers and colliding tokens
        g = Lark("""start: B A
                    B: "12"
                    A: "1" """)

        g2 = Lark("""start: B A
                     B: "12"
                     A: "2" """)

        x = g.parse('121')
        assert x.data == 'start' and x.children == ['12', '1'], x

        x = g2.parse('122')
        assert x.data == 'start' and x.children == ['12', '2'], x

    def test_basic3(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark("""start: "(" name_list ("," "*" NAME)? ")"
                    name_list: NAME | name_list "," NAME
                    NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark("""start: "(" name_list ("," "*" NAME)? ")"
                    name_list: NAME | name_list "," NAME
                    NAME: /\w+/ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

    @unittest.skipIf(cStringIO is None, "cStringIO not available")
    def test_stringio_bytes(self):
        """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
        Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

    def test_stringio_unicode(self):
        """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
        Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

    def test_unicode(self):
        g = Lark(u"""start: UNIA UNIB UNIA
                     UNIA: /\xa3/
                     UNIB: /\u0101/
                  """)
        g.parse(u'\xa3\u0101\u00a3')

    def test_unicode2(self):
        g = Lark(r"""start: UNIA UNIB UNIA UNIC
                     UNIA: /\xa3/
                     UNIB: "a\u0101b\ "
                     UNIC: /a?\u0101c\n/
                  """)
        g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')
    def test_recurse_expansion(self):
        """Verify that stack depth doesn't get exceeded on recursive rules marked for expansion."""
        g = Lark(r"""start: a | start a
                     a : "a" """)
        # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
        # STree data structures, which uses recursion).
        g.parse("a" * (sys.getrecursionlimit() // 4))

    def test_expand1_lists_with_one_item(self):
        g = Lark(r"""start: list
                     ?list: item+
                     item : A
                     A: "a"
                  """)
        r = g.parse("a")

        # because 'list' is an expand-if-contains-one rule and we only provided one element, it should have expanded to 'item'
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

    def test_expand1_lists_with_one_item_2(self):
        g = Lark(r"""start: list
                     ?list: item+ "!"
                     item : A
                     A: "a"
                  """)
        r = g.parse("a!")

        # because 'list' is an expand-if-contains-one rule and we only provided one element, it should have expanded to 'item'
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

    def test_dont_expand1_lists_with_multiple_items(self):
        g = Lark(r"""start: list
                     ?list: item+
                     item : A
                     A: "a"
                  """)
        r = g.parse("aa")

        # because 'list' is an expand-if-contains-one rule and we've provided more than one element, it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains the two 'item's we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

    def test_dont_expand1_lists_with_multiple_items_2(self):
        g = Lark(r"""start: list
                     ?list: item+ "!"
                     item : A
                     A: "a"
                  """)
        r = g.parse("aa!")

        # because 'list' is an expand-if-contains-one rule and we've provided more than one element, it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains the two 'item's we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

    def test_empty_expand1_list(self):
        g = Lark(r"""start: list
                     ?list: item*
                     item : A
                     A: "a"
                  """)
        r = g.parse("")

        # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none), it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains no 'item's as we've given it none
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ())

    def test_empty_expand1_list_2(self):
        g = Lark(r"""start: list
                     ?list: item* "!"?
                     item : A
                     A: "a"
                  """)
        r = g.parse("")

        # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none), it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains no 'item's as we've given it none
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ())
    def test_empty_flatten_list(self):
        g = Lark(r"""start: list
                     list: | item "," list
                     item : A
                     A: "a"
                  """)
        r = g.parse("")

        # Because 'list' is a flatten rule, its top-level element should *never* be expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # Sanity check: verify that 'list' contains no 'item's as we've given it none
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ())

    @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
    def test_single_item_flatten_list(self):
        g = Lark(r"""start: list
                     list: | item "," list
                     item : A
                     A: "a"
                  """)
        r = g.parse("a,")

        # Because 'list' is a flatten rule, its top-level element should *never* be expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item',))

    @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
    def test_multiple_item_flatten_list(self):
        g = Lark(r"""start: list
                     #list: | item "," list
                     item : A
                     A: "a"
                  """)
        r = g.parse("a,a,")

        # Because 'list' is a flatten rule, its top-level element should *never* be expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

    @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
    def test_recurse_flatten(self):
        """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
        g = Lark(r"""start: a | start a
                     a : A
                     A : "a" """)

        # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
        # STree data structures, which uses recursion).
        g.parse("a" * (sys.getrecursionlimit() // 4))
    def test_token_collision(self):
        g = Lark("""start: "Hello" NAME
                    NAME: /\w+/
                    WS.ignore: /\s+/
                 """, parser='lalr')
        x = g.parse('Hello World')
        self.assertSequenceEqual(x.children, ['World'])

    def test_undefined_rule(self):
        self.assertRaises(GrammarError, Lark, """start: a""", parser='lalr')

    def test_undefined_token(self):
        self.assertRaises(GrammarError, Lark, """start: A""", parser='lalr')

    def test_rule_collision(self):
        g = Lark("""start: "a"+ "b"
                         | "a"+ """, parser='lalr')
        x = g.parse('aaaa')
        x = g.parse('aaaab')

    def test_rule_collision2(self):
        g = Lark("""start: "a"* "b"
                         | "a"+ """, parser='lalr')
        x = g.parse('aaaa')
        x = g.parse('aaaab')
        x = g.parse('b')

    def test_regex_embed(self):
        g = Lark("""start: A B C
                    A: /a/
                    B: /${A}b/
                    C: /${B}c/
                 """, parser='lalr')
        x = g.parse('aababc')

    def test_token_not_anon(self):
        """Tests that "a" is matched as A, rather than an anonymous token.
        That means that "a" is not filtered out, despite being an 'immediate string'.
        Whether or not this is the intuitive behavior, I'm not sure yet.
        -Erez
        """

        g = Lark("""start: "a"
                    A: "a" """, parser='lalr')
        x = g.parse('a')
        self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous')
        self.assertEqual(x.children[0].type, "A")

    def test_maybe(self):
        g = Lark("""start: ["a"] """, parser='lalr')
        x = g.parse('a')
        x = g.parse('')

    def test_start(self):
        g = Lark("""a: "a" a? """, parser='lalr', start='a')
        x = g.parse('a')
        x = g.parse('aa')
        x = g.parse('aaa')

if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,26 @@
from __future__ import absolute_import

from unittest import TestCase
import logging
import copy
import pickle

from lark.tree import Tree

class TestTrees(TestCase):
    def setUp(self):
        self.tree1 = Tree('a', [Tree(x, y) for x, y in zip('bcd', 'xyz')])

    def test_deepcopy(self):
        assert self.tree1 == copy.deepcopy(self.tree1)

    def test_pickle(self):
        s = copy.deepcopy(self.tree1)
        data = pickle.dumps(s)
        assert pickle.loads(data) == s

if __name__ == '__main__':
    unittest.main()
@@ -33,6 +33,19 @@ class Tree(object):
    def __eq__(self, other):
        return self.data == other.data and self.children == other.children

    def find_pred(self, pred):
        if pred(self):
            yield self
        else:
            for i, c in enumerate(self.children):
                if isinstance(c, Tree):
                    for t in c.find_pred(pred):
                        yield t

    def find_data(self, data):
        return self.find_pred(lambda t: t.data == data)

    # def find_path(self, pred):
    #     if pred(self):
    #         yield []
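# Usage sketch (illustrative): find_data() is how GrammarLoader walks the parsed
# grammar, e.g. tree.find_data('token') / tree.find_data('rule').
#
#     t = Tree('rule', [Tree('token', []), Tree('token', [])])
#     [x.data for x in t.find_data('token')]   # -> ['token', 'token']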