@@ -0,0 +1,64 @@
import sys

from lark.lark import Lark, inline_args
from lark.tree import Transformer

json_grammar = r"""
    ?start: value

    ?value: object
          | array
          | string
          | number
          | "true" -> true
          | "false" -> false
          | "null" -> null

    array  : "[" [value ("," value)*] "]"
    object : "{" [pair ("," pair)*] "}"
    pair   : string ":" value

    number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
    string : /".*?(?<!\\)"/

    WS.ignore.newline: /[ \t\n]+/
"""

class TreeToJson(Transformer):
    @inline_args
    def string(self, s):
        return s[1:-1]

    array = list
    pair = tuple
    object = dict

    number = inline_args(float)

    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False

json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
parse = json_parser.parse

def test():
    test_json = '''
        {
            "empty_object" : {},
            "empty_array"  : [],
            "booleans"     : { "YES" : true, "NO" : false },
            "numbers"      : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
            "strings"      : [ "This", [ "And" , "That" ] ],
            "nothing"      : null
        }
    '''
    j = parse(test_json)
    print j
    import json
    assert j == json.loads(test_json)

if __name__ == '__main__':
    test()
    with open(sys.argv[1]) as f:
        print parse(f.read())
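For reference, a quick interactive check of what the grammar and transformer above produce (a sketch under Python 2, where parse returns plain Python values; the exact repr of unicode strings and floats may differ on your runtime):

    >>> parse('{"list": [1, true, null]}')
    {u'list': [1.0, True, None]}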
@@ -0,0 +1,7 @@
class GrammarError(Exception):
    pass

def is_terminal(sym):
    return sym.isupper() or sym[0] == '$'
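These two helpers are shared by the modules below; the naming convention they encode is that uppercase names and the internal `$`-prefixed symbols are terminals, while lowercase names are rules. For example (a sketch):

    >>> is_terminal('NUMBER'), is_terminal('$end'), is_terminal('expr')
    (True, True, False)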
@@ -1,14 +1,9 @@
from collections import defaultdict, deque

from utils import classify, classify_bool, bfs, fzset
from common import GrammarError, is_terminal

ACTION_SHIFT = 0

class GrammarError(Exception):
    pass

def is_terminal(sym):
    return sym.isupper() or sym[0] == '$'

class Rule(object):
    """
        origin : a symbol
@@ -61,9 +56,10 @@ def update_set(set1, set2):
    return set1 != copy

class GrammarAnalyzer(object):
    def __init__(self, rule_tuples):
    def __init__(self, rule_tuples, start_symbol):
        self.start_symbol = start_symbol
        rule_tuples = list(rule_tuples)
        rule_tuples.append(('$root', ['start', '$end']))
        rule_tuples.append(('$root', [start_symbol, '$end']))
        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]

        self.rules = set()
@@ -78,7 +74,7 @@ class GrammarAnalyzer(object):
        if not (is_terminal(sym) or sym in self.rules_by_origin):
            raise GrammarError("Using an undefined rule: %s" % sym)

        self.init_state = self.expand_rule('start')
        self.init_state = self.expand_rule(start_symbol)

    def expand_rule(self, rule):
        "Returns all init_ptrs accessible by rule (recursive)"
@@ -7,8 +7,8 @@ from .load_grammar import load_grammar
from .tree import Tree, Transformer
from .lexer import Lexer
from .grammar_analysis import GrammarAnalyzer, is_terminal
from . import parser, earley
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import ENGINE_DICT

class LarkOptions(object):
    """Specifies the options for Lark

@@ -23,6 +23,7 @@ class LarkOptions(object):
        keep_all_tokens - Don't automagically remove "punctuation" tokens (default: True)
        cache_grammar - Cache the Lark grammar (Default: False)
        postlex - Lexer post-processing (Default: None)
        start - The start symbol (Default: start)
    """
    __doc__ += OPTIONS_DOC

    def __init__(self, options_dict):
@@ -36,6 +37,7 @@ class LarkOptions(object):
        self.postlex = o.pop('postlex', None)
        self.parser = o.pop('parser', 'earley')
        self.transformer = o.pop('transformer', None)
        self.start = o.pop('start', 'start')

        assert self.parser in ENGINE_DICT
        if self.parser == 'earley' and self.transformer:
@@ -47,71 +49,8 @@ class LarkOptions(object):
            raise ValueError("Unknown options: %s" % o.keys())
class Callback(object):
    pass

class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x
    def expansion(self, symbols):
        return [sym.value for sym in symbols], None
    def alias(self, ((expansion, _alias), alias)):
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value

def create_rule_handler(expansion, usermethod):
    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
                  if not (is_terminal(sym) and sym.startswith('_'))]

    def _build_ast(match):
        children = []
        for i, to_expand in to_include:
            if to_expand:
                children += match[i].children
            else:
                children.append(match[i])
        return usermethod(children)
    return _build_ast

def create_expand1_tree_builder_function(tree_builder):
    def f(children):
        if len(children) == 1:
            return children[0]
        else:
            return tree_builder(children)
    return f

class LALR:
    def build_parser(self, rules, callback):
        ga = GrammarAnalyzer(rules)
        ga.analyze()
        return parser.Parser(ga, callback)

class Earley:
    @staticmethod
    def _process_expansion(x):
        return [{'literal': s} if is_terminal(s) else s for s in x]

    def build_parser(self, rules, callback):
        rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules]
        return EarleyParser(earley.Parser(rules, 'start'))

class EarleyParser:
    def __init__(self, parser):
        self.parser = parser

    def parse(self, text):
        res = self.parser.parse(text)
        assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
        return res[0]

ENGINE_DICT = { 'lalr': LALR, 'earley': Earley }
class Lark:
    def __init__(self, grammar, **options):
        """
@@ -147,6 +86,7 @@ class Lark:
        self.lexer = self._build_lexer()
        if not self.options.only_lex:
            self.parser_engine = ENGINE_DICT[self.options.parser]()
            self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
            self.parser = self._build_parser()

    def _build_lexer(self):
@@ -160,50 +100,12 @@ class Lark:

    def _build_parser(self):
        transformer = self.options.transformer
        callback = Callback()
        rules = []
        rule_tree_to_text = RuleTreeToText()
        for origin, tree in self.rules.items():
            for expansion, alias in rule_tree_to_text.transform(tree):
                if alias and origin.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)

                expand1 = origin.startswith('?')
                _origin = origin.lstrip('?*')
                if alias:
                    alias = alias.lstrip('*')
                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))

                try:
                    f = transformer._get_func(alias or _origin)
                    # f = getattr(transformer, alias or _origin)
                except AttributeError:
                    if alias:
                        f = self._create_tree_builder_function(alias)
                    else:
                        f = self._create_tree_builder_function(_origin)

                if expand1:
                    f = create_expand1_tree_builder_function(f)

                alias_handler = create_rule_handler(expansion, f)

                assert not hasattr(callback, _alias)
                setattr(callback, _alias, alias_handler)

                rules.append((_origin, expansion, _alias))

        return self.parser_engine.build_parser(rules, callback)
        rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
        return self.parser_engine.build_parser(rules, callback, self.options.start)

    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

    def _create_tree_builder_function(self, name):
        tree_class = self.options.tree_class
        def f(children):
            return tree_class(name, children)
        return f

    def lex(self, text):
        stream = self.lexer.lex(text)
        if self.options.postlex:
@@ -1,16 +1,18 @@
import re
import codecs

from lexer import Lexer, Token
from grammar_analysis import GrammarAnalyzer
from parser import Parser
from .lexer import Lexer, Token
from tree import Tree as T, Transformer, InlineTransformer, Visitor
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .common import is_terminal, GrammarError
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

_TOKEN_NAMES = {
    ':' : 'COLON',
    ':' : '_COLON',
    ',' : 'COMMA',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
@@ -26,7 +28,7 @@ _TOKEN_NAMES = {
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '.' : 'DOT',
    '.' : '_DOT',
    '%' : 'PERCENT',
    '`' : 'BACKQUOTE',
    '^' : 'CIRCUMFLEX',
@@ -34,8 +36,8 @@ _TOKEN_NAMES = {
    '\'' : 'QUOTE',
    '~' : 'TILDE',
    '@' : 'AT',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '(' : '_LPAR',
    ')' : '_RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
@@ -44,151 +46,58 @@ _TOKEN_NAMES = {

# Grammar Parser
TOKENS = {
    'LPAR': '\(',
    'RPAR': '\)',
    'LBRA': '\[',
    'RBRA': '\]',
    '_LPAR': '\(',
    '_RPAR': '\)',
    '_LBRA': '\[',
    '_RBRA': '\]',
    'OP': '[+*?]',
    'COLON': ':',
    'OR': '\|',
    'DOT': '\.',
    '_COLON': ':',
    '_OR': '\|',
    '_DOT': '\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(.|\n)*?[^\\]/",
    'NL': r'(\r?\n)+\s*',
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*\n',
    'TO': '->'
    '_TO': '->'
}
RULES = [
    ('start', ['list']),
    ('list', ['item']),
    ('list', ['list', 'item']),
    ('item', ['rule']),
    ('item', ['token']),
    ('item', ['NL']),

    ('rule', ['RULE', 'COLON', 'expansions', 'NL']),
    ('expansions', ['expansion']),
    ('expansions', ['expansions', 'OR', 'expansion']),
    ('expansions', ['expansions', 'NL', 'OR', 'expansion']),

    ('expansion', ['_expansion']),
    ('expansion', ['_expansion', 'TO', 'RULE']),
RULES = {
    'start': ['list'],
    'list': ['item', 'list item'],
    'item': ['rule', 'token', '_NL'],

    ('_expansion', []),
    ('_expansion', ['_expansion', 'expr']),
    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['expansion',
                   'expansions _OR expansion',
                   'expansions _NL _OR expansion'],

    ('expr', ['atom']),
    ('expr', ['atom', 'OP']),
    'expansion': ['_expansion',
                  '_expansion _TO RULE'],

    ('atom', ['LPAR', 'expansions', 'RPAR']),
    ('atom', ['maybe']),
    '_expansion': ['', '_expansion expr'],

    ('atom', ['RULE']),
    ('atom', ['TOKEN']),
    ('atom', ['anontoken']),
    '?expr': ['atom',
              'atom OP'],

    ('anontoken', ['tokenvalue']),
    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'RULE',
              'TOKEN',
              'anontoken'],

    ('maybe', ['LBRA', 'expansions', 'RBRA']),
    'anontoken': ['tokenvalue'],

    ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']),
    ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']),
    ('tokenvalue', ['REGEXP']),
    ('tokenvalue', ['STRING']),
    ('tokenmods', ['DOT', 'RULE']),
    ('tokenmods', ['tokenmods', 'DOT', 'RULE']),
]

class SaveDefinitions(object):
    def __init__(self):
        self.rules = {}
        self.token_set = set()
        self.tokens = []
        self.i = 0
    'maybe': ['_LBRA expansions _RBRA'],
    'token': ['TOKEN _COLON tokenvalue _NL',
              'TOKEN tokenmods _COLON tokenvalue _NL'],

    def atom__3(self, _1, value, _2):
        return value
    def atom__1(self, value):
        return value

    def expr__1(self, expr):
        return expr
    def expr(self, *x):
        return T('expr', x)

    def expansion__1(self, expansion):
        return expansion
    def expansion__3(self, expansion, _, alias):
        return T('alias', [expansion, alias])
    def _expansion(self, *x):
        return T('expansion', x)

    def expansions(self, *x):
        items = [i for i in x if isinstance(i, T)]
        return T('expansions', items)

    def maybe(self, _1, expr, _2):
        return T('expr', [expr, Token('OP', '?', -1)])

    def rule(self, name, _1, expansion, _2):
        name = name.value
        if name in self.rules:
            raise ValueError("Rule '%s' defined more than once" % name)
        self.rules[name] = expansion

    def token(self, *x):
        name = x[0].value
        if name in self.token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        self.token_set.add(name)
        if len(x) == 4:
            self.tokens.append((name, x[2], []))
        else:
            self.tokens.append((name, x[3], x[1].children))

    def tokenvalue(self, tokenvalue):
        return tokenvalue

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                token_name = _TOKEN_NAMES[value]
            except KeyError:
                if value.isalnum() and value[0].isalpha():
                    token_name = value.upper()
                else:
                    token_name = 'ANONSTR_%d' % self.i
                    self.i += 1
            token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, x
        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))
        return Token('TOKEN', token_name, -1)

    def tokenmods__2(self, _, rule):
        return T('tokenmods', [rule.value])
    def tokenmods__3(self, tokenmods, _, rule):
        return T('tokenmods', tokenmods.children + [rule.value])

    def start(self, *x): pass
    def list(self, *x): pass
    def item(self, *x): pass

    '?tokenvalue': ['REGEXP', 'STRING'],
    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
}
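In the new RULES dict each expansion is written as one space-separated string of symbols; GrammarLoader.__init__ below splits these into the (symbols, alias) pairs that ParseTreeBuilder expects. A sketch of that conversion for a single entry:

    >>> [(x.split(), None) for x in RULES['rule']]
    [(['RULE', '_COLON', 'expansions', '_NL'], None)]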
class EBNF_to_BNF(InlineTransformer):
@@ -281,46 +190,110 @@ def dict_update_safe(d1, d2):
        d1[k] = v

def generate_aliases():
    sd = SaveDefinitions()
    for name, expansion in RULES:
        try:
            f = getattr(sd, "%s__%s" % (name, len(expansion)))
        except AttributeError:
            f = getattr(sd, name)
        yield name, expansion, f.__name__

class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x
    def expansion(self, symbols):
        return [sym.value for sym in symbols], None
    def alias(self, ((expansion, _alias), alias)):
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value

class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]

def get_tokens(tree, token_set):
    tokens = []
    for t in tree.find_data('token'):
        x = t.children
        name = x[0].value
        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
        if name in token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        token_set.add(name)
        if len(x) == 2:
            yield name, x[1], []
        else:
            assert len(x) == 3
            yield name, x[2], x[1]
class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens, token_set):
        self.tokens = tokens
        self.token_set = token_set
        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
        self.i = 0      # counter for numbered anonymous-token names (used below)

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha():
                        token_name = value.upper()
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                        self.i += 1
                token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, token

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))

        return Token('TOKEN', token_name, -1)

def inline_args(f):
    def _f(self, args):
        return f(*args)
    return _f
class GrammarLoader:
    def __init__(self):
        self.rules = list(generate_aliases())
        self.ga = GrammarAnalyzer(self.rules)
        self.ga.analyze()
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
        self.simplify_rule = SimplifyRule_Visitor()

    def _generate_parser_callbacks(self, callbacks):
        d = {alias: inline_args(getattr(callbacks, alias))
             for _n, _x, alias in self.rules}
        return type('Callback', (), d)()
        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        self.parser = LALR().build_parser(rules, callback, 'start')

        self.simplify_tree = SimplifyTree()
        self.simplify_rule = SimplifyRule_Visitor()
        self.rule_tree_to_text = RuleTreeToText()

    def load_grammar(self, grammar_text):
        sd = SaveDefinitions()
        c = self._generate_parser_callbacks(sd)
        p = Parser(self.ga, c)
        p.parse( list(self.lexer.lex(grammar_text+"\n")) )
        token_stream = list(self.lexer.lex(grammar_text+"\n"))
        tree = self.simplify_tree.transform( self.parser.parse(token_stream) )

        # =================
        #  Process Tokens
        # =================

        token_set = set()
        tokens = list(get_tokens(tree, token_set))
        extract_anon = ExtractAnonTokens(tokens, token_set)
        tree = extract_anon.transform(tree) # Adds to tokens

        # Tokens
        token_ref = {}
        re_tokens = []
        str_tokens = []
        for name, token, flags in sd.tokens:
        for name, token, flags in tokens:
            value = token.value[1:-1]
            if '\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
@@ -343,43 +316,70 @@ class GrammarLoader:
        re_tokens.sort(key=lambda x:len(x[1]), reverse=True)

        tokens = str_tokens + re_tokens # Order is important!

        # Rules
        # =================
        #  Process Rules
        # =================

        ebnf_to_bnf = EBNF_to_BNF()
        rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
        rules = {}
        for rule in tree.find_data('rule'):
            name, ebnf_tree = rule.children
            name = name.value
            if name in rules:
                raise ValueError("Rule '%s' defined more than once" % name)
            rules[name] = ebnf_to_bnf.transform(ebnf_tree)

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

        # ====================
        #  Verify correctness
        # ====================
        used_symbols = {symbol for expansions in rules.values()
                        for expansion, _alias in expansions
                        for symbol in expansion}
        rule_set = {r.lstrip('?') for r in rules}
        for sym in used_symbols:
            if is_terminal(sym):
                if sym not in token_set:
                    raise GrammarError("Token '%s' used but not defined" % sym)
            else:
                if sym not in rule_set:
                    raise GrammarError("Rule '%s' used but not defined" % sym)

        return tokens, rules
load_grammar = GrammarLoader().load_grammar

def test():
    g = """
    start: add

    # Rules
    // Rules
    add: mul
       | add _add_sym mul

    mul: _atom
       | mul _add_mul _atom
    mul: [mul _add_mul] _atom

    neg: "-" _atom

    _atom: neg
         | number
    _atom: "-" _atom -> neg
         | NUMBER
         | "(" add ")"

    # Tokens
    number: /[\d.]+/
    // Tokens
    NUMBER: /[\d.]+/
    _add_sym: "+" | "-"
    _add_mul: "*" | "/"

    WS.ignore: /\s+/
    WS.ignore.newline: /\s+/
    """

    g2 = """
@@ -389,7 +389,9 @@ def test():
    c: "c"
    d: "+" | "-"
    """
    load_grammar(g)
    # print load_grammar(g)
    print GrammarLoader().load_grammar2(g)

if __name__ == '__main__':
    test()
@@ -0,0 +1,76 @@
from .grammar_analysis import is_terminal

class Callback(object):
    pass

def create_expand1_tree_builder_function(tree_builder):
    def f(children):
        if len(children) == 1:
            return children[0]
        else:
            return tree_builder(children)
    return f
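The wrapper above implements the '?rule' (expand-if-single-child) behaviour: a match with exactly one child is collapsed into that child, while anything else goes through the real tree builder. A small sketch, with the lambda standing in for an arbitrary builder:

    >>> build = create_expand1_tree_builder_function(lambda children: ('list', children))
    >>> build(['item'])
    'item'
    >>> build(['item', 'item'])
    ('list', ['item', 'item'])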
def create_rule_handler(expansion, usermethod):
    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
                  if not (is_terminal(sym) and sym.startswith('_'))]

    def _build_ast(match):
        children = []
        for i, to_expand in to_include:
            if to_expand:
                children += match[i].children
            else:
                children.append(match[i])
        return usermethod(children)
    return _build_ast

class ParseTreeBuilder:
    def __init__(self, tree_class):
        self.tree_class = tree_class

    def _create_tree_builder_function(self, name):
        tree_class = self.tree_class
        def f(children):
            return tree_class(name, children)
        return f

    def create_tree_builder(self, rules, transformer):
        callback = Callback()
        new_rules = []
        for origin, expansions in rules.items():
            expand1 = origin.startswith('?')
            _origin = origin.lstrip('?*')
            for expansion, alias in expansions:
                if alias and origin.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)

                if alias:
                    alias = alias.lstrip('*')
                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))

                try:
                    f = transformer._get_func(alias or _origin)
                except AttributeError:
                    if alias:
                        f = self._create_tree_builder_function(alias)
                    else:
                        f = self._create_tree_builder_function(_origin)

                if expand1:
                    f = create_expand1_tree_builder_function(f)

                alias_handler = create_rule_handler(expansion, f)

                assert not hasattr(callback, _alias)
                setattr(callback, _alias, alias_handler)

                new_rules.append(( _origin, expansion, _alias ))

        return new_rules, callback
@@ -34,7 +34,7 @@ class Parser(object):
        res = self.callbacks[rule]([x[0] for x in s])

        if rule.origin == 'start':
        if rule.origin == self.ga.start_symbol and len(stack) == 1:
            return res

        _action, new_state = get_action(rule.origin)
@@ -0,0 +1,31 @@
from .grammar_analysis import GrammarAnalyzer
from .common import is_terminal
from . import parser, earley

class LALR:
    def build_parser(self, rules, callback, start):
        ga = GrammarAnalyzer(rules, start)
        ga.analyze()
        return parser.Parser(ga, callback)

class Earley:
    @staticmethod
    def _process_expansion(x):
        return [{'literal': s} if is_terminal(s) else s for s in x]

    def build_parser(self, rules, callback, start):
        rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules]
        return EarleyParser(earley.Parser(rules, start))

class EarleyParser:
    def __init__(self, parser):
        self.parser = parser

    def parse(self, text):
        res = self.parser.parse(text)
        assert len(res) == 1, 'Ambiguous parse! Not handled yet'
        return res[0]

ENGINE_DICT = { 'lalr': LALR, 'earley': Earley }
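ENGINE_DICT is what LarkOptions validates the parser name against, and the new start argument is threaded from the options through to GrammarAnalyzer / earley.Parser. A hedged usage sketch, reusing the grammar exercised by test_start in the test suite below:

    from lark.lark import Lark

    # 'a' is both the start rule and the only rule; parser/start are LarkOptions
    g = Lark("""a: "a" a? """, parser='lalr', start='a')
    g.parse('aaa')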
@@ -0,0 +1,14 @@
from __future__ import absolute_import, print_function

import unittest
import logging

from .test_trees import TestTrees
# from .test_selectors import TestSelectors
from .test_parser import TestLalr
# from .test_grammars import TestPythonG, TestConfigG

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,326 @@
from __future__ import absolute_import

import unittest
import logging
import os
import sys
try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # cStringIO only exists in Python 2.x; Python 3.x only has io.StringIO (imported below)
    cStringIO = None
from io import (
        StringIO as uStringIO,
        open,
    )

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.grammar_analysis import GrammarError
from lark.parser import ParseError

__path__ = os.path.dirname(__file__)
def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()
class TestLalr(unittest.TestCase):
    def test_basic1(self):
        g = Lark("""start: a+ b a* "b" a*
                    b: "b"
                    a: "a"
                 """, parser='lalr')
        r = g.parse('aaabaab')
        self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
        r = g.parse('aaabaaba')
        self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

        self.assertRaises(ParseError, g.parse, 'aaabaa')

    def test_basic2(self):
        # Multiple parsers and colliding tokens
        g = Lark("""start: B A
                    B: "12"
                    A: "1" """)
        g2 = Lark("""start: B A
                     B: "12"
                     A: "2" """)
        x = g.parse('121')
        assert x.data == 'start' and x.children == ['12', '1'], x
        x = g2.parse('122')
        assert x.data == 'start' and x.children == ['12', '2'], x

    def test_basic3(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark("""start: "(" name_list ("," "*" NAME)? ")"
                    name_list: NAME | name_list "," NAME
                    NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark("""start: "(" name_list ("," "*" NAME)? ")"
                    name_list: NAME | name_list "," NAME
                    NAME: /\w+/ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

    @unittest.skipIf(cStringIO is None, "cStringIO not available")
    def test_stringio_bytes(self):
        """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
        Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

    def test_stringio_unicode(self):
        """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
        Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

    def test_unicode(self):
        g = Lark(u"""start: UNIA UNIB UNIA
                     UNIA: /\xa3/
                     UNIB: /\u0101/
                  """)
        g.parse(u'\xa3\u0101\u00a3')

    def test_unicode2(self):
        g = Lark(r"""start: UNIA UNIB UNIA UNIC
                     UNIA: /\xa3/
                     UNIB: "a\u0101b\ "
                     UNIC: /a?\u0101c\n/
                  """)
        g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')

    def test_recurse_expansion(self):
        """Verify that stack depth doesn't get exceeded on recursive rules marked for expansion."""
        g = Lark(r"""start: a | start a
                     a : "a" """)
        # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
        # STree data structures, which uses recursion).
        g.parse("a" * (sys.getrecursionlimit() // 4))

    def test_expand1_lists_with_one_item(self):
        g = Lark(r"""start: list
                     ?list: item+
                     item : A
                     A: "a"
                  """)
        r = g.parse("a")

        # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

    def test_expand1_lists_with_one_item_2(self):
        g = Lark(r"""start: list
                     ?list: item+ "!"
                     item : A
                     A: "a"
                  """)
        r = g.parse("a!")

        # because 'list' is an expand-if-contains-one rule and we only provided one element it should have expanded to 'item'
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

    def test_dont_expand1_lists_with_multiple_items(self):
        g = Lark(r"""start: list
                     ?list: item+
                     item : A
                     A: "a"
                  """)
        r = g.parse("aa")

        # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains the two 'item's we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

    def test_dont_expand1_lists_with_multiple_items_2(self):
        g = Lark(r"""start: list
                     ?list: item+ "!"
                     item : A
                     A: "a"
                  """)
        r = g.parse("aa!")

        # because 'list' is an expand-if-contains-one rule and we've provided more than one element it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains the two 'item's we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

    def test_empty_expand1_list(self):
        g = Lark(r"""start: list
                     ?list: item*
                     item : A
                     A: "a"
                  """)
        r = g.parse("")

        # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains no 'item's as we've given it none
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ())

    def test_empty_expand1_list_2(self):
        g = Lark(r"""start: list
                     ?list: item* "!"?
                     item : A
                     A: "a"
                  """)
        r = g.parse("")

        # because 'list' is an expand-if-contains-one rule and we've provided less than one element (i.e. none) it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains no 'item's as we've given it none
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ())
    def test_empty_flatten_list(self):
        g = Lark(r"""start: list
                     list: | item "," list
                     item : A
                     A: "a"
                  """)
        r = g.parse("")

        # Because 'list' is a flatten rule its top-level element should *never* be expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # Sanity check: verify that 'list' contains no 'item's as we've given it none
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ())

    @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
    def test_single_item_flatten_list(self):
        g = Lark(r"""start: list
                     list: | item "," list
                     item : A
                     A: "a"
                  """)
        r = g.parse("a,")

        # Because 'list' is a flatten rule its top-level element should *never* be expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item',))

    @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
    def test_multiple_item_flatten_list(self):
        g = Lark(r"""start: list
                     #list: | item "," list
                     item : A
                     A: "a"
                  """)
        r = g.parse("a,a,")

        # Because 'list' is a flatten rule its top-level element should *never* be expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

    @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
    def test_recurse_flatten(self):
        """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
        g = Lark(r"""start: a | start a
                     a : A
                     A : "a" """)
        # Force PLY to write to the debug log, but prevent writing it to the terminal (uses repr() on the half-built
        # STree data structures, which uses recursion).
        g.parse("a" * (sys.getrecursionlimit() // 4))
    def test_token_collision(self):
        g = Lark("""start: "Hello" NAME
                    NAME: /\w+/
                    WS.ignore: /\s+/
                 """, parser='lalr')
        x = g.parse('Hello World')
        self.assertSequenceEqual(x.children, ['World'])

    def test_undefined_rule(self):
        self.assertRaises(GrammarError, Lark, """start: a""", parser='lalr')

    def test_undefined_token(self):
        self.assertRaises(GrammarError, Lark, """start: A""", parser='lalr')

    def test_rule_collision(self):
        g = Lark("""start: "a"+ "b"
                         | "a"+ """, parser='lalr')
        x = g.parse('aaaa')
        x = g.parse('aaaab')

    def test_rule_collision2(self):
        g = Lark("""start: "a"* "b"
                         | "a"+ """, parser='lalr')
        x = g.parse('aaaa')
        x = g.parse('aaaab')
        x = g.parse('b')

    def test_regex_embed(self):
        g = Lark("""start: A B C
                    A: /a/
                    B: /${A}b/
                    C: /${B}c/
                 """, parser='lalr')
        x = g.parse('aababc')

    def test_token_not_anon(self):
        """Tests that "a" is matched as A, rather than an anonymous token.

        That means that "a" is not filtered out, despite being an 'immediate string'.
        Whether or not this is the intuitive behavior, I'm not sure yet.
        -Erez
        """
        g = Lark("""start: "a"
                    A: "a" """, parser='lalr')
        x = g.parse('a')
        self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous')
        self.assertEqual(x.children[0].type, "A")

    def test_maybe(self):
        g = Lark("""start: ["a"] """, parser='lalr')
        x = g.parse('a')
        x = g.parse('')

    def test_start(self):
        g = Lark("""a: "a" a? """, parser='lalr', start='a')
        x = g.parse('a')
        x = g.parse('aa')
        x = g.parse('aaa')

if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,26 @@
from __future__ import absolute_import

import unittest
from unittest import TestCase
import logging
import copy
import pickle

from lark.tree import Tree

class TestTrees(TestCase):
    def setUp(self):
        self.tree1 = Tree('a', [Tree(x, y) for x, y in zip('bcd', 'xyz')])

    def test_deepcopy(self):
        assert self.tree1 == copy.deepcopy(self.tree1)

    def test_pickle(self):
        s = copy.deepcopy(self.tree1)
        data = pickle.dumps(s)
        assert pickle.loads(data) == s

if __name__ == '__main__':
    unittest.main()
@@ -33,6 +33,19 @@ class Tree(object):
    def __eq__(self, other):
        return self.data == other.data and self.children == other.children

    def find_pred(self, pred):
        if pred(self):
            yield self
        else:
            for i, c in enumerate(self.children):
                if isinstance(c, Tree):
                    for t in c.find_pred(pred):
                        yield t

    def find_data(self, data):
        return self.find_pred(lambda t: t.data == data)

#   def find_path(self, pred):
#       if pred(self):
#           yield []
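find_data is what the reworked GrammarLoader relies on to walk the parsed grammar (tree.find_data('rule') and tree.find_data('token') above). A small sketch on a hand-built tree; note that with this implementation a matching node is yielded but its own subtree is not searched further:

    t = Tree('start', [Tree('rule', ['a']), Tree('x', [Tree('rule', ['b'])])])
    assert [r.children for r in t.find_data('rule')] == [['a'], ['b']]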