@@ -0,0 +1,64 @@
import sys

from lark.lark import Lark, inline_args
from lark.tree import Transformer

json_grammar = r"""
    ?start: value

    ?value: object
          | array
          | string
          | number
          | "true" -> true
          | "false" -> false
          | "null" -> null

    array  : "[" [value ("," value)*] "]"
    object : "{" [pair ("," pair)*] "}"
    pair   : string ":" value

    number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
    string : /".*?(?<!\\)"/

    WS.ignore.newline: /[ \t\n]+/
"""

class TreeToJson(Transformer):
    @inline_args
    def string(self, s):
        return s[1:-1]

    array = list
    pair = tuple
    object = dict
    number = inline_args(float)

    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False

json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
parse = json_parser.parse
def test():
    test_json = '''
        {
            "empty_object" : {},
            "empty_array"  : [],
            "booleans"     : { "YES" : true, "NO" : false },
            "numbers"      : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
            "strings"      : [ "This", [ "And" , "That" ] ],
            "nothing"      : null
        }
    '''

    j = parse(test_json)
    print(j)
    import json
    assert j == json.loads(test_json)

if __name__ == '__main__':
    test()
    with open(sys.argv[1]) as f:
        print(parse(f.read()))
@@ -0,0 +1,7 @@
class GrammarError(Exception):
    pass

def is_terminal(sym):
    return sym.isupper() or sym[0] == '$'
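
This helper encodes the naming convention used throughout the diff: terminal (token) names are written in uppercase, and internal bookkeeping symbols such as $root and $end begin with a dollar sign. A quick sketch of what it accepts:

assert is_terminal('NUMBER')     # token names are uppercase
assert is_terminal('$end')       # internal symbols start with '$'
assert not is_terminal('expr')   # lowercase names are rules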
@@ -1,14 +1,9 @@
from collections import defaultdict, deque
from utils import classify, classify_bool, bfs, fzset
from common import GrammarError, is_terminal

ACTION_SHIFT = 0

class GrammarError(Exception):
    pass

def is_terminal(sym):
    return sym.isupper() or sym[0] == '$'

class Rule(object):
    """
        origin : a symbol
@@ -61,9 +56,10 @@ def update_set(set1, set2):
    return set1 != copy

class GrammarAnalyzer(object):
    def __init__(self, rule_tuples):
    def __init__(self, rule_tuples, start_symbol):
        self.start_symbol = start_symbol
        rule_tuples = list(rule_tuples)
        rule_tuples.append(('$root', ['start', '$end']))
        rule_tuples.append(('$root', [start_symbol, '$end']))
        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]

        self.rules = set()
@@ -78,7 +74,7 @@ class GrammarAnalyzer(object):
            if not (is_terminal(sym) or sym in self.rules_by_origin):
                raise GrammarError("Using an undefined rule: %s" % sym)

        self.init_state = self.expand_rule('start')
        self.init_state = self.expand_rule(start_symbol)

    def expand_rule(self, rule):
        "Returns all init_ptrs accessible by rule (recursive)"
@@ -7,8 +7,8 @@ from .load_grammar import load_grammar
from .tree import Tree, Transformer
from .lexer import Lexer
from .grammar_analysis import GrammarAnalyzer, is_terminal
from . import parser, earley
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import ENGINE_DICT

class LarkOptions(object):
    """Specifies the options for Lark

@@ -23,6 +23,7 @@ class LarkOptions(object):
        keep_all_tokens - Don't automagically remove "punctuation" tokens (default: True)
        cache_grammar - Cache the Lark grammar (Default: False)
        postlex - Lexer post-processing (Default: None)
        start - The start symbol (Default: start)
    """
    __doc__ += OPTIONS_DOC

    def __init__(self, options_dict):
@@ -36,6 +37,7 @@ class LarkOptions(object):
        self.postlex = o.pop('postlex', None)
        self.parser = o.pop('parser', 'earley')
        self.transformer = o.pop('transformer', None)
        self.start = o.pop('start', 'start')

        assert self.parser in ENGINE_DICT
        if self.parser == 'earley' and self.transformer:
@@ -47,71 +49,8 @@ class LarkOptions(object):
            raise ValueError("Unknown options: %s" % o.keys())

class Callback(object):
    pass

class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x
    def expansion(self, symbols):
        return [sym.value for sym in symbols], None
    def alias(self, ((expansion, _alias), alias)):
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value

def create_rule_handler(expansion, usermethod):
    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
                  if not (is_terminal(sym) and sym.startswith('_'))]

    def _build_ast(match):
        children = []
        for i, to_expand in to_include:
            if to_expand:
                children += match[i].children
            else:
                children.append(match[i])
        return usermethod(children)
    return _build_ast

def create_expand1_tree_builder_function(tree_builder):
    def f(children):
        if len(children) == 1:
            return children[0]
        else:
            return tree_builder(children)
    return f

class LALR:
    def build_parser(self, rules, callback):
        ga = GrammarAnalyzer(rules)
        ga.analyze()
        return parser.Parser(ga, callback)

class Earley:
    @staticmethod
    def _process_expansion(x):
        return [{'literal': s} if is_terminal(s) else s for s in x]

    def build_parser(self, rules, callback):
        rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules]
        return EarleyParser(earley.Parser(rules, 'start'))

class EarleyParser:
    def __init__(self, parser):
        self.parser = parser

    def parse(self, text):
        res = self.parser.parse(text)
        assert len(res) == 1, 'Ambiguous parse! Not handled yet'
        return res[0]

ENGINE_DICT = { 'lalr': LALR, 'earley': Earley }

class Lark:
    def __init__(self, grammar, **options):
        """
@@ -147,6 +86,7 @@ class Lark:
        self.lexer = self._build_lexer()

        if not self.options.only_lex:
            self.parser_engine = ENGINE_DICT[self.options.parser]()
            self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
            self.parser = self._build_parser()

    def _build_lexer(self):
@@ -160,50 +100,12 @@ class Lark:
    def _build_parser(self):
        transformer = self.options.transformer
        callback = Callback()
        rules = []
        rule_tree_to_text = RuleTreeToText()
        for origin, tree in self.rules.items():
            for expansion, alias in rule_tree_to_text.transform(tree):
                if alias and origin.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)

                expand1 = origin.startswith('?')
                _origin = origin.lstrip('?*')
                if alias:
                    alias = alias.lstrip('*')
                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))

                try:
                    f = transformer._get_func(alias or _origin)
                    # f = getattr(transformer, alias or _origin)
                except AttributeError:
                    if alias:
                        f = self._create_tree_builder_function(alias)
                    else:
                        f = self._create_tree_builder_function(_origin)

                if expand1:
                    f = create_expand1_tree_builder_function(f)

                alias_handler = create_rule_handler(expansion, f)

                assert not hasattr(callback, _alias)
                setattr(callback, _alias, alias_handler)

                rules.append((_origin, expansion, _alias))

        return self.parser_engine.build_parser(rules, callback)
        rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
        return self.parser_engine.build_parser(rules, callback, self.options.start)

    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

    def _create_tree_builder_function(self, name):
        tree_class = self.options.tree_class
        def f(children):
            return tree_class(name, children)
        return f

    def lex(self, text):
        stream = self.lexer.lex(text)
        if self.options.postlex:
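
The visible API change from this refactor is the configurable entry rule; tree construction now lives in ParseTreeBuilder and engine selection in parser_frontends. A minimal usage sketch, mirroring test_start at the bottom of this diff:

from lark.lark import Lark

g = Lark('a: "a" a? ', parser='lalr', start='a')   # entry rule need not be named 'start'
print(g.parse('aaa'))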
@@ -1,16 +1,18 @@
import re
import codecs

from lexer import Lexer, Token
from grammar_analysis import GrammarAnalyzer
from parser import Parser
from .lexer import Lexer, Token
from tree import Tree as T, Transformer, InlineTransformer, Visitor
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
from .common import is_terminal, GrammarError
from .tree import Tree as T, Transformer, InlineTransformer, Visitor

unicode_escape = codecs.getdecoder('unicode_escape')

_TOKEN_NAMES = {
    ':' : 'COLON',
    ':' : '_COLON',
    ',' : 'COMMA',
    ';' : 'SEMICOLON',
    '+' : 'PLUS',
@@ -26,7 +28,7 @@ _TOKEN_NAMES = {
    '<' : 'LESSTHAN',
    '>' : 'MORETHAN',
    '=' : 'EQUAL',
    '.' : 'DOT',
    '.' : '_DOT',
    '%' : 'PERCENT',
    '`' : 'BACKQUOTE',
    '^' : 'CIRCUMFLEX',
@@ -34,8 +36,8 @@ _TOKEN_NAMES = {
    '\'' : 'QUOTE',
    '~' : 'TILDE',
    '@' : 'AT',
    '(' : 'LPAR',
    ')' : 'RPAR',
    '(' : '_LPAR',
    ')' : '_RPAR',
    '{' : 'LBRACE',
    '}' : 'RBRACE',
    '[' : 'LSQB',
# Grammar Parser
TOKENS = {
    'LPAR': '\(',
    'RPAR': '\)',
    'LBRA': '\[',
    'RBRA': '\]',
    '_LPAR': '\(',
    '_RPAR': '\)',
    '_LBRA': '\[',
    '_RBRA': '\]',
    'OP': '[+*?]',
    'COLON': ':',
    'OR': '\|',
    'DOT': '\.',
    '_COLON': ':',
    '_OR': '\|',
    '_DOT': '\.',
    'RULE': '[_?*]?[a-z][_a-z0-9]*',
    'TOKEN': '_?[A-Z][_A-Z0-9]*',
    'STRING': r'".*?[^\\]"',
    'REGEXP': r"/(.|\n)*?[^\\]/",
    'NL': r'(\r?\n)+\s*',
    '_NL': r'(\r?\n)+\s*',
    'WS': r'[ \t]+',
    'COMMENT': r'//[^\n]*\n',
    'TO': '->'
    '_TO': '->'
}

RULES = [
    ('start', ['list']),
    ('list', ['item']),
    ('list', ['list', 'item']),
    ('item', ['rule']),
    ('item', ['token']),
    ('item', ['NL']),
    ('rule', ['RULE', 'COLON', 'expansions', 'NL']),
    ('expansions', ['expansion']),
    ('expansions', ['expansions', 'OR', 'expansion']),
    ('expansions', ['expansions', 'NL', 'OR', 'expansion']),
    ('expansion', ['_expansion']),
    ('expansion', ['_expansion', 'TO', 'RULE']),
RULES = {
    'start': ['list'],
    'list': ['item', 'list item'],
    'item': ['rule', 'token', '_NL'],
    ('_expansion', []),
    ('_expansion', ['_expansion', 'expr']),
    'rule': ['RULE _COLON expansions _NL'],
    'expansions': ['expansion',
                   'expansions _OR expansion',
                   'expansions _NL _OR expansion'],
    ('expr', ['atom']),
    ('expr', ['atom', 'OP']),
    'expansion': ['_expansion',
                  '_expansion _TO RULE'],
    ('atom', ['LPAR', 'expansions', 'RPAR']),
    ('atom', ['maybe']),
    '_expansion': ['', '_expansion expr'],
    ('atom', ['RULE']),
    ('atom', ['TOKEN']),
    ('atom', ['anontoken']),
    '?expr': ['atom',
              'atom OP'],
    ('anontoken', ['tokenvalue']),
    '?atom': ['_LPAR expansions _RPAR',
              'maybe',
              'RULE',
              'TOKEN',
              'anontoken'],
    ('maybe', ['LBRA', 'expansions', 'RBRA']),
    'anontoken': ['tokenvalue'],
    ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']),
    ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']),
    ('tokenvalue', ['REGEXP']),
    ('tokenvalue', ['STRING']),
    ('tokenmods', ['DOT', 'RULE']),
    ('tokenmods', ['tokenmods', 'DOT', 'RULE']),
]
class SaveDefinitions(object):
    def __init__(self):
        self.rules = {}
        self.token_set = set()
        self.tokens = []
        self.i = 0
    'maybe': ['_LBRA expansions _RBRA'],

    'token': ['TOKEN _COLON tokenvalue _NL',
              'TOKEN tokenmods _COLON tokenvalue _NL'],

    def atom__3(self, _1, value, _2):
        return value
    def atom__1(self, value):
        return value

    def expr__1(self, expr):
        return expr
    def expr(self, *x):
        return T('expr', x)

    def expansion__1(self, expansion):
        return expansion
    def expansion__3(self, expansion, _, alias):
        return T('alias', [expansion, alias])

    def _expansion(self, *x):
        return T('expansion', x)

    def expansions(self, *x):
        items = [i for i in x if isinstance(i, T)]
        return T('expansions', items)

    def maybe(self, _1, expr, _2):
        return T('expr', [expr, Token('OP', '?', -1)])

    def rule(self, name, _1, expansion, _2):
        name = name.value
        if name in self.rules:
            raise ValueError("Rule '%s' defined more than once" % name)
        self.rules[name] = expansion

    def token(self, *x):
        name = x[0].value
        if name in self.token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        self.token_set.add(name)
        if len(x) == 4:
            self.tokens.append((name, x[2], []))
        else:
            self.tokens.append((name, x[3], x[1].children))

    def tokenvalue(self, tokenvalue):
        return tokenvalue

    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                token_name = _TOKEN_NAMES[value]
            except KeyError:
                if value.isalnum() and value[0].isalpha():
                    token_name = value.upper()
                else:
                    token_name = 'ANONSTR_%d' % self.i
                    self.i += 1
                token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, x

        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))

        return Token('TOKEN', token_name, -1)

    def tokenmods__2(self, _, rule):
        return T('tokenmods', [rule.value])
    def tokenmods__3(self, tokenmods, _, rule):
        return T('tokenmods', tokenmods.children + [rule.value])

    def start(self, *x): pass
    def list(self, *x): pass
    def item(self, *x): pass
    '?tokenvalue': ['REGEXP', 'STRING'],

    'tokenmods': ['_DOT RULE', 'tokenmods _DOT RULE'],
}
class EBNF_to_BNF(InlineTransformer):
@@ -281,46 +190,110 @@ def dict_update_safe(d1, d2):
            d1[k] = v

def generate_aliases():
    sd = SaveDefinitions()
    for name, expansion in RULES:
        try:
            f = getattr(sd, "%s__%s" % (name, len(expansion)))
        except AttributeError:
            f = getattr(sd, name)
        yield name, expansion, f.__name__

class RuleTreeToText(Transformer):
    def expansions(self, x):
        return x
    def expansion(self, symbols):
        return [sym.value for sym in symbols], None
    def alias(self, ((expansion, _alias), alias)):
        assert _alias is None, (alias, expansion, '-', _alias)
        return expansion, alias.value

class SimplifyTree(InlineTransformer):
    def maybe(self, expr):
        return T('expr', [expr, Token('OP', '?', -1)])

    def tokenmods(self, *args):
        if len(args) == 1:
            return list(args)
        tokenmods, value = args
        return tokenmods + [value]

def get_tokens(tree, token_set):
    tokens = []
    for t in tree.find_data('token'):
        x = t.children
        name = x[0].value
        assert not name.startswith('__'), 'Names starting with double-underscore are reserved (Error at %s)' % name
        if name in token_set:
            raise ValueError("Token '%s' defined more than once" % name)
        token_set.add(name)

        if len(x) == 2:
            yield name, x[1], []
        else:
            assert len(x) == 3
            yield name, x[2], x[1]
class ExtractAnonTokens(InlineTransformer):
    def __init__(self, tokens, token_set):
        self.tokens = tokens
        self.token_set = token_set
        self.token_reverse = {value[1:-1]: name for name, value, _flags in tokens}
        self.i = 0   # counter for numbering anonymous tokens (used below)
    def anontoken(self, token):
        if token.type == 'STRING':
            value = token.value[1:-1]
            try:
                # If already defined, use the user-defined token name
                token_name = self.token_reverse[value]
            except KeyError:
                # Try to assign an indicative anon-token name, otherwise use a numbered name
                try:
                    token_name = _TOKEN_NAMES[value]
                except KeyError:
                    if value.isalnum() and value[0].isalpha():
                        token_name = value.upper()
                    else:
                        token_name = 'ANONSTR_%d' % self.i
                        self.i += 1
                token_name = '__' + token_name
        elif token.type == 'REGEXP':
            token_name = 'ANONRE_%d' % self.i
            self.i += 1
        else:
            assert False, token
        if token_name not in self.token_set:
            self.token_set.add(token_name)
            self.tokens.append((token_name, token, []))

        return Token('TOKEN', token_name, -1)

def inline_args(f):
    def _f(self, args):
        return f(*args)
    return _f
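
inline_args adapts a callback written with positional parameters to the parser's convention of passing a single list of children. A tiny sketch with a hypothetical callback class:

class Callbacks:
    pair = inline_args(lambda key, value: (key, value))   # hypothetical rule callback

# The parser always invokes callbacks with one list argument:
assert Callbacks().pair(['a', 1]) == ('a', 1)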
class GrammarLoader:
    def __init__(self):
        self.rules = list(generate_aliases())
        self.ga = GrammarAnalyzer(self.rules)
        self.ga.analyze()

        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])

        self.simplify_rule = SimplifyRule_Visitor()

    def _generate_parser_callbacks(self, callbacks):
        d = {alias: inline_args(getattr(callbacks, alias))
             for _n, _x, alias in self.rules}
        return type('Callback', (), d)()

        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
        self.parser = LALR().build_parser(rules, callback, 'start')

        self.simplify_tree = SimplifyTree()
        self.simplify_rule = SimplifyRule_Visitor()
        self.rule_tree_to_text = RuleTreeToText()

    def load_grammar(self, grammar_text):
        sd = SaveDefinitions()
        c = self._generate_parser_callbacks(sd)

        p = Parser(self.ga, c)
        p.parse( list(self.lexer.lex(grammar_text+"\n")) )
        token_stream = list(self.lexer.lex(grammar_text+"\n"))
        tree = self.simplify_tree.transform( self.parser.parse(token_stream) )

        # =================
        #  Process Tokens
        # =================

        token_set = set()
        tokens = list(get_tokens(tree, token_set))
        extract_anon = ExtractAnonTokens(tokens, token_set)
        tree = extract_anon.transform(tree) # Adds to tokens

        # Tokens
        token_ref = {}
        re_tokens = []
        str_tokens = []
        for name, token, flags in sd.tokens:
        for name, token, flags in tokens:
            value = token.value[1:-1]
            if '\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters in the same token
@@ -343,43 +316,70 @@ class GrammarLoader:
        re_tokens.sort(key=lambda x:len(x[1]), reverse=True)

        tokens = str_tokens + re_tokens # Order is important!

        # Rules
        # =================
        #  Process Rules
        # =================

        ebnf_to_bnf = EBNF_to_BNF()

        rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
        rules = {}
        for rule in tree.find_data('rule'):
            name, ebnf_tree = rule.children
            name = name.value
            if name in rules:
                raise ValueError("Rule '%s' defined more than once" % name)
            rules[name] = ebnf_to_bnf.transform(ebnf_tree)

        dict_update_safe(rules, ebnf_to_bnf.new_rules)

        for r in rules.values():
            self.simplify_rule.visit(r)

        rules = {origin: self.rule_tree_to_text.transform(tree) for origin, tree in rules.items()}

        # ====================
        #  Verify correctness
        # ====================
        used_symbols = {symbol for expansions in rules.values()
                        for expansion, _alias in expansions
                        for symbol in expansion}
        rule_set = {r.lstrip('?') for r in rules}
        for sym in used_symbols:
            if is_terminal(sym):
                if sym not in token_set:
                    raise GrammarError("Token '%s' used but not defined" % sym)
            else:
                if sym not in rule_set:
                    raise GrammarError("Rule '%s' used but not defined" % sym)

        return tokens, rules

load_grammar = GrammarLoader().load_grammar
def test():
    g = """
    start: add

    # Rules
    // Rules
    add: mul
       | add _add_sym mul

    mul: _atom
       | mul _add_mul _atom
    mul: [mul _add_mul] _atom

    neg: "-" _atom

    _atom: neg
         | number
    _atom: "-" _atom -> neg
         | NUMBER
         | "(" add ")"

    # Tokens
    number: /[\d.]+/
    // Tokens
    NUMBER: /[\d.]+/
    _add_sym: "+" | "-"
    _add_mul: "*" | "/"
    WS.ignore: /\s+/
    WS.ignore.newline: /\s+/
    """

    g2 = """
@@ -389,7 +389,9 @@ def test():
    c: "c"
    d: "+" | "-"
    """
    load_grammar(g)
    # print load_grammar(g)
    print(GrammarLoader().load_grammar(g))

if __name__ == '__main__':
    test()
@@ -0,0 +1,76 @@
from .grammar_analysis import is_terminal

class Callback(object):
    pass

def create_expand1_tree_builder_function(tree_builder):
    def f(children):
        if len(children) == 1:
            return children[0]
        else:
            return tree_builder(children)
    return f

def create_rule_handler(expansion, usermethod):
    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
                  if not (is_terminal(sym) and sym.startswith('_'))]

    def _build_ast(match):
        children = []
        for i, to_expand in to_include:
            if to_expand:
                children += match[i].children
            else:
                children.append(match[i])
        return usermethod(children)
    return _build_ast

class ParseTreeBuilder:
    def __init__(self, tree_class):
        self.tree_class = tree_class

    def _create_tree_builder_function(self, name):
        tree_class = self.tree_class
        def f(children):
            return tree_class(name, children)
        return f

    def create_tree_builder(self, rules, transformer):
        callback = Callback()
        new_rules = []
        for origin, expansions in rules.items():
            expand1 = origin.startswith('?')
            _origin = origin.lstrip('?*')

            for expansion, alias in expansions:
                if alias and origin.startswith('_'):
                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)
                if alias:
                    alias = alias.lstrip('*')
                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))

                try:
                    f = transformer._get_func(alias or _origin)
                except AttributeError:
                    if alias:
                        f = self._create_tree_builder_function(alias)
                    else:
                        f = self._create_tree_builder_function(_origin)

                if expand1:
                    f = create_expand1_tree_builder_function(f)

                alias_handler = create_rule_handler(expansion, f)

                assert not hasattr(callback, _alias)
                setattr(callback, _alias, alias_handler)

                new_rules.append(( _origin, expansion, _alias ))

        return new_rules, callback
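
A worked sketch of create_rule_handler's filtering, with hypothetical symbol names: underscore-prefixed tokens (here '_COLON') are dropped from the tree entirely, while underscore-prefixed rules (here '_expr_list') are inlined by splicing their children into the parent:

from collections import namedtuple
Node = namedtuple('Node', 'data children')   # stand-in for a parse-tree node

handler = create_rule_handler(['NAME', '_COLON', '_expr_list'], usermethod=list)
match = ['x', ':', Node('_expr_list', [1, 2])]
assert handler(match) == ['x', 1, 2]   # '_COLON' dropped, '_expr_list' spliced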
@@ -34,7 +34,7 @@ class Parser(object):
                res = self.callbacks[rule]([x[0] for x in s])

                if rule.origin == 'start':
                if rule.origin == self.ga.start_symbol and len(stack) == 1:
                    return res

                _action, new_state = get_action(rule.origin)
@@ -0,0 +1,31 @@
from .grammar_analysis import GrammarAnalyzer
from .common import is_terminal
from . import parser, earley

class LALR:
    def build_parser(self, rules, callback, start):
        ga = GrammarAnalyzer(rules, start)
        ga.analyze()
        return parser.Parser(ga, callback)

class Earley:
    @staticmethod
    def _process_expansion(x):
        return [{'literal': s} if is_terminal(s) else s for s in x]

    def build_parser(self, rules, callback, start):
        rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules]
        return EarleyParser(earley.Parser(rules, start))

class EarleyParser:
    def __init__(self, parser):
        self.parser = parser

    def parse(self, text):
        res = self.parser.parse(text)
        assert len(res) == 1, 'Ambiguous parse! Not handled yet'
        return res[0]

ENGINE_DICT = { 'lalr': LALR, 'earley': Earley }
@@ -0,0 +1,14 @@
from __future__ import absolute_import, print_function

import unittest
import logging

from .test_trees import TestTrees
# from .test_selectors import TestSelectors
from .test_parser import TestLalr
# from .test_grammars import TestPythonG, TestConfigG

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,326 @@
from __future__ import absolute_import

import unittest
import logging
import os
import sys

try:
    from cStringIO import StringIO as cStringIO
except ImportError:
    # cStringIO is only available in Python 2.x; Python 3 uses io.StringIO, imported below
    cStringIO = None
from io import (
    StringIO as uStringIO,
    open,
)

logging.basicConfig(level=logging.INFO)

from lark.lark import Lark
from lark.grammar_analysis import GrammarError
from lark.parser import ParseError

__path__ = os.path.dirname(__file__)

def _read(n, *args):
    with open(os.path.join(__path__, n), *args) as f:
        return f.read()
class TestLalr(unittest.TestCase):
    def test_basic1(self):
        g = Lark("""start: a+ b a* "b" a*
                    b: "b"
                    a: "a"
                 """, parser='lalr')
        r = g.parse('aaabaab')
        self.assertEqual( ''.join(x.data for x in r.children), 'aaabaa' )
        r = g.parse('aaabaaba')
        self.assertEqual( ''.join(x.data for x in r.children), 'aaabaaa' )

        self.assertRaises(ParseError, g.parse, 'aaabaa')

    def test_basic2(self):
        # Multiple parsers and colliding tokens
        g = Lark("""start: B A
                    B: "12"
                    A: "1" """)
        g2 = Lark("""start: B A
                     B: "12"
                     A: "2" """)
        x = g.parse('121')
        assert x.data == 'start' and x.children == ['12', '1'], x
        x = g2.parse('122')
        assert x.data == 'start' and x.children == ['12', '2'], x

    def test_basic3(self):
        "Tests that Earley and LALR parsers produce equal trees"
        g = Lark("""start: "(" name_list ("," "*" NAME)? ")"
                    name_list: NAME | name_list "," NAME
                    NAME: /\w+/ """, parser='lalr')
        l = g.parse('(a,b,c,*x)')

        g = Lark("""start: "(" name_list ("," "*" NAME)? ")"
                    name_list: NAME | name_list "," NAME
                    NAME: /\w+/ """)
        l2 = g.parse('(a,b,c,*x)')
        assert l == l2, '%s != %s' % (l.pretty(), l2.pretty())

    @unittest.skipIf(cStringIO is None, "cStringIO not available")
    def test_stringio_bytes(self):
        """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
        Lark(cStringIO(b'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

    def test_stringio_unicode(self):
        """Verify that a Lark can be created from file-like objects other than Python's standard 'file' object"""
        Lark(uStringIO(u'start: a+ b a* "b" a*\n b: "b"\n a: "a" '))

    def test_unicode(self):
        g = Lark(u"""start: UNIA UNIB UNIA
                     UNIA: /\xa3/
                     UNIB: /\u0101/
                  """)
        g.parse(u'\xa3\u0101\u00a3')

    def test_unicode2(self):
        g = Lark(r"""start: UNIA UNIB UNIA UNIC
                     UNIA: /\xa3/
                     UNIB: "a\u0101b\ "
                     UNIC: /a?\u0101c\n/
                  """)
        g.parse(u'\xa3a\u0101b\\ \u00a3\u0101c\n')
    def test_recurse_expansion(self):
        """Verify that stack depth doesn't get exceeded on recursive rules marked for expansion."""
        g = Lark(r"""start: a | start a
                     a : "a" """)

        # Parse a deeply recursive input, but avoid printing the half-built tree:
        # repr() on the partially-built tree structures is itself recursive.
        g.parse("a" * (sys.getrecursionlimit() // 4))
    def test_expand1_lists_with_one_item(self):
        g = Lark(r"""start: list
                     ?list: item+
                     item : A
                     A: "a"
                  """)
        r = g.parse("a")

        # because 'list' is an expand-if-contains-one rule and we only provided one element, it should have expanded to 'item'
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

    def test_expand1_lists_with_one_item_2(self):
        g = Lark(r"""start: list
                     ?list: item+ "!"
                     item : A
                     A: "a"
                  """)
        r = g.parse("a!")

        # because 'list' is an expand-if-contains-one rule and we only provided one element, it should have expanded to 'item'
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('item',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

    def test_dont_expand1_lists_with_multiple_items(self):
        g = Lark(r"""start: list
                     ?list: item+
                     item : A
                     A: "a"
                  """)
        r = g.parse("aa")

        # because 'list' is an expand-if-contains-one rule and we've provided more than one element, it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains the two 'item's we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))
    def test_dont_expand1_lists_with_multiple_items_2(self):
        g = Lark(r"""start: list
                     ?list: item+ "!"
                     item : A
                     A: "a"
                  """)
        r = g.parse("aa!")

        # because 'list' is an expand-if-contains-one rule and we've provided more than one element, it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains the two 'item's we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))

    def test_empty_expand1_list(self):
        g = Lark(r"""start: list
                     ?list: item*
                     item : A
                     A: "a"
                  """)
        r = g.parse("")

        # because 'list' is an expand-if-contains-one rule and we've provided no elements at all, it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains no 'item's as we've given it none
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ())

    def test_empty_expand1_list_2(self):
        g = Lark(r"""start: list
                     ?list: item* "!"?
                     item : A
                     A: "a"
                  """)
        r = g.parse("")

        # because 'list' is an expand-if-contains-one rule and we've provided no elements at all, it should *not* have expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # regardless of the amount of items: there should be only *one* child in 'start' because 'list' isn't an expand-all rule
        self.assertEqual(len(r.children), 1)

        # Sanity check: verify that 'list' contains no 'item's as we've given it none
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ())

    def test_empty_flatten_list(self):
        g = Lark(r"""start: list
                     list: | item "," list
                     item : A
                     A: "a"
                  """)
        r = g.parse("")

        # Because 'list' is a flatten rule, its top-level element should *never* be expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # Sanity check: verify that 'list' contains no 'item's as we've given it none
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ())
    @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
    def test_single_item_flatten_list(self):
        g = Lark(r"""start: list
                     list: | item "," list
                     item : A
                     A: "a"
                  """)
        r = g.parse("a,")

        # Because 'list' is a flatten rule, its top-level element should *never* be expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # Sanity check: verify that 'list' contains exactly the one 'item' we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item',))

    @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
    def test_multiple_item_flatten_list(self):
        g = Lark(r"""start: list
                     list: | item "," list
                     item : A
                     A: "a"
                  """)
        r = g.parse("a,a,")

        # Because 'list' is a flatten rule, its top-level element should *never* be expanded
        self.assertSequenceEqual([subtree.data for subtree in r.children], ('list',))

        # Sanity check: verify that 'list' contains exactly the two 'item's we've given it
        [list] = r.children
        self.assertSequenceEqual([item.data for item in list.children], ('item', 'item'))
    @unittest.skipIf(True, "Flattening list isn't implemented (and may never be)")
    def test_recurse_flatten(self):
        """Verify that stack depth doesn't get exceeded on recursive rules marked for flattening."""
        g = Lark(r"""start: a | start a
                     a : A
                     A : "a" """)

        # Parse a deeply recursive input, but avoid printing the half-built tree:
        # repr() on the partially-built tree structures is itself recursive.
        g.parse("a" * (sys.getrecursionlimit() // 4))
    def test_token_collision(self):
        g = Lark("""start: "Hello" NAME
                    NAME: /\w+/
                    WS.ignore: /\s+/
                 """, parser='lalr')
        x = g.parse('Hello World')
        self.assertSequenceEqual(x.children, ['World'])

    def test_undefined_rule(self):
        self.assertRaises(GrammarError, Lark, """start: a""", parser='lalr')

    def test_undefined_token(self):
        self.assertRaises(GrammarError, Lark, """start: A""", parser='lalr')

    def test_rule_collision(self):
        g = Lark("""start: "a"+ "b"
                         | "a"+ """, parser='lalr')
        x = g.parse('aaaa')
        x = g.parse('aaaab')

    def test_rule_collision2(self):
        g = Lark("""start: "a"* "b"
                         | "a"+ """, parser='lalr')
        x = g.parse('aaaa')
        x = g.parse('aaaab')
        x = g.parse('b')

    def test_regex_embed(self):
        g = Lark("""start: A B C
                    A: /a/
                    B: /${A}b/
                    C: /${B}c/
                 """, parser='lalr')
        x = g.parse('aababc')

    def test_token_not_anon(self):
        """Tests that "a" is matched as A, rather than an anonymous token.

        That means that "a" is not filtered out, despite being an 'immediate string'.
        Whether or not this is the intuitive behavior, I'm not sure yet.
        -Erez
        """
        g = Lark("""start: "a"
                    A: "a" """, parser='lalr')
        x = g.parse('a')
        self.assertEqual(len(x.children), 1, '"a" should not be considered anonymous')
        self.assertEqual(x.children[0].type, "A")

    def test_maybe(self):
        g = Lark("""start: ["a"] """, parser='lalr')
        x = g.parse('a')
        x = g.parse('')

    def test_start(self):
        g = Lark("""a: "a" a? """, parser='lalr', start='a')
        x = g.parse('a')
        x = g.parse('aa')
        x = g.parse('aaa')

if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,26 @@
from __future__ import absolute_import

import unittest   # needed for unittest.main() below
from unittest import TestCase
import logging
import copy
import pickle

from lark.tree import Tree

class TestTrees(TestCase):
    def setUp(self):
        self.tree1 = Tree('a', [Tree(x, y) for x, y in zip('bcd', 'xyz')])

    def test_deepcopy(self):
        assert self.tree1 == copy.deepcopy(self.tree1)

    def test_pickle(self):
        s = copy.deepcopy(self.tree1)
        data = pickle.dumps(s)
        assert pickle.loads(data) == s

if __name__ == '__main__':
    unittest.main()
@@ -33,6 +33,19 @@ class Tree(object):
    def __eq__(self, other):
        return self.data == other.data and self.children == other.children

    def find_pred(self, pred):
        if pred(self):
            yield self
        else:
            for c in self.children:
                if isinstance(c, Tree):
                    for t in c.find_pred(pred):
                        yield t

    def find_data(self, data):
        return self.find_pred(lambda t: t.data == data)

#   def find_path(self, pred):
#       if pred(self):
#           yield []
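
find_data is what the new grammar loader uses to pull 'token' and 'rule' definitions out of the parsed meta-grammar. A quick sketch of its behavior; note that find_pred does not descend into a subtree once it matches:

t = Tree('start', [Tree('item', []), Tree('other', [Tree('item', [])])])
assert [s.data for s in t.find_data('item')] == ['item', 'item']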