diff --git a/examples/calc.py b/examples/calc.py
index 6d0fa1f..e5df631 100644
--- a/examples/calc.py
+++ b/examples/calc.py
@@ -22,15 +22,16 @@ calc_grammar = """
         | product "*" atom -> mul
         | product "/" atom -> div

-    ?atom: NUMBER -> number
+    ?atom: DECIMAL -> number
         | "-" atom -> neg
         | NAME -> var
         | "(" sum ")"

-    NAME: /[a-zA-Z]\w+/       // Regexp form
-    NUMBER: ("0".."9"|".")+   // EBNF form (compiles to regexp)
+    %import common.CNAME -> NAME
+    %import common.DECIMAL
+    %import common.WS_INLINE

-    %ignore " "|"\t"
+    %ignore WS_INLINE
 """

 class CalculateTree(InlineTransformer):
diff --git a/examples/json_parser.py b/examples/json_parser.py
index 0a5ac0e..53b3afb 100644
--- a/examples/json_parser.py
+++ b/examples/json_parser.py
@@ -24,10 +24,14 @@ json_grammar = r"""
     object : "{" [pair ("," pair)*] "}"
     pair : string ":" value

-    number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/
-    string : /".*?(?<!\\)"/
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
+    '_TO': '->',
+    '_IGNORE': r'%ignore',
+    '_IMPORT': r'%import',
 }

 RULES = {
@@ -93,16 +99,22 @@ RULES = {
     '?atom': ['_LPAR expansions _RPAR',
               'maybe',
-              'RULE',
-              'TOKEN',
+              'name',
               'tokenvalue',
               'range'],

+    '?name': ['RULE', 'TOKEN'],
+    'maybe': ['_LBRA expansions _RBRA'],
     'range': ['STRING _DOT _DOT STRING'],

     'token': ['TOKEN _COLON expansions _NL'],
-    'statement': ['_PERCENT RULE expansions _NL'],
+    'statement': ['ignore', 'import'],
+    'ignore': ['_IGNORE expansions _NL'],
+    'import': ['_IMPORT import_args _NL',
+               '_IMPORT import_args _TO TOKEN'],
+    'import_args': ['_import_args'],
+    '_import_args': ['name', '_import_args _DOT name'],

     'tokenvalue': ['REGEXP', 'STRING'],
 }

@@ -301,7 +313,7 @@ class TokenTreeToRegexp(Transformer):
     def expansions(self, exps):
         if len(exps) == 1:
             return exps[0]
-        return TokenValue__Regexp('%s' % ('|'.join(i.to_regexp() for i in exps)))
+        return TokenValue__Regexp('(?:%s)' % ('|'.join(i.to_regexp() for i in exps)))
     def range(self, items):
         assert all(i.type=='STRING' for i in items)
         items = [i[1:-1] for i in items]
@@ -314,43 +326,20 @@ class TokenTreeToRegexp(Transformer):
         return TokenValue__Regexp('(?:%s)%s' % (inner.to_regexp(), op))

 class Grammar:
-    def __init__(self, ruledefs, tokendefs, extra):
-        self.tokendefs = tokendefs
-        self.ruledefs = ruledefs
+    def __init__(self, rule_defs, token_defs, extra):
+        self.token_defs = token_defs
+        self.rule_defs = rule_defs
         self.extra = extra

     def compile(self, lexer=False):
         assert lexer
-        tokendefs = [(name.value, t) for name, t in self.tokendefs]
-
-        ignore = []
-        for i, t in enumerate(self.extra['ignore']):
-            name = '__IGNORE_%d'%i
-            tokendefs.append((name, t))
-            ignore.append(name)
-        self.extra['ignore'] = ignore
+        tokendefs = list(self.token_defs)

         # =================
         #  Compile Tokens
         # =================
         token_to_regexp = TokenTreeToRegexp()
-        token_dict = dict(tokendefs)
-        assert len(token_dict) == len(tokendefs), "Same name defined twice?"
-
-        # Resolve token assignments
-        while True:
-            changed = False
-            for name, token_tree in tokendefs:
-                for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
-                    for i, item in enumerate(exp.children):
-                        if isinstance(item, Token):
-                            assert item.type != 'RULE', "Rules aren't allowed inside tokens"
-                            if item.type == 'TOKEN':
-                                exp.children[i] = token_dict[item]
-                                changed = True
-            if not changed:
-                break

         # Convert tokens to strings/regexps
         tokens = []
@@ -362,9 +351,9 @@ class Grammar:
             tokendef = TokenDef__Regexp(name, regexp.to_regexp())
             tokens.append(tokendef)

-        # Resolve regexp assignments of the form /..${X}../
-        # Not sure this is even important, since you can express most regexps with EBNF
-        # TODO a nicer implementation of this
+        # Resolve regexp assignments of the form /..${X}../
+        # XXX This is deprecated, since you can express most regexps with EBNF
+        # XXX Also, since this happens after import, it can be a source of bugs
         token_dict = {td.name: td.to_regexp() for td in tokens}
         while True:
             changed = False
@@ -389,7 +378,7 @@ class Grammar:
         rule_tree_to_text = RuleTreeToText()

         rules = {}
-        for name, rule_tree in self.ruledefs:
+        for name, rule_tree in self.rule_defs:
             assert name not in rules
             tree = extract_anon.transform(rule_tree) # Adds to tokens
             rules[name] = ebnf_to_bnf.transform(tree)
@@ -417,6 +406,36 @@ class GrammarRule:



+_imported_grammars = {}
+def import_grammar(grammar_path):
+    if grammar_path not in _imported_grammars:
+        for import_path in IMPORT_PATHS:
+            with open(os.path.join(import_path, grammar_path)) as f:
+                text = f.read()
+            grammar = load_grammar(text)
+            _imported_grammars[grammar_path] = grammar
+
+    return _imported_grammars[grammar_path]
+
+
+def resolve_token_references(token_defs):
+    token_dict = dict(token_defs)
+    assert len(token_dict) == len(token_defs), "Same name defined twice?"
+
+    while True:
+        changed = False
+        for name, token_tree in token_defs:
+            for exp in chain(token_tree.find_data('expansion'), token_tree.find_data('expr')):
+                for i, item in enumerate(exp.children):
+                    if isinstance(item, Token):
+                        if item.type == 'RULE':
+                            raise GrammarError("Rules aren't allowed inside tokens (%s in %s)" % (item, name))
+                        if item.type == 'TOKEN':
+                            exp.children[i] = token_dict[item]
+                            changed = True
+        if not changed:
+            break
+

 class GrammarLoader:
     def __init__(self):
@@ -451,11 +470,51 @@ class GrammarLoader:
         statements = [c.children for c in tree.children if c.data=='statement']
         assert len(token_defs) + len(rule_defs) + len(statements) == len(tree.children)

-        # Verify correctness
-        token_names = set()
+        token_defs = [(name.value, t) for name, t in token_defs]
+
+        # Execute statements
+        ignore = []
+        for (stmt,) in statements:
+            if stmt.data == 'ignore':
+                expansions ,= stmt.children
+                ignore.append(expansions)
+            elif stmt.data == 'import':
+                dotted_path = stmt.children[0].children
+                name = stmt.children[1] if len(stmt.children)>1 else dotted_path[-1]
+                grammar_path = os.path.join(*dotted_path[:-1]) + '.g'
+                g = import_grammar(grammar_path)
+                token_tree = dict(g.token_defs)[dotted_path[-1]]
+                token_defs.append([name.value, token_tree])
+            else:
+                assert False, stmt
+
+        # Verify correctness 1
         for name, _ in token_defs:
             if name.startswith('__'):
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
+
+        # Handle ignore tokens
+        ignore_names = []
+        for i, t in enumerate(ignore):
+            if t.data == 'expansions' and len(t.children) == 1:
+                x ,= t.children
+                if x.data == 'expansion' and len(x.children) == 1:
+                    item ,= x.children
+                    if isinstance(item, Token) and item.type == 'TOKEN':
+                        # XXX is this really a wise solution? -- Erez
+                        ignore_names.append(item.value)
+                        continue
+
+            name = '__IGNORE_%d'%i
+            token_defs.append((name, t))
+            ignore_names.append(name)
+
+        # Resolve token references
+        resolve_token_references(token_defs)
+
+        # Verify correctness 2
+        token_names = set()
+        for name, _ in token_defs:
             if name in token_names:
                 raise GrammarError("Token '%s' defined more than once" % name)
             token_names.add(name)
@@ -467,7 +526,7 @@ class GrammarLoader:
             if r.name.startswith('__'):
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
             if r.name in rule_names:
-                raise GrammarError("Token '%s' defined more than once" % name)
+                raise GrammarError("Rule '%s' defined more than once" % r.name)
             rule_names.add(r.name)

         for r in rules:
@@ -481,14 +540,10 @@ class GrammarLoader:
                 if sym not in rule_names:
                     raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, r.name))

-        ignore = []
-        for command, expansions in statements:
-            if command == 'ignore':
-                ignore.append(expansions)
-            else:
-                assert False, command
+        # TODO don't include unused tokens, they can only cause trouble!
+
+        return Grammar(rule_defs, token_defs, {'ignore': ignore_names})

-        return Grammar(rule_defs, token_defs, {'ignore': ignore})


 load_grammar = GrammarLoader().load_grammar
diff --git a/lark/tree.py b/lark/tree.py
index d3e8c69..9e818ec 100644
--- a/lark/tree.py
+++ b/lark/tree.py
@@ -39,11 +39,11 @@ class Tree(object):
     def find_pred(self, pred):
         if pred(self):
             yield self
-        else:
-            for c in self.children:
-                if isinstance(c, Tree):
-                    for t in c.find_pred(pred):
-                        yield t
+
+        for c in self.children:
+            if isinstance(c, Tree):
+                for t in c.find_pred(pred):
+                    yield t

     def find_data(self, data):
         return self.find_pred(lambda t: t.data == data)
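
A minimal usage sketch (not part of the patch) showing the new %import and %ignore statements from the grammar author's side, mirroring the updated examples/calc.py. It assumes the usual Lark entry point and the common.g token definitions (CNAME, DECIMAL, WS_INLINE) resolved through IMPORT_PATHS; constructor options may differ in this version of the library.

from lark import Lark

# Token definitions are pulled from the shared common.g via %import;
# the imported WS_INLINE token is then skipped with %ignore.
assignment_grammar = """
    start: NAME "=" DECIMAL

    %import common.CNAME -> NAME
    %import common.DECIMAL
    %import common.WS_INLINE
    %ignore WS_INLINE
"""

parser = Lark(assignment_grammar)   # assumed entry point, options omitted
print(parser.parse("pi = 3.14"))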