diff --git a/lark/lark.py b/lark/lark.py
index def7368..41d0ddf 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -1,7 +1,6 @@
 from __future__ import absolute_import
 
-import functools
-import types
+import os
 
 from .utils import STRING_TYPE, inline_args
 from .load_grammar import load_grammar
@@ -153,7 +152,7 @@ class Lark:
     def _build_lexer(self):
         ignore_tokens = []
         tokens = []
-        for name, (value, flags) in self.tokens:
+        for name, value, flags in self.tokens:
             if 'ignore' in flags:
                 ignore_tokens.append(name)
             tokens.append((name, value))
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 49f3c86..5f37468 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -58,7 +58,7 @@ TOKENS = {
     'REGEXP': r"/(.|\n)*?[^\\]/",
     'NL': r'(\r?\n)+\s*',
     'WS': r'[ \t]+',
-    'COMMENT': r'#[^\n]*\n',
+    'COMMENT': r'//[^\n]*\n',
     'TO': '->'
 }
 
@@ -106,7 +106,8 @@ RULES = [
 
 class SaveDefinitions(object):
     def __init__(self):
         self.rules = {}
-        self.tokens = {}
+        self.token_set = set()
+        self.tokens = []
         self.i = 0
 
@@ -143,13 +144,14 @@ class SaveDefinitions(object):
 
     def token(self, *x):
         name = x[0].value
-        if name in self.tokens:
+        if name in self.token_set:
             raise ValueError("Token '%s' defined more than once" % name)
+        self.token_set.add(name)
         if len(x) == 4:
-            self.tokens[name] = x[2], []
+            self.tokens.append((name, x[2], []))
         else:
-            self.tokens[name] = x[3], x[1].children
+            self.tokens.append((name, x[3], x[1].children))
 
     def tokenvalue(self, tokenvalue):
         return tokenvalue
 
@@ -173,8 +175,9 @@ class SaveDefinitions(object):
         else:
             assert False, x
 
-        if token_name not in self.tokens:
-            self.tokens[token_name] = token, []
+        if token_name not in self.token_set:
+            self.token_set.add(token_name)
+            self.tokens.append((token_name, token, []))
 
         return Token('TOKEN', token_name, -1)
 
@@ -191,14 +194,19 @@ class SaveDefinitions(object):
 class EBNF_to_BNF(InlineTransformer):
     def __init__(self):
         self.new_rules = {}
+        self.rules_by_expr = {}
         self.prefix = 'anon'
         self.i = 0
 
     def _add_recurse_rule(self, type_, expr):
+        if expr in self.rules_by_expr:
+            return self.rules_by_expr[expr]
+
         new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
         self.i += 1
         t = Token('RULE', new_name, -1)
         self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
+        self.rules_by_expr[expr] = t
         return t
 
     def expr(self, rule, op):
@@ -309,9 +317,10 @@ class GrammarLoader:
         p.parse( list(self.lexer.lex(grammar_text+"\n")) )
 
         # Tokens
+        token_ref = {}
         re_tokens = []
         str_tokens = []
-        for name, (token, flags) in sd.tokens.items():
+        for name, token, flags in sd.tokens:
             value = token.value[1:-1]
             if '\u' in value:
                 # XXX for now, you can't mix unicode escaping and unicode characters at the same token
@@ -319,13 +328,19 @@ class GrammarLoader:
 
             if token.type == 'STRING':
                 value = re.escape(value)
-                str_tokens.append((name, (value, flags)))
+                str_tokens.append((name, value, flags))
             else:
                 assert token.type == 'REGEXP'
-                re_tokens.append((name, (value, flags)))
+                sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
+                if sp:
+                    value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
+                                    for x in sp)
+
+                re_tokens.append((name, value, flags))
+                token_ref[name] = value
 
-        str_tokens.sort(key=lambda x:len(x[1][0]), reverse=True)
-        re_tokens.sort(key=lambda x:len(x[1][0]), reverse=True)
+        str_tokens.sort(key=lambda x:len(x[1]), reverse=True)
+        re_tokens.sort(key=lambda x:len(x[1]), reverse=True)
         tokens = str_tokens + re_tokens # Order is important!
 
         # Rules
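
For illustration, here is a minimal standalone sketch of the ${TOKEN} reference expansion that the last hunk adds to GrammarLoader. The helper name expand_token_refs and the TOKEN_NAME pattern are stand-ins invented for this sketch (the real pattern lives in TOKENS['TOKEN']); the split/join logic mirrors the diff. Because token_ref only holds already-processed tokens, a reference must point to an earlier definition, which is why the diff changes SaveDefinitions.tokens from a dict into an order-preserving list.

    import re

    # Assumed token-name pattern, standing in for TOKENS['TOKEN'] in
    # load_grammar.py; the real grammar may define it differently.
    TOKEN_NAME = r'[A-Z][A-Z0-9_]*'

    def expand_token_refs(defs):
        # defs is a list of (name, regexp) pairs in definition order, like
        # the new list-based SaveDefinitions.tokens. Each ${NAME} reference
        # is replaced by the already-expanded pattern of an earlier token.
        token_ref = {}
        out = []
        for name, pattern in defs:
            # Split on ${NAME}; the capturing group keeps the references in
            # the result so they can be swapped for their expansions.
            parts = re.split(r'(\$\{%s})' % TOKEN_NAME, pattern)
            pattern = ''.join(token_ref[p[2:-1]] if p.startswith('${') and p.endswith('}') else p
                              for p in parts)
            out.append((name, pattern))
            token_ref[name] = pattern
        return out

    print(expand_token_refs([
        ('DIGIT', r'[0-9]'),
        ('INT', r'${DIGIT}+'),          # becomes [0-9]+
        ('FLOAT', r'${INT}\.${INT}'),   # becomes [0-9]+\.[0-9]+
    ]))

As in the diff itself, a forward reference (a ${NAME} whose token is defined later) raises a KeyError, since token_ref is filled in definition order.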