@@ -112,39 +112,30 @@ class Lark:
        self.tokens, self.rules = load_grammar(grammar)

        self.lexer = self._build_lexer()
        if not self.options.only_lex:
            self.parser_engine = ENGINE_DICT[self.options.parser]()
            self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
            self.parser = self._build_parser()
        if self.profiler: self.profiler.enter_section('outside_lark')

        self.lexer = self._build_lexer()
        if self.profiler: self.profiler.enter_section('outside_lark')

    def _create_unless_callback(self, strs):
        def f(t):
            if t in strs:
                t.type = strs[t]
            return t
        return f

    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

    def _build_lexer(self):
        ignore_tokens = []
        tokens = []
        callbacks = {}
        for name, value, flags in self.tokens:
        for tokendef, flags in self.tokens:
            for flag in flags:
                if flag == 'ignore':
                    ignore_tokens.append(name)
                elif isinstance(flag, tuple) and flag[0] == 'unless':
                    _, strs = flag
                    callbacks[name] = self._create_unless_callback(strs)
                    ignore_tokens.append(tokendef.name)
                else:
                    raise GrammarError("No such flag: %s" % flag)
            tokens.append((name, value))
        return Lexer(tokens, callbacks, ignore=ignore_tokens)
            tokens.append(tokendef)
        return Lexer(tokens, ignore=ignore_tokens)
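
For context, a minimal sketch (not part of the diff; token names are invented and the lark.lexer import path is an assumption) of the (tokendef, flags) pairs the reworked _build_lexer() now iterates over, and how an 'ignore' flag is split out:

    from lark.lexer import TokenDef__Str, TokenDef__Regexp  # assumed module path

    tokens = [
        (TokenDef__Regexp('NUMBER', '[0-9]+'), []),
        (TokenDef__Str('PLUS', '+'), []),
        (TokenDef__Regexp('WS', '[ \t]+'), ['ignore']),   # lexed but skipped
    ]

    ignore_tokens = [td.name for td, flags in tokens if 'ignore' in flags]
    assert ignore_tokens == ['WS']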
    def _build_parser(self):
        rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
@@ -155,8 +146,6 @@ class Lark:
        return self.parser_engine.build_parser(rules, callback, self.options.start)

    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC

    def lex(self, text):
        stream = self.lexer.lex(text)
        if self.options.postlex:
@@ -2,11 +2,32 @@
import re

from .utils import Str
from .utils import Str, classify

class LexError(Exception):
    pass

class TokenDef(object):
    def __init__(self, name, value):
        self.name = name
        self.value = value

    def __repr__(self):
        return ('%s(%r, %r)' % (type(self).__name__, self.name, self.value))

class TokenDef__Str(TokenDef):
    def to_regexp(self):
        return re.escape(self.value)

    priority = 0

class TokenDef__Regexp(TokenDef):
    def to_regexp(self):
        return self.value

    priority = 1
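
A quick sketch of how the two subclasses behave (using only the definitions above; the values in the comments are what re.escape and the verbatim value produce):

    plus = TokenDef__Str('PLUS', '+')
    name = TokenDef__Regexp('NAME', '[a-z]+')

    plus.to_regexp()   # '\\+'    -- string tokens are escaped before compiling
    name.to_regexp()   # '[a-z]+' -- regexp tokens are used verbatim

    # priority (0 for strings, 1 for regexps) is used later when the lexer
    # sorts its token list before building the master regexps.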
class UnexpectedInput(LexError):
    def __init__(self, seq, lex_pos, line, column):
        context = seq[lex_pos:lex_pos+5]
@@ -41,31 +62,63 @@ class Regex:
        self.pattern = pattern
        self.flags = flags

def _regexp_has_newline(r):
    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)

def _create_unless_callback(strs):
    def f(t):
        if t in strs:
            t.type = strs[t]
        return t
    return f

def _create_unless(tokens):
    tokens_by_type = classify(tokens, type)
    assert len(tokens_by_type) <= 2, tokens_by_type.keys()

    embedded_strs = set()
    callback = {}
    for retok in tokens_by_type.get(TokenDef__Regexp, []):
        unless = {}
        for strtok in tokens_by_type.get(TokenDef__Str, []):
            m = re.match(retok.value, strtok.value)
            if m and m.group(0) == strtok.value:
                embedded_strs.add(strtok.name)
                unless[strtok.value] = strtok.name
        if unless:
            callback[retok.name] = _create_unless_callback(unless)

    tokens = [t for t in tokens if t.name not in embedded_strs]
    return tokens, callback
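
To make the 'unless' mechanism concrete, a small sketch using the helpers above (token names invented; FakeToken stands in for the lexer's Token, a str subclass with a .type attribute):

    class FakeToken(str):
        pass

    tokens = [TokenDef__Regexp('NAME', '[a-z]+'), TokenDef__Str('IF', 'if')]
    tokens, callback = _create_unless(tokens)

    # 'if' is fully matched by NAME's regexp, so the IF token is dropped from
    # the token list and a callback is attached to NAME instead.
    assert [t.name for t in tokens] == ['NAME']

    t = FakeToken('if')
    t.type = 'NAME'
    t = callback['NAME'](t)
    assert t.type == 'IF'    # exact matches of "if" are re-typed to IF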
class Lexer(object):
    def __init__(self, tokens, callbacks, ignore=()):
    def __init__(self, tokens, ignore=()):
        assert all(isinstance(t, TokenDef) for t in tokens)

        self.ignore = ignore
        self.newline_char = '\n'
        tokens = list(tokens)

        # Sanitization
        token_names = {t[0] for t in tokens}
        for t in tokens:
            try:
                re.compile(t[1])
                re.compile(t.to_regexp())
            except:
                raise LexError("Cannot compile token: %s: %s" % t)
        token_names = {t.name for t in tokens}
        assert all(t in token_names for t in ignore)

        # Init
        self.tokens = tokens
        self.callbacks = callbacks
        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.to_regexp())]
        self.ignore_types = [t for t in ignore]
        self.token_types = list(token_names)
        self.type_index = {name:i for i,name in enumerate(self.token_types)}

        tokens, self.callback = _create_unless(tokens)
        assert all(self.callback.values())

        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1] or '(?s)' in t[1]]
        self.ignore_types = [self.type_index[t] for t in ignore]

        tokens.sort(key=lambda x:(x.priority, len(x.value)), reverse=True)

        self.tokens = tokens

        self.mres = self._build_mres(tokens, len(tokens))
@@ -77,11 +130,11 @@ class Lexer(object):
        mres = []
        while tokens:
            try:
                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in tokens[:max_size]))
                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.to_regexp()) for t in tokens[:max_size]))
            except AssertionError: # Yes, this is what Python provides us.. :/
                return self._build_mres(tokens, max_size//2)

            mres.append((mre, {i:self.type_index[n] for n,i in mre.groupindex.items()} ))
            mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
            tokens = tokens[max_size:]
        return mres
@@ -96,16 +149,16 @@ class Lexer(object):
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_num = type_from_index[m.lastindex]
                    if type_num not in ignore_types:
                        t = Token(self.token_types[type_num], value, lex_pos)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in self.callbacks:
                            t = self.callbacks[t.type](t)
                        if t.type in self.callback:
                            t = self.callback[t.type](t)
                        yield t
                    if type_num in newline_types:
                    if type_ in newline_types:
                        newlines = value.count(self.newline_char)
                        if newlines:
                            line += newlines
@@ -117,4 +170,3 @@ class Lexer(object):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
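
Putting the pieces together, a rough usage sketch of the reworked lexer (assuming these names are importable from lark.lexer; the grammar tokens are invented):

    from lark.lexer import Lexer, TokenDef__Str, TokenDef__Regexp

    tokens = [
        TokenDef__Regexp('NAME', '[a-z]+'),
        TokenDef__Regexp('NUMBER', '[0-9]+'),
        TokenDef__Regexp('WS', '[ \t\n]+'),
        TokenDef__Str('EQ', '='),
        TokenDef__Str('IF', 'if'),      # folded into NAME via the 'unless' callback
    ]

    lexer = Lexer(tokens, ignore=['WS'])
    toks = [(t.type, str(t)) for t in lexer.lex('if x = 12\n')]
    # Roughly: [('IF', 'if'), ('NAME', 'x'), ('EQ', '='), ('NUMBER', '12')]
    # WS is matched (and counts newlines) but is never yielded.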
@@ -1,7 +1,7 @@
import re
import codecs

from .lexer import Lexer, Token, UnexpectedInput
from .lexer import Lexer, Token, UnexpectedInput, TokenDef__Str, TokenDef__Regexp
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import LALR
@@ -278,7 +278,8 @@ class ExtractAnonTokens(InlineTransformer):
class GrammarLoader:
    def __init__(self):
        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
        tokens = [TokenDef__Regexp(name, value) for name, value in TOKENS.items()]
        self.lexer = Lexer(tokens, ignore=['WS', 'COMMENT'])

        d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
        rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
@@ -312,47 +313,27 @@ class GrammarLoader:
        extract_anon = ExtractAnonTokens(tokens, token_set)
        tree = extract_anon.transform(tree) # Adds to tokens

        tokens2 = []
        token_ref = {}
        tokendefs = []
        for name, token, flags in tokens:
            value = token.value[1:-1]
            if r'\u' in value:
                # XXX for now, you can't mix unicode escaping and unicode characters at the same token
                value = unicode_escape(value)[0]
            tokens2.append((name, token.type, value, flags))

        token_ref = {}
        re_tokens = []
        str_tokens = []
        for name, type_, value, flags in tokens2:
            if type_ == 'STRING':
                str_tokens.append((name, value, flags))
            else:
                assert type_ == 'REGEXP'
            if token.type == 'REGEXP':
                sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
                if sp:
                    value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                    for x in sp)
                re_tokens.append((name, value, flags))
                token_ref[name] = value
                tokendef = TokenDef__Regexp(name, value)
            else:
                assert token.type == 'STRING'
                tokendef = TokenDef__Str(name, value)

        embedded_strs = set()
        for re_name, re_value, re_flags in re_tokens:
            unless = {}
            for str_name, str_value, _sf in str_tokens:
                m = re.match(re_value, str_value)
                if m and m.group(0) == str_value:
                    assert not _sf, "You just broke Lark! Please email me with your grammar"
                    embedded_strs.add(str_name)
                    unless[str_value] = str_name
            if unless:
                re_flags.append(('unless', unless))

        str_tokens = [(n, re.escape(v), f) for n, v, f in str_tokens if n not in embedded_strs]

        str_tokens.sort(key=lambda x:len(x[1]), reverse=True)
        re_tokens.sort(key=lambda x:len(x[1]), reverse=True)
        tokens = str_tokens + re_tokens # Order is important!

            tokendefs.append((tokendef, flags))

        # =================
        # Process Rules
@@ -391,7 +372,7 @@ class GrammarLoader:
                if sym not in rule_set:
                    raise GrammarError("Rule '%s' used but not defined" % sym)

        return tokens, rules
        return tokendefs, rules

load_grammar = GrammarLoader().load_grammar
@@ -3,15 +3,15 @@ from ..common import ParseError, UnexpectedToken

class Parser(object):
    def __init__(self, ga, callback):
        self.ga = ga
    def __init__(self, analysis, callback):
        self.analysis = analysis
        self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
                          for rule in ga.rules}
                          for rule in analysis.rules}

    def parse(self, seq):
        states_idx = self.ga.states_idx
        states_idx = self.analysis.states_idx

        state_stack = [self.ga.init_state_idx]
        state_stack = [self.analysis.init_state_idx]
        value_stack = []
        i = 0
@@ -39,7 +39,7 @@ class Parser(object):
            res = self.callbacks[rule](s)

            if len(state_stack) == 1 and rule.origin == self.ga.start_symbol:
            if len(state_stack) == 1 and rule.origin == self.analysis.start_symbol:
                return res

            _action, new_state = get_action(rule.origin)
@@ -63,7 +63,7 @@ class Parser(object):
        assert _action == 'reduce'
        res = reduce(*rule)
        if res:
            assert state_stack == [self.ga.init_state_idx] and not value_stack, len(state_stack)
            assert state_stack == [self.analysis.init_state_idx] and not value_stack, len(state_stack)
            return res