@@ -112,39 +112,30 @@ class Lark:
         self.tokens, self.rules = load_grammar(grammar)
-        self.lexer = self._build_lexer()
         if not self.options.only_lex:
             self.parser_engine = ENGINE_DICT[self.options.parser]()
             self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
             self.parser = self._build_parser()
-        if self.profiler: self.profiler.enter_section('outside_lark')
+        self.lexer = self._build_lexer()
+        if self.profiler: self.profiler.enter_section('outside_lark')
-    def _create_unless_callback(self, strs):
-        def f(t):
-            if t in strs:
-                t.type = strs[t]
-            return t
-        return f
+    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
     def _build_lexer(self):
         ignore_tokens = []
         tokens = []
-        callbacks = {}
-        for name, value, flags in self.tokens:
+        for tokendef, flags in self.tokens:
             for flag in flags:
                 if flag == 'ignore':
-                    ignore_tokens.append(name)
-                elif isinstance(flag, tuple) and flag[0] == 'unless':
-                    _, strs = flag
-                    callbacks[name] = self._create_unless_callback(strs)
+                    ignore_tokens.append(tokendef.name)
                 else:
                     raise GrammarError("No such flag: %s" % flag)
-            tokens.append((name, value))
-        return Lexer(tokens, callbacks, ignore=ignore_tokens)
+            tokens.append(tokendef)
+        return Lexer(tokens, ignore=ignore_tokens)
     def _build_parser(self):
         rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
@@ -155,8 +146,6 @@ class Lark:
         return self.parser_engine.build_parser(rules, callback, self.options.start)
-    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
     def lex(self, text):
         stream = self.lexer.lex(text)
         if self.options.postlex:
@@ -2,11 +2,32 @@
 import re
-from .utils import Str
+from .utils import Str, classify
 class LexError(Exception):
     pass
+class TokenDef(object):
+    def __init__(self, name, value):
+        self.name = name
+        self.value = value
+    def __repr__(self):
+        return ('%s(%r, %r)' % (type(self).__name__, self.name, self.value))
+class TokenDef__Str(TokenDef):
+    def to_regexp(self):
+        return re.escape(self.value)
+    priority = 0
+class TokenDef__Regexp(TokenDef):
+    def to_regexp(self):
+        return self.value
+    priority = 1
 class UnexpectedInput(LexError):
     def __init__(self, seq, lex_pos, line, column):
         context = seq[lex_pos:lex_pos+5]
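
For reference, a minimal sketch of how the two TokenDef variants behave, assuming the patched module above is importable as lark.lexer (illustrative only, not part of the diff):

    # Hedged sketch: a literal token escapes its value, a regexp token uses it verbatim.
    import re
    from lark.lexer import TokenDef__Str, TokenDef__Regexp

    plus = TokenDef__Str('PLUS', '+')
    number = TokenDef__Regexp('NUMBER', r'\d+')

    assert plus.to_regexp() == re.escape('+')   # '\\+'
    assert number.to_regexp() == r'\d+'
    # The priority attribute (Str=0, Regexp=1) is what Lexer.__init__ sorts on below.
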
@@ -41,31 +62,63 @@ class Regex:
         self.pattern = pattern
         self.flags = flags
+def _regexp_has_newline(r):
+    return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
+def _create_unless_callback(strs):
+    def f(t):
+        if t in strs:
+            t.type = strs[t]
+        return t
+    return f
+def _create_unless(tokens):
+    tokens_by_type = classify(tokens, type)
+    assert len(tokens_by_type) <= 2, tokens_by_type.keys()
+    embedded_strs = set()
+    callback = {}
+    for retok in tokens_by_type.get(TokenDef__Regexp, []):
+        unless = {}
+        for strtok in tokens_by_type.get(TokenDef__Str, []):
+            m = re.match(retok.value, strtok.value)
+            if m and m.group(0) == strtok.value:
+                embedded_strs.add(strtok.name)
+                unless[strtok.value] = strtok.name
+        if unless:
+            callback[retok.name] = _create_unless_callback(unless)
+    tokens = [t for t in tokens if t.name not in embedded_strs]
+    return tokens, callback
 class Lexer(object):
-    def __init__(self, tokens, callbacks, ignore=()):
+    def __init__(self, tokens, ignore=()):
+        assert all(isinstance(t, TokenDef) for t in tokens)
         self.ignore = ignore
         self.newline_char = '\n'
         tokens = list(tokens)
         # Sanitization
-        token_names = {t[0] for t in tokens}
         for t in tokens:
             try:
-                re.compile(t[1])
+                re.compile(t.to_regexp())
             except:
                 raise LexError("Cannot compile token: %s: %s" % t)
+        token_names = {t.name for t in tokens}
         assert all(t in token_names for t in ignore)
         # Init
-        self.tokens = tokens
-        self.callbacks = callbacks
+        self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.to_regexp())]
+        self.ignore_types = [t for t in ignore]
-        self.token_types = list(token_names)
-        self.type_index = {name:i for i,name in enumerate(self.token_types)}
+        tokens, self.callback = _create_unless(tokens)
+        assert all(self.callback.values())
-        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1] or '(?s)' in t[1]]
-        self.ignore_types = [self.type_index[t] for t in ignore]
+        tokens.sort(key=lambda x:(x.priority, len(x.value)), reverse=True)
+        self.tokens = tokens
         self.mres = self._build_mres(tokens, len(tokens))
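
As a hedged illustration of what _create_unless does with a keyword that is fully matched by an identifier regexp (assuming the patched module above is importable as lark.lexer; the Token(type, value, pos) call mirrors how lex() constructs tokens below):

    from lark.lexer import TokenDef__Regexp, TokenDef__Str, Token, _create_unless

    defs = [TokenDef__Regexp('NAME', r'[a-z]+'), TokenDef__Str('IF', 'if')]
    defs, callback = _create_unless(defs)

    # 'if' is fully matched by NAME's regexp, so the IF definition is dropped
    # from the match list and handled through a callback instead.
    assert [t.name for t in defs] == ['NAME']

    # The callback re-types any NAME token whose value is exactly 'if'.
    tok = Token('NAME', 'if', 0)
    assert callback['NAME'](tok).type == 'IF'
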
@@ -77,11 +130,11 @@ class Lexer(object):
         mres = []
         while tokens:
             try:
-                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in tokens[:max_size]))
+                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.to_regexp()) for t in tokens[:max_size]))
             except AssertionError: # Yes, this is what Python provides us.. :/
                 return self._build_mres(tokens, max_size//2)
-            mres.append((mre, {i:self.type_index[n] for n,i in mre.groupindex.items()} ))
+            mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
             tokens = tokens[max_size:]
         return mres
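
A brief hedged aside on the chunking above: the map stored next to each compiled pattern now goes straight from m.lastindex to the token name via groupindex (no type_index indirection), and the AssertionError branch exists because older versions of Python's re module cap the number of named groups per pattern and report the overflow as an AssertionError. For instance:

    # Hedged sketch of one (mre, type_from_index) pair, standalone.
    import re

    mre = re.compile('(?P<INT>[0-9]+)|(?P<WS>[ ]+)')
    type_from_index = {i: n for n, i in mre.groupindex.items()}
    assert type_from_index == {1: 'INT', 2: 'WS'}
    assert type_from_index[mre.match('42').lastindex] == 'INT'
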
@@ -96,16 +149,16 @@ class Lexer(object):
                 m = mre.match(stream, lex_pos)
                 if m:
                     value = m.group(0)
-                    type_num = type_from_index[m.lastindex]
-                    if type_num not in ignore_types:
-                        t = Token(self.token_types[type_num], value, lex_pos)
+                    type_ = type_from_index[m.lastindex]
+                    if type_ not in ignore_types:
+                        t = Token(type_, value, lex_pos)
                         t.line = line
                         t.column = lex_pos - col_start_pos
-                        if t.type in self.callbacks:
-                            t = self.callbacks[t.type](t)
+                        if t.type in self.callback:
+                            t = self.callback[t.type](t)
                         yield t
-                    if type_num in newline_types:
+                    if type_ in newline_types:
                         newlines = value.count(self.newline_char)
                         if newlines:
                             line += newlines
@@ -117,4 +170,3 @@ class Lexer(object):
                     raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                 break
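
To see the new callback-free constructor end to end, here is a small usage sketch (hedged: it assumes the patched module is importable as lark.lexer, and the commented output is what the code above suggests rather than verified output):

    from lark.lexer import Lexer, TokenDef__Regexp, TokenDef__Str

    lexer = Lexer([TokenDef__Regexp('INT', r'[0-9]+'),
                   TokenDef__Regexp('WS', r'[ ]+'),
                   TokenDef__Str('PLUS', '+')],
                  ignore=['WS'])

    for tok in lexer.lex('1 + 23'):
        print(tok.type, str(tok))
    # Expected, roughly: INT 1, PLUS +, INT 23 -- WS is matched but never yielded.
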
@@ -1,7 +1,7 @@
 import re
 import codecs
-from .lexer import Lexer, Token, UnexpectedInput
+from .lexer import Lexer, Token, UnexpectedInput, TokenDef__Str, TokenDef__Regexp
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR
@@ -278,7 +278,8 @@ class ExtractAnonTokens(InlineTransformer):
 class GrammarLoader:
     def __init__(self):
-        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
+        tokens = [TokenDef__Regexp(name, value) for name, value in TOKENS.items()]
+        self.lexer = Lexer(tokens, ignore=['WS', 'COMMENT'])
         d = {r: [(x.split(), None) for x in xs] for r, xs in RULES.items()}
         rules, callback = ParseTreeBuilder(T).create_tree_builder(d, None)
@@ -312,47 +313,27 @@ class GrammarLoader:
         extract_anon = ExtractAnonTokens(tokens, token_set)
         tree = extract_anon.transform(tree) # Adds to tokens
-        tokens2 = []
+        token_ref = {}
+        tokendefs = []
         for name, token, flags in tokens:
             value = token.value[1:-1]
             if r'\u' in value:
                 # XXX for now, you can't mix unicode escaping and unicode characters at the same token
                 value = unicode_escape(value)[0]
-            tokens2.append((name, token.type, value, flags))
-        token_ref = {}
-        re_tokens = []
-        str_tokens = []
-        for name, type_, value, flags in tokens2:
-            if type_ == 'STRING':
-                str_tokens.append((name, value, flags))
-            else:
-                assert type_ == 'REGEXP'
+            if token.type == 'REGEXP':
                 sp = re.split(r'(\$\{%s})' % TOKENS['TOKEN'], value)
                 if sp:
                     value = ''.join(token_ref[x[2:-1]] if x.startswith('${') and x.endswith('}') else x
                                     for x in sp)
-                re_tokens.append((name, value, flags))
                 token_ref[name] = value
+                tokendef = TokenDef__Regexp(name, value)
+            else:
+                assert token.type == 'STRING'
+                tokendef = TokenDef__Str(name, value)
-        embedded_strs = set()
-        for re_name, re_value, re_flags in re_tokens:
-            unless = {}
-            for str_name, str_value, _sf in str_tokens:
-                m = re.match(re_value, str_value)
-                if m and m.group(0) == str_value:
-                    assert not _sf, "You just broke Lark! Please email me with your grammar"
-                    embedded_strs.add(str_name)
-                    unless[str_value] = str_name
-            if unless:
-                re_flags.append(('unless', unless))
-        str_tokens = [(n, re.escape(v), f) for n, v, f in str_tokens if n not in embedded_strs]
-        str_tokens.sort(key=lambda x:len(x[1]), reverse=True)
-        re_tokens.sort(key=lambda x:len(x[1]), reverse=True)
-        tokens = str_tokens + re_tokens # Order is important!
+            tokendefs.append((tokendef, flags))
         # =================
         # Process Rules
@@ -391,7 +372,7 @@ class GrammarLoader:
                 if sym not in rule_set:
                     raise GrammarError("Rule '%s' used but not defined" % sym)
-        return tokens, rules
+        return tokendefs, rules
 load_grammar = GrammarLoader().load_grammar
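
The net effect on callers, sketched under the assumption that the loader module keeps a lark.load_grammar import path and that grammar_text holds some valid grammar (both placeholders, not taken from the diff): load_grammar now hands back TokenDef objects paired with their flags, which is exactly the shape the new Lark._build_lexer iterates over.

    from lark.load_grammar import load_grammar

    tokendefs, rules = load_grammar(grammar_text)   # grammar_text: hypothetical grammar string
    for tokendef, flags in tokendefs:
        print(tokendef.name, tokendef.to_regexp(), flags)
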
@@ -3,15 +3,15 @@ from ..common import ParseError, UnexpectedToken
 class Parser(object):
-    def __init__(self, ga, callback):
-        self.ga = ga
+    def __init__(self, analysis, callback):
+        self.analysis = analysis
         self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
-                          for rule in ga.rules}
+                          for rule in analysis.rules}
     def parse(self, seq):
-        states_idx = self.ga.states_idx
+        states_idx = self.analysis.states_idx
-        state_stack = [self.ga.init_state_idx]
+        state_stack = [self.analysis.init_state_idx]
         value_stack = []
         i = 0
@@ -39,7 +39,7 @@ class Parser(object):
             res = self.callbacks[rule](s)
-            if len(state_stack) == 1 and rule.origin == self.ga.start_symbol:
+            if len(state_stack) == 1 and rule.origin == self.analysis.start_symbol:
                 return res
             _action, new_state = get_action(rule.origin)
@@ -63,7 +63,7 @@ class Parser(object):
             assert _action == 'reduce'
             res = reduce(*rule)
             if res:
-                assert state_stack == [self.ga.init_state_idx] and not value_stack, len(state_stack)
+                assert state_stack == [self.analysis.init_state_idx] and not value_stack, len(state_stack)
                 return res