| @@ -41,10 +41,11 @@ class UnexpectedToken(ParseError): | |||||
| class LexerConf: | class LexerConf: | ||||
| def __init__(self, tokens, ignore=(), postlex=None): | |||||
| def __init__(self, tokens, ignore=(), postlex=None, callbacks={}): | |||||
| self.tokens = tokens | self.tokens = tokens | ||||
| self.ignore = ignore | self.ignore = ignore | ||||
| self.postlex = postlex | self.postlex = postlex | ||||
| self.callbacks = callbacks | |||||
| class ParserConf: | class ParserConf: | ||||
| def __init__(self, rules, callback, start): | def __init__(self, rules, callback, start): | ||||
| @@ -39,7 +39,8 @@ class LarkOptions(object): | |||||
| postlex - Lexer post-processing (Requires standard lexer. Default: None) | postlex - Lexer post-processing (Requires standard lexer. Default: None) | ||||
| start - The start symbol (Default: start) | start - The start symbol (Default: start) | ||||
| profile - Measure run-time usage in Lark. Read results from the profiler property (Default: False) | profile - Measure run-time usage in Lark. Read results from the profiler property (Default: False) | ||||
| propagate_positions - Experimental. Don't use yet. | |||||
| propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches. | |||||
| lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. | |||||
| """ | """ | ||||
| __doc__ += OPTIONS_DOC | __doc__ += OPTIONS_DOC | ||||
| def __init__(self, options_dict): | def __init__(self, options_dict): | ||||
| @@ -58,6 +59,7 @@ class LarkOptions(object): | |||||
| self.ambiguity = o.pop('ambiguity', 'auto') | self.ambiguity = o.pop('ambiguity', 'auto') | ||||
| self.propagate_positions = o.pop('propagate_positions', False) | self.propagate_positions = o.pop('propagate_positions', False) | ||||
| self.earley__predict_all = o.pop('earley__predict_all', False) | self.earley__predict_all = o.pop('earley__predict_all', False) | ||||
| self.lexer_callbacks = o.pop('lexer_callbacks', {}) | |||||
| assert self.parser in ('earley', 'lalr', 'cyk', None) | assert self.parser in ('earley', 'lalr', 'cyk', None) | ||||
| @@ -153,7 +155,7 @@ class Lark: | |||||
| # Compile the EBNF grammar into BNF | # Compile the EBNF grammar into BNF | ||||
| tokens, self.rules, self.ignore_tokens = self.grammar.compile(lexer=bool(lexer), start=self.options.start) | tokens, self.rules, self.ignore_tokens = self.grammar.compile(lexer=bool(lexer), start=self.options.start) | ||||
| self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex) | |||||
| self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) | |||||
| if self.options.parser: | if self.options.parser: | ||||
| self.parser = self._build_parser() | self.parser = self._build_parser() | ||||
| @@ -165,7 +167,7 @@ class Lark: | |||||
| __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC | __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC | ||||
| def _build_lexer(self): | def _build_lexer(self): | ||||
| return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore) | |||||
| return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) | |||||
| def _build_parser(self): | def _build_parser(self): | ||||
| self.parser_class = get_frontend(self.options.parser, self.options.lexer) | self.parser_class = get_frontend(self.options.parser, self.options.lexer) | ||||
| @@ -82,6 +82,7 @@ class _Lex: | |||||
| ignore_types = list(ignore_types) | ignore_types = list(ignore_types) | ||||
| line_ctr = LineCounter() | line_ctr = LineCounter() | ||||
| t = None | |||||
| while True: | while True: | ||||
| lexer = self.lexer | lexer = self.lexer | ||||
| for mre, type_from_index in lexer.mres: | for mre, type_from_index in lexer.mres: | ||||
| @@ -94,8 +95,15 @@ class _Lex: | |||||
| if t.type in lexer.callback: | if t.type in lexer.callback: | ||||
| t = lexer.callback[t.type](t) | t = lexer.callback[t.type](t) | ||||
| yield t | yield t | ||||
| else: | |||||
| if type_ in lexer.callback: | |||||
| t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) | |||||
| lexer.callback[type_](t) | |||||
| line_ctr.feed(value, type_ in newline_types) | line_ctr.feed(value, type_ in newline_types) | ||||
| if t: | |||||
| t.end_line = line_ctr.line | |||||
| t.end_column = line_ctr.column | |||||
| break | break | ||||
| else: | else: | ||||
| if line_ctr.char_pos < len(stream): | if line_ctr.char_pos < len(stream): | ||||
| @@ -163,7 +171,7 @@ def _regexp_has_newline(r): | |||||
| return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) | return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) | ||||
| class Lexer: | class Lexer: | ||||
| def __init__(self, tokens, ignore=()): | |||||
| def __init__(self, tokens, ignore=(), user_callbacks={}): | |||||
| assert all(isinstance(t, TokenDef) for t in tokens), tokens | assert all(isinstance(t, TokenDef) for t in tokens), tokens | ||||
| tokens = list(tokens) | tokens = list(tokens) | ||||
| @@ -189,6 +197,10 @@ class Lexer: | |||||
| tokens, self.callback = _create_unless(tokens) | tokens, self.callback = _create_unless(tokens) | ||||
| assert all(self.callback.values()) | assert all(self.callback.values()) | ||||
| for type_, f in user_callbacks.items(): | |||||
| assert type_ not in self.callback | |||||
| self.callback[type_] = f | |||||
| self.tokens = tokens | self.tokens = tokens | ||||
| self.mres = build_mres(tokens) | self.mres = build_mres(tokens) | ||||
| @@ -198,7 +210,7 @@ class Lexer: | |||||
| class ContextualLexer: | class ContextualLexer: | ||||
| def __init__(self, tokens, states, ignore=(), always_accept=()): | |||||
| def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}): | |||||
| tokens_by_name = {} | tokens_by_name = {} | ||||
| for t in tokens: | for t in tokens: | ||||
| assert t.name not in tokens_by_name, t | assert t.name not in tokens_by_name, t | ||||
| @@ -213,12 +225,12 @@ class ContextualLexer: | |||||
| except KeyError: | except KeyError: | ||||
| accepts = set(accepts) | set(ignore) | set(always_accept) | accepts = set(accepts) | set(ignore) | set(always_accept) | ||||
| state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] | state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] | ||||
| lexer = Lexer(state_tokens, ignore=ignore) | |||||
| lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks) | |||||
| lexer_by_tokens[key] = lexer | lexer_by_tokens[key] = lexer | ||||
| self.lexers[state] = lexer | self.lexers[state] = lexer | ||||
| self.root_lexer = Lexer(tokens, ignore=ignore) | |||||
| self.root_lexer = Lexer(tokens, ignore=ignore, user_callbacks=user_callbacks) | |||||
| self.set_parser_state(None) # Needs to be set on the outside | self.set_parser_state(None) # Needs to be set on the outside | ||||
| @@ -92,7 +92,7 @@ class PropagatePositions: | |||||
| for a in reversed(children): | for a in reversed(children): | ||||
| with suppress(AttributeError): | with suppress(AttributeError): | ||||
| res.end_line = a.end_line | res.end_line = a.end_line | ||||
| res.end_col = a.end_col | |||||
| res.end_column = a.end_column | |||||
| break | break | ||||
| return res | return res | ||||
| @@ -11,13 +11,13 @@ from .tree import Tree | |||||
| class WithLexer: | class WithLexer: | ||||
| def init_traditional_lexer(self, lexer_conf): | def init_traditional_lexer(self, lexer_conf): | ||||
| self.lexer_conf = lexer_conf | self.lexer_conf = lexer_conf | ||||
| self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore) | |||||
| self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks) | |||||
| def init_contextual_lexer(self, lexer_conf, parser_conf): | def init_contextual_lexer(self, lexer_conf, parser_conf): | ||||
| self.lexer_conf = lexer_conf | self.lexer_conf = lexer_conf | ||||
| d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} | d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} | ||||
| always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () | always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () | ||||
| self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) | |||||
| self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks) | |||||
| def lex(self, text): | def lex(self, text): | ||||
| stream = self.lexer.lex(text) | stream = self.lexer.lex(text) | ||||
| @@ -839,12 +839,13 @@ def _make_parser_test(LEXER, PARSER): | |||||
| self.assertEqual(d.line, 2) | self.assertEqual(d.line, 2) | ||||
| self.assertEqual(d.column, 1) | self.assertEqual(d.column, 1) | ||||
| # self.assertEqual(a.end_line, 1) | |||||
| # self.assertEqual(a.end_col, 1) | |||||
| # self.assertEqual(bc.end_line, 2) | |||||
| # self.assertEqual(bc.end_col, 1) | |||||
| # self.assertEqual(d.end_line, 2) | |||||
| # self.assertEqual(d.end_col, 2) | |||||
| if LEXER != 'dynamic': | |||||
| self.assertEqual(a.end_line, 1) | |||||
| self.assertEqual(a.end_column, 1) | |||||
| self.assertEqual(bc.end_line, 2) | |||||
| self.assertEqual(bc.end_column, 1) | |||||
| self.assertEqual(d.end_line, 2) | |||||
| self.assertEqual(d.end_column, 2) | |||||