From 22e525f53e7cbdb7a251fc9ad485ce9ee82062b4 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Sun, 28 Jan 2018 15:42:40 +0200
Subject: [PATCH] Fixed propagate positions. Added lexer_callbacks option.

---
 lark/common.py             |  3 ++-
 lark/lark.py               |  8 +++++---
 lark/lexer.py              | 20 ++++++++++++++++----
 lark/parse_tree_builder.py |  2 +-
 lark/parser_frontends.py   |  4 ++--
 tests/test_parser.py       | 13 +++++++------
 6 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/lark/common.py b/lark/common.py
index f745018..c0c6cf3 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -41,10 +41,11 @@ class UnexpectedToken(ParseError):
 
 
 class LexerConf:
-    def __init__(self, tokens, ignore=(), postlex=None):
+    def __init__(self, tokens, ignore=(), postlex=None, callbacks={}):
         self.tokens = tokens
         self.ignore = ignore
         self.postlex = postlex
+        self.callbacks = callbacks
 
 class ParserConf:
     def __init__(self, rules, callback, start):
diff --git a/lark/lark.py b/lark/lark.py
index fa564ed..fb5e04f 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -39,7 +39,8 @@ class LarkOptions(object):
         postlex - Lexer post-processing (Requires standard lexer. Default: None)
         start - The start symbol (Default: start)
         profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False)
-        propagate_positions - Experimental. Don't use yet.
+        propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches.
+        lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
     """
     __doc__ += OPTIONS_DOC
     def __init__(self, options_dict):
@@ -58,6 +59,7 @@ class LarkOptions(object):
         self.ambiguity = o.pop('ambiguity', 'auto')
         self.propagate_positions = o.pop('propagate_positions', False)
         self.earley__predict_all = o.pop('earley__predict_all', False)
+        self.lexer_callbacks = o.pop('lexer_callbacks', {})
 
         assert self.parser in ('earley', 'lalr', 'cyk', None)
 
@@ -153,7 +155,7 @@ class Lark:
        # Compile the EBNF grammar into BNF
        tokens, self.rules, self.ignore_tokens = self.grammar.compile(lexer=bool(lexer), start=self.options.start)

-        self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex)
+        self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks)
 
         if self.options.parser:
             self.parser = self._build_parser()
@@ -165,7 +167,7 @@ class Lark:
     __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
 
     def _build_lexer(self):
-        return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore)
+        return Lexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
 
     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
diff --git a/lark/lexer.py b/lark/lexer.py
index 64cfb46..bd22a20 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -82,6 +82,7 @@ class _Lex:
         ignore_types = list(ignore_types)
         line_ctr = LineCounter()
 
+        t = None
         while True:
             lexer = self.lexer
             for mre, type_from_index in lexer.mres:
@@ -94,8 +95,15 @@
                         if t.type in lexer.callback:
                             t = lexer.callback[t.type](t)
                         yield t
+                    else:
+                        if type_ in lexer.callback:
+                            t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
+                            lexer.callback[type_](t)
 
                     line_ctr.feed(value, type_ in newline_types)
+                    if t:
+                        t.end_line = line_ctr.line
+                        t.end_column = line_ctr.column
                     break
             else:
                 if line_ctr.char_pos < len(stream):
@@ -163,7 +171,7 @@ def _regexp_has_newline(r):
     return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r)
 
 class Lexer:
-    def __init__(self, tokens, ignore=()):
+    def __init__(self, tokens, ignore=(), user_callbacks={}):
         assert all(isinstance(t, TokenDef) for t in tokens), tokens
 
         tokens = list(tokens)
@@ -189,6 +197,10 @@ class Lexer:
         tokens, self.callback = _create_unless(tokens)
         assert all(self.callback.values())
 
+        for type_, f in user_callbacks.items():
+            assert type_ not in self.callback
+            self.callback[type_] = f
+
         self.tokens = tokens
 
         self.mres = build_mres(tokens)
@@ -198,7 +210,7 @@ class Lexer:
 
 
 class ContextualLexer:
-    def __init__(self, tokens, states, ignore=(), always_accept=()):
+    def __init__(self, tokens, states, ignore=(), always_accept=(), user_callbacks={}):
         tokens_by_name = {}
         for t in tokens:
             assert t.name not in tokens_by_name, t
@@ -213,12 +225,12 @@ class ContextualLexer:
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END']
-                lexer = Lexer(state_tokens, ignore=ignore)
+                lexer = Lexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
                 lexer_by_tokens[key] = lexer
 
             self.lexers[state] = lexer
 
-        self.root_lexer = Lexer(tokens, ignore=ignore)
+        self.root_lexer = Lexer(tokens, ignore=ignore, user_callbacks=user_callbacks)
 
         self.set_parser_state(None) # Needs to be set on the outside
 
diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py
index 7e52125..e0041b9 100644
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -92,7 +92,7 @@ class PropagatePositions:
             for a in reversed(children):
                 with suppress(AttributeError):
                     res.end_line = a.end_line
-                    res.end_col = a.end_col
+                    res.end_column = a.end_column
                     break
 
         return res
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index b7a3abc..515f018 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -11,13 +11,13 @@ from .tree import Tree
 class WithLexer:
     def init_traditional_lexer(self, lexer_conf):
         self.lexer_conf = lexer_conf
-        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore)
+        self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
 
     def init_contextual_lexer(self, lexer_conf, parser_conf):
         self.lexer_conf = lexer_conf
         d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()}
         always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
-        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept)
+        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks)
 
     def lex(self, text):
         stream = self.lexer.lex(text)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 38ada24..ed716bb 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -839,12 +839,13 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(d.line, 2)
             self.assertEqual(d.column, 1)
 
-            # self.assertEqual(a.end_line, 1)
-            # self.assertEqual(a.end_col, 1)
-            # self.assertEqual(bc.end_line, 2)
-            # self.assertEqual(bc.end_col, 1)
-            # self.assertEqual(d.end_line, 2)
-            # self.assertEqual(d.end_col, 2)
+            if LEXER != 'dynamic':
+                self.assertEqual(a.end_line, 1)
+                self.assertEqual(a.end_column, 1)
+                self.assertEqual(bc.end_line, 2)
+                self.assertEqual(bc.end_column, 1)
+                self.assertEqual(d.end_line, 2)
+                self.assertEqual(d.end_column, 2)
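
Below is a minimal usage sketch of the two options this patch touches, written against the post-patch API. It is not part of the patch: the grammar, the terminal names (INT, COMMENT, WS) and the sample input are illustrative assumptions, not code from the Lark repository.

    from lark import Lark

    comments = []   # filled by the lexer callback as a side effect

    # Hypothetical toy grammar: integers separated by whitespace, with '#' line comments.
    grammar = r"""
        start: INT+
        INT: /[0-9]+/
        COMMENT: /#[^\n]*/
        WS: /[ \t\n]+/
        %ignore COMMENT
        %ignore WS
    """

    parser = Lark(grammar,
                  parser='lalr',
                  propagate_positions=True,                      # copy line/column/end_line/end_column onto tree branches
                  lexer_callbacks={'COMMENT': comments.append})  # invoked for COMMENT tokens even though they are %ignore'd

    tree = parser.parse("1 2 3  # first comment\n40 50 60  # second comment\n")

    print([c.value for c in comments])   # the ignored COMMENT tokens were still captured
    first = tree.children[0]             # an INT token
    print(first.line, first.column, first.end_line, first.end_column)

Note that for terminals that are not ignored, the callback's return value replaces the token, so it should return a Token; for %ignore'd terminals the patch only calls the callback for its side effect and discards the result. As the test change above indicates, end_line/end_column are not expected to be set when using the dynamic lexer.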