diff --git a/lark/lexer.py b/lark/lexer.py
index efdbba3..75c0d18 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -3,6 +3,7 @@
 import re
 
 from .utils import Str, classify
+from .common import is_terminal
 
 class LexError(Exception):
     pass
@@ -169,3 +170,64 @@ class Lexer(object):
                 if lex_pos < len(stream):
                     raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                 break
+
+
+class ContextualLexer:
+    def __init__(self, tokens, states, ignore=()):
+        tokens_by_name = {}
+        for t in tokens:
+            assert t.name not in tokens_by_name
+            tokens_by_name[t.name] = t
+
+        lexer_by_tokens = {}
+        self.lexers = {}
+        for state, accepts in states.items():
+            key = frozenset(accepts)
+            try:
+                lexer = lexer_by_tokens[key]
+            except KeyError:
+                accepts = list(accepts)    # For python3
+                accepts += ignore
+                # if '_NEWLINE' in tokens_by_name and '_NEWLINE' not in accepts:
+                #     accepts.append('_NEWLINE')    # XXX hack for now
+                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
+                lexer = Lexer(state_tokens, ignore=ignore)
+                lexer_by_tokens[key] = lexer
+
+            self.lexers[state] = lexer
+
+        self.root_lexer = Lexer(tokens, ignore=ignore)
+
+    def lex(self, stream, parser):
+        lex_pos = 0
+        line = 1
+        col_start_pos = 0
+        newline_types = list(self.root_lexer.newline_types)
+        ignore_types = list(self.root_lexer.ignore_types)
+        while True:
+            lexer = self.lexers[parser.state]
+            for mre, type_from_index in lexer.mres:
+                m = mre.match(stream, lex_pos)
+                if m:
+                    value = m.group(0)
+                    type_ = type_from_index[m.lastindex]
+                    if type_ not in ignore_types:
+                        t = Token(type_, value, lex_pos)
+                        t.line = line
+                        t.column = lex_pos - col_start_pos
+                        if t.type in lexer.callback:
+                            t = lexer.callback[t.type](t)
+                        yield t
+
+                    if type_ in newline_types:
+                        newlines = value.count(lexer.newline_char)
+                        if newlines:
+                            line += newlines
+                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
+                    lex_pos += len(value)
+                    break
+            else:
+                if lex_pos < len(stream):
+                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
+                break
+
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index af1275d..0b9719b 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,7 +1,7 @@
 import re
 import sre_parse
 
-from .lexer import Lexer
+from .lexer import Lexer, ContextualLexer
 from .parsers.lalr_analysis import GrammarAnalyzer
 
 from .common import is_terminal, GrammarError
@@ -31,6 +31,25 @@ class LALR(WithLexer):
         tokens = list(self.lex(text))
         return self.parser.parse(tokens)
 
+class LALR_ContextualLexer:
+    def __init__(self, lexer_conf, parser_conf):
+        self.lexer_conf = lexer_conf
+        self.parser_conf = parser_conf
+
+        self.analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
+        self.analyzer.analyze()
+
+        d = {idx:t.keys() for idx, t in self.analyzer.states_idx.items()}
+        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore)
+
+
+    def parse(self, text):
+        parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
+        l = self.lexer.lex(text, parser)
+        return parser.parse(l, True)
+
+
+
 class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)
@@ -82,4 +101,4 @@ class Earley_NoLex:
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]
 
-ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex }
+ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex, 'lalr_contextual_lexer': LALR_ContextualLexer }
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 12c15e5..f87ad7d 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -7,13 +7,14 @@ class Parser(object):
         self.analysis = analysis
         self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
                           for rule in analysis.rules}
+        self.state = self.analysis.init_state_idx
 
-    def parse(self, seq):
+    def parse(self, stream, set_state=False):    # XXX no set_state
+        stream = iter(stream)
         states_idx = self.analysis.states_idx
 
         state_stack = [self.analysis.init_state_idx]
         value_stack = []
-        i = 0
 
         def get_action(key):
             state = state_stack[-1]
@@ -21,13 +22,8 @@ class Parser(object):
                 return states_idx[state][key]
             except KeyError:
                 expected = states_idx[state].keys()
-                try:
-                    token = seq[i]
-                except IndexError:
-                    assert key == '$end'
-                    token = seq[-1]
 
-                raise UnexpectedToken(token, expected, seq, i)
+                raise UnexpectedToken(token, expected, [], 0)
 
         def reduce(rule, size):
             if size:
@@ -48,15 +44,20 @@ class Parser(object):
                 value_stack.append(res)
 
         # Main LALR-parser loop
-        while i < len(seq):
-            action, arg = get_action(seq[i].type)
-
-            if action == ACTION_SHIFT:
-                state_stack.append(arg)
-                value_stack.append(seq[i])
-                i+= 1
-            else:
-                reduce(*arg)
+        try:
+            token = next(stream)
+            while True:
+                action, arg = get_action(token.type)
+
+                if action == ACTION_SHIFT:
+                    state_stack.append(arg)
+                    value_stack.append(token)
+                    if set_state: self.state = arg
+                    token = next(stream)
+                else:
+                    reduce(*arg)
+        except StopIteration:
+            pass
 
         while True:
             _action, rule = get_action('$end')
diff --git a/tests/__main__.py b/tests/__main__.py
index 0aa10fe..7a6f9b3 100644
--- a/tests/__main__.py
+++ b/tests/__main__.py
@@ -5,7 +5,7 @@ import logging
 
 from .test_trees import TestTrees
 # from .test_selectors import TestSelectors
-from .test_parser import TestLalr, TestEarley, TestParsers
+from .test_parser import TestLalr, TestEarley, TestLalr_contextual_lexer, TestParsers
 # from .test_grammars import TestPythonG, TestConfigG
 
 logging.basicConfig(level=logging.INFO)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 014d220..213e977 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -356,11 +356,10 @@ def _make_parser_test(PARSER):
     _TestParser.__name__ = _NAME
     globals()[_NAME] = _TestParser
 
 
-for PARSER in ['lalr', 'earley']:
+for PARSER in ['lalr', 'earley', 'lalr_contextual_lexer']:
     _make_parser_test(PARSER)
 
-
 if __name__ == '__main__':
     unittest.main()
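
Usage note (illustrative, not part of the patch): the new engine is registered in ENGINE_DICT under 'lalr_contextual_lexer', and the updated tests generate TestLalr_contextual_lexer from that same name, so it should be selectable through Lark's existing parser= option. A minimal sketch follows; the grammar, input, and resulting tree shape are assumptions for demonstration, and the sketch assumes this version's grammar DSL accepts inline quoted terminals and regex tokens.

```python
from lark import Lark

# Hypothetical example -- not taken from the patch. The key argument is
# parser='lalr_contextual_lexer', which ENGINE_DICT now resolves to
# LALR_ContextualLexer. That frontend feeds the parser's current state
# back into ContextualLexer, so at each position only the terminals the
# LALR automaton can actually shift are considered by the regex matcher.
grammar = """
start: "a" NAME
NAME: /[a-z]+/
"""

parser = Lark(grammar, parser='lalr_contextual_lexer')
tree = parser.parse("ahello")   # exact tree shape depends on token filtering
print(tree)
```

The design choice worth noting in the patch itself: ContextualLexer derives each state's token set from states_idx (the keys of the LALR action table) and caches sub-lexers keyed by frozenset of accepted terminals, so states that accept the same tokens share one Lexer; root_lexer keeps the full token set so newline and ignore bookkeeping stays uniform across states.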