@@ -0,0 +1,37 @@
+#
+# This example demonstrates the power of the contextual lexer by parsing a config file.
+#
+# The tokens NAME and VALUE match the same input. A regular lexer would arbitrarily
+# choose one over the other, which would lead to a (confusing) parse error.
+# However, due to the unambiguous structure of the grammar, the LALR(1) algorithm knows
+# which one of them to expect at each point during the parse,
+# and the lexer then matches only the tokens that the parser expects.
+# The result is a correct parse, something that is impossible with a regular lexer.
+#
+# Another approach is to discard the lexer altogether and use the Earley algorithm.
+# It handles more cases than the contextual lexer, but at the cost of performance.
+# See examples/conf_nolex.py for an example of that approach.
+#
+
+from lark import Lark
+
+parser = Lark(r"""
+        start: _NL? section+
+        section: "[" NAME "]" _NL item+
+        item: NAME "=" VALUE _NL
+
+        NAME: /[a-zA-Z_]\w*/
+        VALUE: /.*/
+
+        WS.ignore: /[\t \f]+/
+        COMMENT.ignore: /\#[^\n]*/
+        _NL: /(\r?\n)+/
+    """, parser="lalr_contextual_lexer")
+
+sample_conf = """
+[bla]
+a=Hello
+this="that",4
+"""
+
+print(parser.parse(sample_conf).pretty())
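
To see concretely why a regular lexer cannot handle the grammar above: on a line like `a=Hello`, both NAME and VALUE fully match the text after the `=`, so a lexer working without the parser's state has no principled way to pick one. A minimal illustration using only the standard `re` module (the two patterns are copied from the grammar; everything else is illustrative):

    import re

    NAME = re.compile(r'[a-zA-Z_]\w*')
    VALUE = re.compile(r'.*')

    rhs = 'a=Hello'.split('=', 1)[1]

    # Both terminals match 'Hello' completely; only the parser state ("we just saw
    # NAME '='") tells the contextual lexer that VALUE is the token to emit here.
    print(NAME.fullmatch(rhs) is not None)   # True
    print(VALUE.fullmatch(rhs) is not None)  # True
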
@@ -1,5 +1,5 @@
 #
-# This example demonstrates lex-less parsing using the earley_nolex frontend
+# This example demonstrates scanless parsing using the earley_nolex frontend
 #
 # Using a lexer for configuration files is tricky, because values don't
 # have to be surrounded by delimiters.
@@ -7,6 +7,10 @@
 #
 # Future versions of lark will make it easier to write these kinds of grammars.
 #
+# Another approach is to use the contextual lexer. It is less powerful than the scanless approach,
+# but it can handle some ambiguity in lexing, and it's much faster since it uses LALR(1).
+# See examples/conf.py for an example of that approach.
+#
 
 from lark import Lark, Transformer
@@ -17,6 +17,8 @@ class UnexpectedToken(ParseError):
             context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
         except AttributeError:
             context = seq[index:index+5]
+        except TypeError:
+            context = "<no context>"
         message = ("Unexpected token %r at line %s, column %s.\n"
                    "Expected: %s\n"
                    "Context: %s" % (token, self.line, self.column, expected, context))
@@ -26,7 +26,6 @@ class Indenter:
         assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])
 
     def process(self, stream):
         for token in stream:
             if token.type == self.NL_type:
@@ -37,7 +36,7 @@ class Indenter:
             if token.type in self.OPEN_PAREN_types:
                 self.paren_level += 1
-            if token.type in self.CLOSE_PAREN_types:
+            elif token.type in self.CLOSE_PAREN_types:
                 self.paren_level -= 1
                 assert self.paren_level >= 0
@@ -47,3 +46,7 @@ class Indenter:
         assert self.indent_level == [0], self.indent_level
 
+    # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
+    @property
+    def always_accept(self):
+        return (self.NL_type,)
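
The `always_accept` property is the contract between a postlexer and the ContextualLexer added later in this diff: whatever token types it returns are folded into every per-state token set (`accepts |= set(always_accept)`), presumably because the Indenter handles newline tokens itself even in parser states whose action tables do not list them. A rough sketch of a postlexer honouring that contract (a made-up example, not part of this change):

    class PassthroughPostlexer:
        # Hypothetical postlexer: it wants to see every _NL token, even in
        # parser states that never expect _NL, so it advertises the type
        # via always_accept.
        NL_type = '_NL'

        def process(self, stream):
            for token in stream:
                yield token            # a real postlexer would rewrite the stream here

        @property
        def always_accept(self):
            return (self.NL_type,)
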
@@ -3,6 +3,7 @@
 import re
 
 from .utils import Str, classify
+from .common import is_terminal
 
 class LexError(Exception):
     pass
@@ -169,3 +170,64 @@ class Lexer(object):
                 if lex_pos < len(stream):
                     raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                 break
+
+
+class ContextualLexer:
+    def __init__(self, tokens, states, ignore=(), always_accept=()):
+        tokens_by_name = {}
+        for t in tokens:
+            assert t.name not in tokens_by_name
+            tokens_by_name[t.name] = t
+
+        lexer_by_tokens = {}
+        self.lexers = {}
+        for state, accepts in states.items():
+            key = frozenset(accepts)
+            try:
+                lexer = lexer_by_tokens[key]
+            except KeyError:
+                accepts = set(accepts)  # For python3
+                accepts |= set(ignore)
+                accepts |= set(always_accept)
+                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n != '$end']
+                lexer = Lexer(state_tokens, ignore=ignore)
+                lexer_by_tokens[key] = lexer
+
+            self.lexers[state] = lexer
+
+        self.root_lexer = Lexer(tokens, ignore=ignore)
+
+    def lex(self, stream, parser):
+        lex_pos = 0
+        line = 1
+        col_start_pos = 0
+        newline_types = list(self.root_lexer.newline_types)
+        ignore_types = list(self.root_lexer.ignore_types)
+        while True:
+            lexer = self.lexers[parser.state]
+            for mre, type_from_index in lexer.mres:
+                m = mre.match(stream, lex_pos)
+                if m:
+                    value = m.group(0)
+                    type_ = type_from_index[m.lastindex]
+                    if type_ not in ignore_types:
+                        t = Token(type_, value, lex_pos)
+                        t.line = line
+                        t.column = lex_pos - col_start_pos
+                        if t.type in lexer.callback:
+                            t = lexer.callback[t.type](t)
+                        yield t
+                    if type_ in newline_types:
+                        newlines = value.count(lexer.newline_char)
+                        if newlines:
+                            line += newlines
+                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
+                    lex_pos += len(value)
+                    break
+            else:
+                if lex_pos < len(stream):
+                    print("Allowed tokens:", lexer.tokens)
+                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
+                break
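
Two ideas carry this class: `__init__` builds one ordinary `Lexer` per parser state, deduplicating states that accept the same token set via the `frozenset` key, and `lex()` is a generator that re-reads `parser.state` before matching each token, so the sub-lexer in use can change between tokens. A toy model of that second idea, independent of lark (invented names and patterns):

    import re

    class ToyParser:
        state = 'key'    # a real LALR parser updates this on every shift

    PATTERNS = {
        'key':   [('NAME', re.compile(r'[a-zA-Z_]\w*')), ('EQ', re.compile(r'='))],
        'value': [('VALUE', re.compile(r'[^\n]+'))],
    }

    def contextual_lex(text, parser):
        pos = 0
        while pos < len(text):
            # Pick the token set from the parser's *current* state, like
            # `lexer = self.lexers[parser.state]` above.
            for type_, pattern in PATTERNS[parser.state]:
                m = pattern.match(text, pos)
                if m:
                    yield type_, m.group(0)
                    pos = m.end()
                    break
            else:
                raise ValueError('no terminal matches at position %d' % pos)

    parser = ToyParser()
    result = []
    for type_, value in contextual_lex('a=Hello', parser):
        result.append((type_, value))
        parser.state = 'value' if type_ == 'EQ' else 'key'   # crude stand-in for shifting

    print(result)    # [('NAME', 'a'), ('EQ', '='), ('VALUE', 'Hello')]
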
@@ -1,7 +1,7 @@
 import re
 import sre_parse
 
-from .lexer import Lexer
+from .lexer import Lexer, ContextualLexer
 from .parsers.lalr_analysis import GrammarAnalyzer
 
 from .common import is_terminal, GrammarError
@@ -31,6 +31,29 @@ class LALR(WithLexer):
         tokens = list(self.lex(text))
         return self.parser.parse(tokens)
 
+
+class LALR_ContextualLexer:
+    def __init__(self, lexer_conf, parser_conf):
+        self.lexer_conf = lexer_conf
+        self.parser_conf = parser_conf
+
+        self.analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
+        self.analyzer.analyze()
+
+        d = {idx: t.keys() for idx, t in self.analyzer.states_idx.items()}
+        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore,
+                                     always_accept=lexer_conf.postlex.always_accept
+                                                   if lexer_conf.postlex else ())
+
+    def parse(self, text):
+        parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
+        tokens = self.lexer.lex(text, parser)
+        if self.lexer_conf.postlex:
+            tokens = self.lexer_conf.postlex.process(tokens)
+        return parser.parse(tokens, True)
+
+
 class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)
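
The dict `d` built in `__init__` maps every LALR state index to the keys of that state's action table; those keys mix terminal names, rule names, and '$end', which is why `ContextualLexer` filters them with `is_terminal(n) and n != '$end'`. An invented example of the shape of that mapping (real values come from `GrammarAnalyzer.states_idx` and depend on the grammar):

    # Invented state numbers and symbols, for illustration only.
    states = {
        0: ['item', 'NAME', '_NL'],    # rule names and terminals appear together
        1: ['VALUE'],
        2: ['NAME', '$end'],
    }

    def is_terminal(symbol):
        # Stand-in for lark's is_terminal: terminal names are written in upper case.
        return symbol.isupper()

    # What each per-state sub-lexer may match (ignoring `ignore` and `always_accept`):
    per_state = {state: [s for s in accepts if is_terminal(s) and s != '$end']
                 for state, accepts in states.items()}

    print(per_state)    # {0: ['NAME', '_NL'], 1: ['VALUE'], 2: ['NAME']}
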
@@ -82,4 +105,4 @@ class Earley_NoLex:
         assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
         return res[0]
 
-ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex }
+ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex, 'lalr_contextual_lexer': LALR_ContextualLexer }
@@ -7,13 +7,15 @@ class Parser(object):
         self.analysis = analysis
         self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
                           for rule in analysis.rules}
+        self.state = self.analysis.init_state_idx
 
-    def parse(self, seq):
+    def parse(self, seq, set_state=False):
+        i = 0
+        stream = iter(seq)
         states_idx = self.analysis.states_idx
 
         state_stack = [self.analysis.init_state_idx]
         value_stack = []
-        i = 0
 
         def get_action(key):
             state = state_stack[-1]
@@ -21,11 +23,6 @@ class Parser(object):
                 return states_idx[state][key]
             except KeyError:
                 expected = states_idx[state].keys()
-                try:
-                    token = seq[i]
-                except IndexError:
-                    assert key == '$end'
-                    token = seq[-1]
                 raise UnexpectedToken(token, expected, seq, i)
@@ -48,15 +45,22 @@ class Parser(object):
             value_stack.append(res)
 
         # Main LALR-parser loop
-        while i < len(seq):
-            action, arg = get_action(seq[i].type)
-            if action == ACTION_SHIFT:
-                state_stack.append(arg)
-                value_stack.append(seq[i])
-                i += 1
-            else:
-                reduce(*arg)
+        try:
+            token = next(stream)
+            i += 1
+            while True:
+                action, arg = get_action(token.type)
+                if action == ACTION_SHIFT:
+                    state_stack.append(arg)
+                    value_stack.append(token)
+                    if set_state: self.state = arg
+                    token = next(stream)
+                    i += 1
+                else:
+                    reduce(*arg)
+        except StopIteration:
+            pass
 
         while True:
             _action, rule = get_action('$end')
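
The loop now pulls tokens with `next(stream)` instead of indexing a pre-built list, and records each shift in `self.state` when `set_state` is true. That keeps lexing lazy: the contextual lexer's generator is asked for the next token only after the parser has updated its state, which is the whole point of the coupling. A trivial illustration of eager versus lazy consumption (unrelated to lark itself):

    def noisy_tokens():
        for n in range(3):
            print('lexing token', n)    # runs only when a token is requested
            yield n

    stream = iter(noisy_tokens())
    print('parser starts')
    print('shifted', next(stream))      # 'lexing token 0' is printed only now
    print('shifted', next(stream))

    # By contrast, list(noisy_tokens()) would lex everything before parsing begins,
    # leaving a state-dependent lexer nothing to react to.
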
@@ -5,7 +5,7 @@ import logging
 from .test_trees import TestTrees
 
 # from .test_selectors import TestSelectors
-from .test_parser import TestLalr, TestEarley, TestParsers
+from .test_parser import TestLalr, TestEarley, TestLalr_contextual_lexer, TestParsers
 # from .test_grammars import TestPythonG, TestConfigG
 
 logging.basicConfig(level=logging.INFO)
@@ -356,11 +356,10 @@ def _make_parser_test(PARSER):
     _TestParser.__name__ = _NAME
     globals()[_NAME] = _TestParser
 
-for PARSER in ['lalr', 'earley']:
+for PARSER in ['lalr', 'earley', 'lalr_contextual_lexer']:
     _make_parser_test(PARSER)
 
 if __name__ == '__main__':
     unittest.main()