From 520ab88cd86bb0dddc08bc975a922c4d37e94e06 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Mon, 13 Feb 2017 01:39:10 +0200
Subject: [PATCH 1/3] Added ContextualLexer for LALR. Working, but doesn't seem
 to improve speed. Also some issues with python_parser

---
 lark/lexer.py               | 62 +++++++++++++++++++++++++++++++++++++
 lark/parser_frontends.py    | 23 ++++++++++++--
 lark/parsers/lalr_parser.py | 35 +++++++++++----------
 tests/__main__.py           |  2 +-
 tests/test_parser.py        |  3 +-
 5 files changed, 103 insertions(+), 22 deletions(-)

diff --git a/lark/lexer.py b/lark/lexer.py
index efdbba3..75c0d18 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -3,6 +3,7 @@
 import re
 
 from .utils import Str, classify
+from .common import is_terminal
 
 class LexError(Exception):
     pass
@@ -169,3 +170,64 @@ class Lexer(object):
                 if lex_pos < len(stream):
                     raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                 break
+
+
+class ContextualLexer:
+    def __init__(self, tokens, states, ignore=()):
+        tokens_by_name = {}
+        for t in tokens:
+            assert t.name not in tokens_by_name
+            tokens_by_name[t.name] = t
+
+        lexer_by_tokens = {}
+        self.lexers = {}
+        for state, accepts in states.items():
+            key = frozenset(accepts)
+            try:
+                lexer = lexer_by_tokens[key]
+            except KeyError:
+                accepts = list(accepts) # For python3
+                accepts += ignore
+                # if '_NEWLINE' in tokens_by_name and '_NEWLINE' not in accepts:
+                #     accepts.append('_NEWLINE') # XXX hack for now
+                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
+                lexer = Lexer(state_tokens, ignore=ignore)
+                lexer_by_tokens[key] = lexer
+
+            self.lexers[state] = lexer
+
+        self.root_lexer = Lexer(tokens, ignore=ignore)
+
+    def lex(self, stream, parser):
+        lex_pos = 0
+        line = 1
+        col_start_pos = 0
+        newline_types = list(self.root_lexer.newline_types)
+        ignore_types = list(self.root_lexer.ignore_types)
+        while True:
+            lexer = self.lexers[parser.state]
+            for mre, type_from_index in lexer.mres:
+                m = mre.match(stream, lex_pos)
+                if m:
+                    value = m.group(0)
+                    type_ = type_from_index[m.lastindex]
+                    if type_ not in ignore_types:
+                        t = Token(type_, value, lex_pos)
+                        t.line = line
+                        t.column = lex_pos - col_start_pos
+                        if t.type in lexer.callback:
+                            t = lexer.callback[t.type](t)
+                        yield t
+
+                    if type_ in newline_types:
+                        newlines = value.count(lexer.newline_char)
+                        if newlines:
+                            line += newlines
+                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
+                    lex_pos += len(value)
+                    break
+            else:
+                if lex_pos < len(stream):
+                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
+                break
+
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index af1275d..0b9719b 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,7 +1,7 @@
 import re
 import sre_parse
 
-from .lexer import Lexer
+from .lexer import Lexer, ContextualLexer
 from .parsers.lalr_analysis import GrammarAnalyzer
 from .common import is_terminal, GrammarError
 
@@ -31,6 +31,25 @@ class LALR(WithLexer):
         tokens = list(self.lex(text))
         return self.parser.parse(tokens)
 
+class LALR_ContextualLexer:
+    def __init__(self, lexer_conf, parser_conf):
+        self.lexer_conf = lexer_conf
+        self.parser_conf = parser_conf
+
+        self.analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
+        self.analyzer.analyze()
+
+        d = {idx:t.keys() for idx, t in self.analyzer.states_idx.items()}
+        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore)
+
+
+    def parse(self, text):
+        parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
+        l = self.lexer.lex(text, parser)
+        return parser.parse(l, True)
+
+
+
 class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)
@@ -82,4 +101,4 @@ class Earley_NoLex:
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]
 
-ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex }
+ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex, 'lalr_contextual_lexer': LALR_ContextualLexer }
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 12c15e5..f87ad7d 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -7,13 +7,14 @@ class Parser(object):
         self.analysis = analysis
         self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
                           for rule in analysis.rules}
+        self.state = self.analysis.init_state_idx
 
-    def parse(self, seq):
+    def parse(self, stream, set_state=False):    # XXX no set_state
+        stream = iter(stream)
         states_idx = self.analysis.states_idx
 
         state_stack = [self.analysis.init_state_idx]
         value_stack = []
-        i = 0
 
         def get_action(key):
             state = state_stack[-1]
@@ -21,13 +22,8 @@
                 return states_idx[state][key]
             except KeyError:
                 expected = states_idx[state].keys()
-                try:
-                    token = seq[i]
-                except IndexError:
-                    assert key == '$end'
-                    token = seq[-1]
 
-                raise UnexpectedToken(token, expected, seq, i)
+                raise UnexpectedToken(token, expected, [], 0)
 
         def reduce(rule, size):
             if size:
@@ -48,15 +44,20 @@
         value_stack.append(res)
 
         # Main LALR-parser loop
-        while i < len(seq):
-            action, arg = get_action(seq[i].type)
-
-            if action == ACTION_SHIFT:
-                state_stack.append(arg)
-                value_stack.append(seq[i])
-                i+= 1
-            else:
-                reduce(*arg)
+        try:
+            token = next(stream)
+            while True:
+                action, arg = get_action(token.type)
+
+                if action == ACTION_SHIFT:
+                    state_stack.append(arg)
+                    value_stack.append(token)
+                    if set_state: self.state = arg
+                    token = next(stream)
+                else:
+                    reduce(*arg)
+        except StopIteration:
+            pass
 
         while True:
             _action, rule = get_action('$end')
diff --git a/tests/__main__.py b/tests/__main__.py
index 0aa10fe..7a6f9b3 100644
--- a/tests/__main__.py
+++ b/tests/__main__.py
@@ -5,7 +5,7 @@ import logging
 
 from .test_trees import TestTrees
 # from .test_selectors import TestSelectors
-from .test_parser import TestLalr, TestEarley, TestParsers
+from .test_parser import TestLalr, TestEarley, TestLalr_contextual_lexer, TestParsers
 # from .test_grammars import TestPythonG, TestConfigG
 
 logging.basicConfig(level=logging.INFO)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 014d220..213e977 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -356,11 +356,10 @@ def _make_parser_test(PARSER):
     _TestParser.__name__ = _NAME
     globals()[_NAME] = _TestParser
 
 
-for PARSER in ['lalr', 'earley']:
+for PARSER in ['lalr', 'earley', 'lalr_contextual_lexer']:
     _make_parser_test(PARSER)
 
 
-
 if __name__ == '__main__':
     unittest.main()
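
A toy illustration of the idea the ContextualLexer above implements (not lark code; the token names, the fixed state progression and the input are made up — in lark, the LALR parser's current state supplies the set of acceptable terminals):

import re

# One terminal set per parser state: the lexer only tries the tokens the
# parser can actually shift right now, so overlapping terminals such as
# NAME and VALUE never compete with each other.
TOKEN_DEFS = {'NAME': r'[a-zA-Z_]\w*', 'EQUAL': r'=', 'VALUE': r'.+'}
STATE_ACCEPTS = {0: ['NAME'], 1: ['EQUAL'], 2: ['VALUE']}

def contextual_lex(text):
    pos, state = 0, 0
    while pos < len(text):
        for name in STATE_ACCEPTS[state]:
            m = re.match(TOKEN_DEFS[name], text[pos:])
            if m:
                yield name, m.group(0)
                pos += m.end()
                state += 1   # in the real thing, the LALR parser drives this
                break
        else:
            raise ValueError('no expected token at position %d' % pos)

print(list(contextual_lex('answer=anything goes, even = signs')))
# [('NAME', 'answer'), ('EQUAL', '='), ('VALUE', 'anything goes, even = signs')]
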
From d4425887d6f5a6617fa4c52709b518c68bc31872 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Wed, 15 Feb 2017 10:33:22 +0200
Subject: [PATCH 2/3] Added conf.py example and indenter support in contextual
 lexing

---
 examples/conf.py         | 37 +++++++++++++++++++++++++++++++++++++
 examples/conf_nolex.py   |  6 +++++-
 lark/indenter.py         |  7 +++++--
 lark/lexer.py            | 10 +++++-----
 lark/parser_frontends.py | 10 +++++++---
 5 files changed, 59 insertions(+), 11 deletions(-)
 create mode 100644 examples/conf.py

diff --git a/examples/conf.py b/examples/conf.py
new file mode 100644
index 0000000..c872b09
--- /dev/null
+++ b/examples/conf.py
@@ -0,0 +1,37 @@
+#
+# This example demonstrates the power of the contextual lexer, by parsing a config file.
+#
+# The tokens NAME and VALUE match the same input. A regular lexer would arbitrarily
+# choose one over the other, which would lead to a (confusing) parse error.
+# However, due to the unambiguous structure of the grammar, the LALR(1) algorithm knows
+# which one of them to expect at each point during the parse.
+# The lexer then only matches the tokens that the parser expects.
+# The result is a correct parse, something that is impossible with a regular lexer.
+#
+# Another approach is to discard a lexer altogether and use the Earley algorithm.
+# It will handle more cases than the contextual lexer, but at the cost of performance.
+# See examples/conf_nolex.py for an example of that approach.
+#
+
+from lark import Lark
+
+parser = Lark(r"""
+        start: _NL? section+
+        section: "[" NAME "]" _NL item+
+        item: NAME "=" VALUE _NL
+        NAME: /[a-zA-Z_]\w*/
+        VALUE: /.*/
+
+        WS.ignore: /[\t \f]+/
+        COMMENT.ignore: /\#[^\n]*/
+        _NL: /(\r?\n)+/
+    """, parser="lalr_contextual_lexer")
+
+
+sample_conf = """
+[bla]
+a=Hello
+this="that",4
+"""
+
+print parser.parse(sample_conf).pretty()
diff --git a/examples/conf_nolex.py b/examples/conf_nolex.py
index 6ae7340..7879b26 100644
--- a/examples/conf_nolex.py
+++ b/examples/conf_nolex.py
@@ -1,5 +1,5 @@
 #
-# This example demonstrates lex-less parsing using the earley_nolex frontend
+# This example demonstrates scanless parsing using the earley_nolex frontend
 #
 # Using a lexer for configuration files is tricky, because values don't
 # have to be surrounded by delimiters.
@@ -7,6 +7,10 @@
 #
 # Future versions of lark will make it easier to write these kinds of grammars.
 #
+# Another approach is to use the contextual lexer. It is less powerful than the scanless approach,
+# but it can handle some ambiguity in lexing and it's much faster since it uses LALR(1).
+# See examples/conf.py for an example of that approach.
+#
 
 from lark import Lark, Transformer
 
diff --git a/lark/indenter.py b/lark/indenter.py
index d6d27ed..24ac170 100644
--- a/lark/indenter.py
+++ b/lark/indenter.py
@@ -26,7 +26,6 @@ class Indenter:
 
             assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])
 
-
     def process(self, stream):
         for token in stream:
             if token.type == self.NL_type:
@@ -37,7 +36,7 @@ class Indenter:
 
             if token.type in self.OPEN_PAREN_types:
                 self.paren_level += 1
-            if token.type in self.CLOSE_PAREN_types:
+            elif token.type in self.CLOSE_PAREN_types:
                 self.paren_level -= 1
                 assert self.paren_level >= 0
 
@@ -47,3 +46,7 @@ class Indenter:
 
         assert self.indent_level == [0], self.indent_level
 
+    # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
+    @property
+    def always_accept(self):
+        return (self.NL_type,)
diff --git a/lark/lexer.py b/lark/lexer.py
index 75c0d18..301d555 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -173,7 +173,7 @@ class Lexer(object):
 
 
 class ContextualLexer:
-    def __init__(self, tokens, states, ignore=()):
+    def __init__(self, tokens, states, ignore=(), always_accept=()):
         tokens_by_name = {}
         for t in tokens:
             assert t.name not in tokens_by_name
@@ -186,10 +186,9 @@ class ContextualLexer:
             try:
                 lexer = lexer_by_tokens[key]
             except KeyError:
-                accepts = list(accepts) # For python3
-                accepts += ignore
-                # if '_NEWLINE' in tokens_by_name and '_NEWLINE' not in accepts:
-                #     accepts.append('_NEWLINE') # XXX hack for now
+                accepts = set(accepts) # For python3
+                accepts |= set(ignore)
+                accepts |= set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
                 lexer = Lexer(state_tokens, ignore=ignore)
                 lexer_by_tokens[key] = lexer
@@ -228,6 +227,7 @@ class ContextualLexer:
                     break
             else:
                 if lex_pos < len(stream):
+                    print("Allowed tokens:", lexer.tokens)
                     raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                 break
 
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 0b9719b..668815c 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -40,13 +40,17 @@ class LALR_ContextualLexer:
         self.analyzer.analyze()
 
         d = {idx:t.keys() for idx, t in self.analyzer.states_idx.items()}
-        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore)
+        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore,
+                                     always_accept=lexer_conf.postlex.always_accept
+                                     if lexer_conf.postlex else ())
 
 
     def parse(self, text):
         parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
-        l = self.lexer.lex(text, parser)
-        return parser.parse(l, True)
+        tokens = self.lexer.lex(text, parser)
+        if self.lexer_conf.postlex:
+            tokens = self.lexer_conf.postlex.process(tokens)
+        return parser.parse(tokens, True)
 
 
 
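
A sketch of how the new always_accept hook is meant to be wired up (illustrative only; the indenter subclass, token names and grammar are assumptions modeled on lark's indenter examples, not part of this patch): a post-lexer such as the Indenter declares the terminals that every contextual sub-lexer must keep matching — here the newline terminal it consumes to emit INDENT/DEDENT — and LALR_ContextualLexer forwards them whenever a postlex is configured.

from lark import Lark
from lark.indenter import Indenter

class TreeIndenter(Indenter):
    NL_type = '_NL'                    # so always_accept == ('_NL',) via the new property
    OPEN_PAREN_types = []
    CLOSE_PAREN_types = []
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    tab_len = 4

# tree_grammar would declare _NL, _INDENT and _DEDENT; with it in place:
# parser = Lark(tree_grammar, parser='lalr_contextual_lexer', postlex=TreeIndenter())
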
From c8e6122148d7e1ba81cc192ffe9af094e131d9a5 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Wed, 15 Feb 2017 10:41:39 +0200
Subject: [PATCH 3/3] Restored nice error reports

---
 lark/common.py              | 2 ++
 lark/parsers/lalr_parser.py | 9 ++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/lark/common.py b/lark/common.py
index a8f2975..53b6a86 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -17,6 +17,8 @@ class UnexpectedToken(ParseError):
             context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
         except AttributeError:
             context = seq[index:index+5]
+        except TypeError:
+            context = ""
         message = ("Unexpected token %r at line %s, column %s.\n"
                    "Expected: %s\n"
                    "Context: %s" % (token, self.line, self.column, expected, context))
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index f87ad7d..313d808 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -9,8 +9,9 @@ class Parser(object):
                           for rule in analysis.rules}
         self.state = self.analysis.init_state_idx
 
-    def parse(self, stream, set_state=False):    # XXX no set_state
-        stream = iter(stream)
+    def parse(self, seq, set_state=False):
+        i = 0
+        stream = iter(seq)
         states_idx = self.analysis.states_idx
 
         state_stack = [self.analysis.init_state_idx]
@@ -23,7 +24,7 @@
             except KeyError:
                 expected = states_idx[state].keys()
 
-                raise UnexpectedToken(token, expected, [], 0)
+                raise UnexpectedToken(token, expected, seq, i)
 
         def reduce(rule, size):
             if size:
@@ -46,6 +47,7 @@
         # Main LALR-parser loop
         try:
             token = next(stream)
+            i += 1
             while True:
                 action, arg = get_action(token.type)
 
@@ -52,8 +54,9 @@
                 if action == ACTION_SHIFT:
                     state_stack.append(arg)
                     value_stack.append(token)
                     if set_state: self.state = arg
                     token = next(stream)
+                    i += 1
                 else:
                     reduce(*arg)
         except StopIteration: