From d4425887d6f5a6617fa4c52709b518c68bc31872 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Wed, 15 Feb 2017 10:33:22 +0200
Subject: [PATCH] Added conf.py example and indenter support in contextual
 lexing

---
 examples/conf.py         | 37 +++++++++++++++++++++++++++++++++++++
 examples/conf_nolex.py   |  6 +++++-
 lark/indenter.py         |  7 +++++--
 lark/lexer.py            | 10 +++++-----
 lark/parser_frontends.py | 10 +++++++---
 5 files changed, 59 insertions(+), 11 deletions(-)
 create mode 100644 examples/conf.py

diff --git a/examples/conf.py b/examples/conf.py
new file mode 100644
index 0000000..c872b09
--- /dev/null
+++ b/examples/conf.py
@@ -0,0 +1,37 @@
+#
+# This example demonstrates the power of the contextual lexer by parsing a config file.
+#
+# The tokens NAME and VALUE match the same input. A regular lexer would arbitrarily
+# choose one over the other, which would lead to a (confusing) parse error.
+# However, due to the unambiguous structure of the grammar, the LALR(1) algorithm knows
+# which one of them to expect at each point during the parse.
+# The lexer then only matches the tokens that the parser expects.
+# The result is a correct parse, something that is impossible with a regular lexer.
+#
+# Another approach is to discard the lexer altogether and use the Earley algorithm.
+# It will handle more cases than the contextual lexer, but at the cost of performance.
+# See examples/conf_nolex.py for an example of that approach.
+#
+
+from lark import Lark
+
+parser = Lark(r"""
+        start: _NL? section+
+        section: "[" NAME "]" _NL item+
+        item: NAME "=" VALUE _NL
+        NAME: /[a-zA-Z_]\w*/
+        VALUE: /.*/
+
+        WS.ignore: /[\t \f]+/
+        COMMENT.ignore: /\#[^\n]*/
+        _NL: /(\r?\n)+/
+    """, parser="lalr_contextual_lexer")
+
+
+sample_conf = """
+[bla]
+a=Hello
+this="that",4
+"""
+
+print(parser.parse(sample_conf).pretty())
diff --git a/examples/conf_nolex.py b/examples/conf_nolex.py
index 6ae7340..7879b26 100644
--- a/examples/conf_nolex.py
+++ b/examples/conf_nolex.py
@@ -1,5 +1,5 @@
 #
-# This example demonstrates lex-less parsing using the earley_nolex frontend
+# This example demonstrates scanless parsing using the earley_nolex frontend
 #
 # Using a lexer for configuration files is tricky, because values don't
 # have to be surrounded by delimiters.
@@ -7,6 +7,10 @@
 #
 # Future versions of lark will make it easier to write these kinds of grammars.
 #
+# Another approach is to use the contextual lexer. It is less powerful than the scanless approach,
+# but it can handle some ambiguity in lexing, and it's much faster since it uses LALR(1).
+# See examples/conf.py for an example of that approach.
+#
 
 from lark import Lark, Transformer
 
diff --git a/lark/indenter.py b/lark/indenter.py
index d6d27ed..24ac170 100644
--- a/lark/indenter.py
+++ b/lark/indenter.py
@@ -26,7 +26,6 @@ class Indenter:
 
         assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])
 
-
     def process(self, stream):
         for token in stream:
             if token.type == self.NL_type:
@@ -37,7 +36,7 @@ class Indenter:
 
             if token.type in self.OPEN_PAREN_types:
                 self.paren_level += 1
-            if token.type in self.CLOSE_PAREN_types:
+            elif token.type in self.CLOSE_PAREN_types:
                 self.paren_level -= 1
                 assert self.paren_level >= 0
 
@@ -47,3 +46,7 @@ class Indenter:
 
         assert self.indent_level == [0], self.indent_level
 
+    # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
+    @property
+    def always_accept(self):
+        return (self.NL_type,)
diff --git a/lark/lexer.py b/lark/lexer.py
index 75c0d18..301d555 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -173,7 +173,7 @@ class Lexer(object):
 
 
 class ContextualLexer:
-    def __init__(self, tokens, states, ignore=()):
+    def __init__(self, tokens, states, ignore=(), always_accept=()):
         tokens_by_name = {}
         for t in tokens:
             assert t.name not in tokens_by_name
@@ -186,10 +186,9 @@ class ContextualLexer:
             try:
                 lexer = lexer_by_tokens[key]
             except KeyError:
-                accepts = list(accepts) # For python3
-                accepts += ignore
-                # if '_NEWLINE' in tokens_by_name and '_NEWLINE' not in accepts:
-                #     accepts.append('_NEWLINE') # XXX hack for now
+                accepts = set(accepts) # For python3
+                accepts |= set(ignore)
+                accepts |= set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
                 lexer = Lexer(state_tokens, ignore=ignore)
                 lexer_by_tokens[key] = lexer
@@ -228,6 +227,7 @@
                 break
         else:
             if lex_pos < len(stream):
+                print("Allowed tokens:", lexer.tokens)
                 raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
             break
 
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 0b9719b..668815c 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -40,13 +40,17 @@ class LALR_ContextualLexer:
         self.analyzer.analyze()
 
         d = {idx:t.keys() for idx, t in self.analyzer.states_idx.items()}
-        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore)
+        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore,
+                                     always_accept=lexer_conf.postlex.always_accept
+                                                   if lexer_conf.postlex else ())
 
     def parse(self, text):
         parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
-        l = self.lexer.lex(text, parser)
-        return parser.parse(l, True)
+        tokens = self.lexer.lex(text, parser)
+        if self.lexer_conf.postlex:
+            tokens = self.lexer_conf.postlex.process(tokens)
+        return parser.parse(tokens, True)
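
A minimal sketch of a post-lexer that plugs into the always_accept hook introduced
above. This is illustrative only: the NewlineCounter name and the _NL terminal are
assumptions, not part of this commit; only the process() method and the always_accept
attribute are what LALR_ContextualLexer expects from a postlex object.

    # Hypothetical post-lexer using the always_accept hook (not part of the patch).
    # Assumes the grammar defines a _NL terminal, as in examples/conf.py.
    class NewlineCounter:
        always_accept = ('_NL',)  # ask the contextual lexer to match _NL in every state

        def __init__(self):
            self.lines = 0

        def process(self, stream):
            # Called by LALR_ContextualLexer.parse() on the lexed token stream.
            for token in stream:
                if token.type == '_NL':
                    self.lines += 1
                yield token

Because the contextual lexer normally restricts each parser state to the terminals
the LALR(1) table expects there, declaring _NL in always_accept guarantees that
process() sees every newline token regardless of state, which is exactly why the
Indenter returns (self.NL_type,) from its new always_accept property.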