@@ -0,0 +1,37 @@ | |||||
# | |||||
# This example demonstrates the power of the contextual lexer, by parsing a config file. | |||||
# | |||||
# The tokens NAME and VALUE match the same input. A regular lexer would arbitrarily | |||||
# choose one over the other, which would lead to a (confusing) parse error. | |||||
# However, due to the unambiguous structure of the grammar, the LALR(1) algorithm knows | |||||
# which one of them to expect at each point during the parse. | |||||
# The lexer then only matches the tokens that the parser expects. | |||||
# The result is a correct parse, something that is impossible with a regular lexer. | |||||
# | |||||
# Another approach is to discard the lexer altogether and use the Earley algorithm. | |||||
# It will handle more cases than the contextual lexer, but at the cost of performance. | |||||
# See examples/conf_nolex.py for an example of that approach. | |||||
# | |||||
from lark import Lark | |||||
# Build an LALR(1) parser driven by the contextual lexer. NAME and VALUE have
# overlapping patterns; the contextual lexer only tries the terminals that the
# parser can accept at the current point, which keeps the two apart.
parser = Lark(r"""
start: _NL? section+
section: "[" NAME "]" _NL item+
item: NAME "=" VALUE _NL
NAME: /[a-zA-Z_]\w*/
VALUE: /.*/
WS.ignore: /[\t \f]+/
COMMENT.ignore: /\#[^\n]*/
_NL: /(\r?\n)+/
""", parser="lalr_contextual_lexer")

# A small configuration document: one section holding two items.
sample_conf = """
[bla]
a=Hello
this="that",4
"""

# Use the call form of print: with a single argument it behaves the same on
# Python 2 and Python 3, whereas the statement form is a syntax error on 3.
print(parser.parse(sample_conf).pretty())
@@ -1,5 +1,5 @@ | |||||
# | # | ||||
# This example demonstrates lex-less parsing using the earley_nolex frontend | |||||
# This example demonstrates scanless parsing using the earley_nolex frontend | |||||
# | # | ||||
# Using a lexer for configuration files is tricky, because values don't | # Using a lexer for configuration files is tricky, because values don't | ||||
# have to be surrounded by delimiters. | # have to be surrounded by delimiters. | ||||
@@ -7,6 +7,10 @@ | |||||
# | # | ||||
# Future versions of lark will make it easier to write these kinds of grammars. | # Future versions of lark will make it easier to write these kinds of grammars. | ||||
# | # | ||||
# Another approach is to use the contextual lexer. It is less powerful than the scanless approach, | |||||
# but it can handle some ambiguity in lexing and it's much faster since it uses LALR(1). | |||||
# See examples/conf.py for an example of that approach. | |||||
# | |||||
from lark import Lark, Transformer | from lark import Lark, Transformer | ||||
@@ -26,7 +26,6 @@ class Indenter: | |||||
assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) | assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1]) | ||||
def process(self, stream): | def process(self, stream): | ||||
for token in stream: | for token in stream: | ||||
if token.type == self.NL_type: | if token.type == self.NL_type: | ||||
@@ -37,7 +36,7 @@ class Indenter: | |||||
if token.type in self.OPEN_PAREN_types: | if token.type in self.OPEN_PAREN_types: | ||||
self.paren_level += 1 | self.paren_level += 1 | ||||
if token.type in self.CLOSE_PAREN_types: | |||||
elif token.type in self.CLOSE_PAREN_types: | |||||
self.paren_level -= 1 | self.paren_level -= 1 | ||||
assert self.paren_level >= 0 | assert self.paren_level >= 0 | ||||
@@ -47,3 +46,7 @@ class Indenter: | |||||
assert self.indent_level == [0], self.indent_level | assert self.indent_level == [0], self.indent_level | ||||
# XXX Hack for ContextualLexer. Maybe there's a more elegant solution? | |||||
@property | |||||
def always_accept(self): | |||||
return (self.NL_type,) |
@@ -173,7 +173,7 @@ class Lexer(object): | |||||
class ContextualLexer: | class ContextualLexer: | ||||
def __init__(self, tokens, states, ignore=()): | |||||
def __init__(self, tokens, states, ignore=(), always_accept=()): | |||||
tokens_by_name = {} | tokens_by_name = {} | ||||
for t in tokens: | for t in tokens: | ||||
assert t.name not in tokens_by_name | assert t.name not in tokens_by_name | ||||
@@ -186,10 +186,9 @@ class ContextualLexer: | |||||
try: | try: | ||||
lexer = lexer_by_tokens[key] | lexer = lexer_by_tokens[key] | ||||
except KeyError: | except KeyError: | ||||
accepts = list(accepts) # For python3 | |||||
accepts += ignore | |||||
# if '_NEWLINE' in tokens_by_name and '_NEWLINE' not in accepts: | |||||
# accepts.append('_NEWLINE') # XXX hack for now | |||||
accepts = set(accepts) # For python3 | |||||
accepts |= set(ignore) | |||||
accepts |= set(always_accept) | |||||
state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end'] | state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end'] | ||||
lexer = Lexer(state_tokens, ignore=ignore) | lexer = Lexer(state_tokens, ignore=ignore) | ||||
lexer_by_tokens[key] = lexer | lexer_by_tokens[key] = lexer | ||||
@@ -228,6 +227,7 @@ class ContextualLexer: | |||||
break | break | ||||
else: | else: | ||||
if lex_pos < len(stream): | if lex_pos < len(stream): | ||||
print("Allowed tokens:", lexer.tokens) | |||||
raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos) | raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos) | ||||
break | break | ||||
@@ -40,13 +40,17 @@ class LALR_ContextualLexer: | |||||
self.analyzer.analyze() | self.analyzer.analyze() | ||||
d = {idx:t.keys() for idx, t in self.analyzer.states_idx.items()} | d = {idx:t.keys() for idx, t in self.analyzer.states_idx.items()} | ||||
self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore) | |||||
self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, | |||||
always_accept=lexer_conf.postlex.always_accept | |||||
if lexer_conf.postlex else ()) | |||||
def parse(self, text):
    """Lex ``text`` and feed the resulting tokens to a fresh LALR parser.

    When the lexer configuration carries a post-lexer, the token stream is
    passed through its ``process`` method before it reaches the parser.
    Returns whatever ``parser.parse`` returns.
    """
    parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
    # The lexer is handed the parser instance together with the text.
    tokens = self.lexer.lex(text, parser)
    postlex = self.lexer_conf.postlex
    if postlex:
        # Post-lexing must happen before parsing; returning the raw stream
        # early would leave this step unreachable.
        tokens = postlex.process(tokens)
    return parser.parse(tokens, True)