@@ -0,0 +1,37 @@
#
# This example demonstrates the power of the contextual lexer, by parsing a config file.
#
# The tokens NAME and VALUE match the same input. A regular lexer would arbitrarily
# choose one over the other, which would lead to a (confusing) parse error.
# However, due to the unambiguous structure of the grammar, the LALR(1) algorithm knows
# which one of them to expect at each point during the parse.
# The lexer then only matches the tokens that the parser expects.
# The result is a correct parse, something that is impossible with a regular lexer.
#
# Another approach is to discard a lexer altogether and use the Earley algorithm.
# It will handle more cases than the contextual lexer, but at the cost of performance.
# See examples/conf_nolex.py for an example of that approach.
#

from lark import Lark

parser = Lark(r"""
        start: _NL? section+
        section: "[" NAME "]" _NL item+
        item: NAME "=" VALUE _NL

        NAME: /[a-zA-Z_]\w*/
        VALUE: /.*/

        WS.ignore: /[\t \f]+/
        COMMENT.ignore: /\#[^\n]*/
        _NL: /(\r?\n)+/
    """, parser="lalr_contextual_lexer")

sample_conf = """
[bla]
a=Hello
this="that",4
"""
print(parser.parse(sample_conf).pretty())
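To see why the NAME/VALUE overlap defeats a standalone lexer, here is a minimal sketch using plain `re` (illustrative only, not part of lark): at the start of a line both patterns match, and whichever one wins globally breaks some other position in the file.

# Minimal sketch with plain `re`, not part of this patch: both token patterns
# match at the same position, so a context-free lexer must pick one arbitrarily.
import re

NAME = re.compile(r'[a-zA-Z_]\w*')
VALUE = re.compile(r'.*')

line = 'a=Hello'
print(NAME.match(line).group(0))   # 'a'        -- the right choice before '='
print(VALUE.match(line).group(0))  # 'a=Hello'  -- VALUE swallows the whole line
# The contextual lexer sidesteps the conflict by only trying the patterns that the
# LALR(1) parser can accept in its current state: NAME here, VALUE only after '='.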
@@ -1,5 +1,5 @@
#
# This example demonstrates lex-less parsing using the earley_nolex frontend
# This example demonstrates scanless parsing using the earley_nolex frontend
#
# Using a lexer for configuration files is tricky, because values don't
# have to be surrounded by delimiters.
@@ -7,6 +7,10 @@
#
# Future versions of lark will make it easier to write these kinds of grammars.
#
# Another approach is to use the contextual lexer. It is less powerful than the scanless approach,
# but it can handle some ambiguity in lexing and it's much faster since it uses LALR(1).
# See examples/conf.py for an example of that approach.
#

from lark import Lark, Transformer
@@ -17,6 +17,8 @@ class UnexpectedToken(ParseError):
            context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
        except AttributeError:
            context = seq[index:index+5]
        except TypeError:
            context = "<no context>"

        message = ("Unexpected token %r at line %s, column %s.\n"
                   "Expected: %s\n"
                   "Context: %s" % (token, self.line, self.column, expected, context))
@@ -26,7 +26,6 @@ class Indenter:
            assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])

    def process(self, stream):
        for token in stream:
            if token.type == self.NL_type:
@@ -37,7 +36,7 @@ class Indenter:
            if token.type in self.OPEN_PAREN_types:
                self.paren_level += 1
            if token.type in self.CLOSE_PAREN_types:
            elif token.type in self.CLOSE_PAREN_types:
                self.paren_level -= 1
                assert self.paren_level >= 0
@@ -47,3 +46,7 @@ class Indenter:
        assert self.indent_level == [0], self.indent_level

    # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
    @property
    def always_accept(self):
        return (self.NL_type,)
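The hook exists because the contextual lexer only emits tokens the parser's current state can accept, while a post-lexer like the Indenter needs to see every newline itself. A rough sketch of the contract, using a hypothetical postlexer that is not part of this patch:

# Hypothetical postlexer, illustrating the contract: `always_accept` lists token
# types that must be lexed in every parser state, because the postlexer -- not the
# parser -- is the one that consumes them.
class StripNewlines:
    always_accept = ('_NL',)            # keep lexing _NL even where the parser never expects it

    def process(self, stream):
        for token in stream:
            if token.type != '_NL':     # filter newlines out before the parser sees them
                yield token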
@@ -3,6 +3,7 @@
import re

from .utils import Str, classify
from .common import is_terminal

class LexError(Exception):
    pass
@@ -169,3 +170,64 @@ class Lexer(object):
                if lex_pos < len(stream):
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
class ContextualLexer:
    def __init__(self, tokens, states, ignore=(), always_accept=()):
        tokens_by_name = {}
        for t in tokens:
            assert t.name not in tokens_by_name
            tokens_by_name[t.name] = t

        lexer_by_tokens = {}
        self.lexers = {}
        for state, accepts in states.items():
            key = frozenset(accepts)
            try:
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts)  # For python3
                accepts |= set(ignore)
                accepts |= set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
                lexer = Lexer(state_tokens, ignore=ignore)
                lexer_by_tokens[key] = lexer

            self.lexers[state] = lexer

        self.root_lexer = Lexer(tokens, ignore=ignore)

    def lex(self, stream, parser):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.root_lexer.newline_types)
        ignore_types = list(self.root_lexer.ignore_types)
        while True:
            lexer = self.lexers[parser.state]
            for mre, type_from_index in lexer.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        yield t

                    if type_ in newline_types:
                        newlines = value.count(lexer.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(lexer.newline_char)
                    lex_pos += len(value)
                    break
            else:
                if lex_pos < len(stream):
                    print("Allowed tokens:", lexer.tokens)
                    raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos)
                break
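The constructor builds one sub-lexer per parser state, deduplicated by the frozenset of terminals that state accepts. A toy illustration of that dedup step, with a hypothetical state table standing in for the real LALR(1) analysis:

# Toy illustration (hypothetical state table, object() standing in for Lexer):
# parser states that accept the same terminal set share a single sub-lexer.
states = {0: ['NAME'], 1: ['VALUE'], 2: ['NAME']}
lexer_by_tokens = {}
lexers = {}
for state, accepts in states.items():
    key = frozenset(accepts)
    if key not in lexer_by_tokens:
        lexer_by_tokens[key] = object()   # stand-in for Lexer(state_tokens, ignore=...)
    lexers[state] = lexer_by_tokens[key]

assert lexers[0] is lexers[2]             # states 0 and 2 reuse the same lexer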
@@ -1,7 +1,7 @@
import re
import sre_parse

from .lexer import Lexer
from .lexer import Lexer, ContextualLexer
from .parsers.lalr_analysis import GrammarAnalyzer
from .common import is_terminal, GrammarError
@@ -31,6 +31,29 @@ class LALR(WithLexer):
        tokens = list(self.lex(text))
        return self.parser.parse(tokens)

class LALR_ContextualLexer:
    def __init__(self, lexer_conf, parser_conf):
        self.lexer_conf = lexer_conf
        self.parser_conf = parser_conf

        self.analyzer = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
        self.analyzer.analyze()

        d = {idx:t.keys() for idx, t in self.analyzer.states_idx.items()}
        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore,
                                     always_accept=lexer_conf.postlex.always_accept
                                                   if lexer_conf.postlex else ())

    def parse(self, text):
        parser = lalr_parser.Parser(self.analyzer, self.parser_conf.callback)
        tokens = self.lexer.lex(text, parser)
        if self.lexer_conf.postlex:
            tokens = self.lexer_conf.postlex.process(tokens)
        return parser.parse(tokens, True)

class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf):
        WithLexer.__init__(self, lexer_conf)
@@ -82,4 +105,4 @@ class Earley_NoLex:
        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
        return res[0]

ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex }
ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex, 'lalr_contextual_lexer': LALR_ContextualLexer }
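With the new ENGINE_DICT entry, the frontend is selected purely by name. A usage sketch mirroring examples/conf.py above, with the grammar trimmed to a single item rule (assuming the example's token syntax works the same when trimmed):

# Usage sketch, mirroring examples/conf.py above; grammar trimmed for brevity.
from lark import Lark

parser = Lark(r"""
        start: item+
        item: NAME "=" VALUE _NL
        NAME: /[a-zA-Z_]\w*/
        VALUE: /.*/
        _NL: /(\r?\n)+/
    """, parser="lalr_contextual_lexer")

print(parser.parse("a=Hello\nthis=that\n").pretty())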
@@ -7,13 +7,15 @@ class Parser(object):
        self.analysis = analysis
        self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
                          for rule in analysis.rules}
        self.state = self.analysis.init_state_idx

    def parse(self, seq):
    def parse(self, seq, set_state=False):
        i = 0
        stream = iter(seq)
        states_idx = self.analysis.states_idx

        state_stack = [self.analysis.init_state_idx]
        value_stack = []
        i = 0

        def get_action(key):
            state = state_stack[-1]
@@ -21,11 +23,6 @@ class Parser(object):
                return states_idx[state][key]
            except KeyError:
                expected = states_idx[state].keys()
                try:
                    token = seq[i]
                except IndexError:
                    assert key == '$end'
                    token = seq[-1]
                raise UnexpectedToken(token, expected, seq, i)
@@ -48,15 +45,22 @@ class Parser(object):
            value_stack.append(res)

        # Main LALR-parser loop
        while i < len(seq):
            action, arg = get_action(seq[i].type)

            if action == ACTION_SHIFT:
                state_stack.append(arg)
                value_stack.append(seq[i])
                i+= 1
            else:
                reduce(*arg)
        try:
            token = next(stream)
            i += 1
            while True:
                action, arg = get_action(token.type)

                if action == ACTION_SHIFT:
                    state_stack.append(arg)
                    value_stack.append(token)
                    if set_state: self.state = arg
                    token = next(stream)
                    i += 1
                else:
                    reduce(*arg)
        except StopIteration:
            pass

        while True:
            _action, rule = get_action('$end')
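The switch from an indexed sequence to a lazily consumed iterator, together with set_state, is what lets the contextual lexer and the parser feed each other: each token is lexed against the state produced by the previous shift. A stripped-down model of that loop, using toy classes that are purely illustrative:

# Toy model of the lexer<->parser feedback loop (illustrative classes, not lark's):
# the token generator reads parser.state before producing each token, and the
# parser updates its state on every "shift".
class ToyParser:
    def __init__(self):
        self.state = 0

    def parse(self, tokens):
        for tok in tokens:            # each next() sees the state set by the previous shift
            print('read %r while in state %d' % (tok, self.state))
            self.state += 1           # stand-in for shifting into a new LALR state

def toy_lex(text, parser):
    for word in text.split():
        # a real ContextualLexer would pick self.lexers[parser.state] here
        yield (parser.state, word)

p = ToyParser()
p.parse(toy_lex('a b c', p))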
@@ -5,7 +5,7 @@ import logging
from .test_trees import TestTrees
# from .test_selectors import TestSelectors
from .test_parser import TestLalr, TestEarley, TestParsers
from .test_parser import TestLalr, TestEarley, TestLalr_contextual_lexer, TestParsers
# from .test_grammars import TestPythonG, TestConfigG

logging.basicConfig(level=logging.INFO)
@@ -356,11 +356,10 @@ def _make_parser_test(PARSER):
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser

for PARSER in ['lalr', 'earley']:
for PARSER in ['lalr', 'earley', 'lalr_contextual_lexer']:
    _make_parser_test(PARSER)

if __name__ == '__main__':
    unittest.main()