@@ -0,0 +1,46 @@ | |||
"""This example demonstrates usage of the Indenter class. | |||
Since indentation is context-sensitive, a postlex stage is introduced to manufacture INDENT/DEDENT tokens. | |||
It is crucial for the indenter that the NL_type matches the spaces (and tabs) after the newline. | |||
""" | |||
from lark.lark import Lark | |||
from lark.indenter import Indenter | |||
# Grammar for a whitespace-indented tree of words: each `tree` node is a word
# followed by a newline and an optional indented block of child trees.
# _INDENT/_DEDENT are never matched from text -- they are injected by the
# postlex Indenter configured below.
# NOTE(review): NAME and WS are declared but never referenced by a rule
# (`tree` uses the anonymous /\w+/ directly) -- confirm they are intentional.
tree_grammar = """
?start: _NL* tree
tree: /\w+/ _NL [_INDENT tree+ _DEDENT]
NAME: /\w+/
WS.ignore: /\s+/
_NL.newline: /(\r?\n[\t ]*)+/
"""
class TreeIndenter(Indenter):
    """Postlex configuration that turns the grammar's _NL tokens into
    _INDENT/_DEDENT pairs for the LALR parser."""
    NL_type = '_NL'          # token whose trailing spaces/tabs encode the indent width
    OPEN_PAREN_types = []    # no brackets in this grammar, so indentation is never suspended
    CLOSE_PAREN_types = []
    INDENT_type = '_INDENT'
    DEDENT_type = '_DEDENT'
    # NOTE(review): tab_len = 0 makes tab characters contribute nothing to the
    # indent width. That is harmless for a space-only sample, but looks wrong
    # for tab-indented input (comparable examples use 8) -- confirm intent.
    tab_len = 0
# Build the LALR parser once at import time; the indenter runs as a
# lexer post-processing (postlex) stage.
parser = Lark(tree_grammar, parser='lalr', postlex=TreeIndenter())
test_tree = """ | |||
a | |||
b | |||
c | |||
d | |||
e | |||
f | |||
g | |||
""" | |||
def test():
    """Parse the sample tree and print its pretty-printed parse tree.

    Reads the module-level `parser` and `test_tree`; writes to stdout.
    """
    # Fix: the original used the Python 2 `print expr` statement, which is a
    # SyntaxError on Python 3. The function-call form works on 2.x and 3.x.
    print(parser.parse(test_tree).pretty())
# Script entry point: run the demo only when executed directly, not on import.
if __name__ == '__main__':
    test()
@@ -0,0 +1,47 @@ | |||
"Provides Indentation services for languages with indentation similar to Python" | |||
from .lexer import Token | |||
class Indenter:
    """Postlex stage that converts newline tokens into INDENT/DEDENT
    tokens, Python-style.

    Subclasses provide the configuration attributes: NL_type,
    INDENT_type, DEDENT_type, OPEN_PAREN_types, CLOSE_PAREN_types and
    tab_len (how many spaces a tab counts for).
    """

    def __init__(self):
        # Depth of currently-open brackets; indentation is ignored inside them.
        self.paren_level = 0
        # Stack of active indent widths; the bottom entry is always 0.
        self.indent_level = [0]

    def handle_NL(self, token):
        """Yield the INDENT/DEDENT tokens implied by one newline token.

        The indent width is taken from the spaces/tabs that follow the
        last '\\n' in the token's text.
        """
        if self.paren_level > 0:
            # Inside brackets a newline carries no block structure.
            return

        trailing_ws = token.rsplit('\n', 1)[1]  # spaces and tabs after the newline
        width = trailing_ws.count(' ') + trailing_ws.count('\t') * self.tab_len

        if width > self.indent_level[-1]:
            # Deeper than before: open exactly one new level.
            self.indent_level.append(width)
            yield Token(self.INDENT_type, trailing_ws)
            return

        # Shallower (or equal): close levels until we land on a known width.
        while width < self.indent_level[-1]:
            self.indent_level.pop()
            yield Token(self.DEDENT_type, trailing_ws)
        assert width == self.indent_level[-1], '%s != %s' % (width, self.indent_level[-1])

    def process(self, stream):
        """Wrap a token stream, inserting INDENT/DEDENT tokens after each
        newline token and closing any still-open levels at end of input.
        """
        for tok in stream:
            yield tok
            if tok.type == self.NL_type:
                for synthetic in self.handle_NL(tok):
                    yield synthetic
            if tok.type in self.OPEN_PAREN_types:
                self.paren_level += 1
            if tok.type in self.CLOSE_PAREN_types:
                self.paren_level -= 1
            assert self.paren_level >= 0

        # End of input: emit a DEDENT for every level still open.
        while len(self.indent_level) > 1:
            self.indent_level.pop()
            yield Token(self.DEDENT_type, '')
        assert self.indent_level == [0], self.indent_level
@@ -23,7 +23,7 @@ class LarkOptions(object): | |||
only_lex - Don't build a parser. Useful for debugging (default: False) | |||
keep_all_tokens - Don't automagically remove "punctuation" tokens (default: True) | |||
cache_grammar - Cache the Lark grammar (Default: False) | |||
ignore_postproc - Don't call the post-processing function (default: False) | |||
postlex - Lexer post-processing (Default: None) | |||
""" | |||
__doc__ += OPTIONS_DOC | |||
def __init__(self, options_dict): | |||
@@ -34,7 +34,7 @@ class LarkOptions(object): | |||
self.keep_all_tokens = bool(o.pop('keep_all_tokens', False)) | |||
self.tree_class = o.pop('tree_class', Tree) | |||
self.cache_grammar = o.pop('cache_grammar', False) | |||
self.ignore_postproc = bool(o.pop('ignore_postproc', False)) | |||
self.postlex = o.pop('postlex', None) | |||
self.parser = o.pop('parser', 'earley') | |||
self.transformer = o.pop('transformer', None) | |||
@@ -206,7 +206,11 @@ class Lark: | |||
return f | |||
def lex(self, text): | |||
return self.lexer.lex(text) | |||
stream = self.lexer.lex(text) | |||
if self.options.postlex: | |||
return self.options.postlex.process(stream) | |||
else: | |||
return stream | |||
def parse(self, text): | |||
assert not self.options.only_lex | |||
@@ -33,6 +33,7 @@ LIMIT = 50 # Stupid named groups limit in python re | |||
class Lexer(object): | |||
def __init__(self, tokens, callbacks, ignore=()): | |||
self.ignore = ignore | |||
self.newline_char = '\n' | |||
# Sanitization | |||
token_names = {t[0] for t in tokens} | |||
@@ -60,6 +61,8 @@ class Lexer(object): | |||
def lex(self, stream): | |||
lex_pos = 0 | |||
line = 0 | |||
col_start_pos = 0 | |||
while True: | |||
i = 0 | |||
for mre in self.mres: | |||
@@ -67,11 +70,17 @@ class Lexer(object): | |||
if m: | |||
value = m.group(0) | |||
type_ = self.name_from_index[i][m.lastindex] | |||
t = Token(type_, value, lex_pos) | |||
if t.type in self.callbacks: | |||
self.callbacks[t.type](t) | |||
if t.type not in self.ignore: | |||
if type_ not in self.ignore: | |||
t = Token(type_, value, lex_pos) | |||
t.line = line | |||
t.column = lex_pos - col_start_pos | |||
if t.type in self.callbacks: | |||
t = self.callbacks[t.type](t) | |||
yield t | |||
newlines = value.count(self.newline_char) | |||
if newlines: | |||
line += newlines | |||
col_start_pos = lex_pos + value.rindex(self.newline_char) | |||
lex_pos += len(value) | |||
break | |||
i += 1 | |||
@@ -4,7 +4,7 @@ class ParseError(Exception): | |||
pass | |||
class Parser(object): | |||
def __init__(self, ga, callback, temp=False): | |||
def __init__(self, ga, callback): | |||
self.ga = ga | |||
self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) | |||
for rule in ga.rules} | |||