| @@ -0,0 +1,46 @@ | |||||
| """This example demonstrates usage of the Indenter class. | |||||
| Since indentation is context-sensitive, a postlex stage is introduced to manufacture INDENT/DEDENT tokens. | |||||
| It is crucial for the indenter that the NL_type matches the spaces (and tabs) after the newline. | |||||
| """ | |||||
| from lark.lark import Lark | |||||
| from lark.indenter import Indenter | |||||
| tree_grammar = """ | |||||
| ?start: _NL* tree | |||||
| tree: /\w+/ _NL [_INDENT tree+ _DEDENT] | |||||
| NAME: /\w+/ | |||||
| WS.ignore: /\s+/ | |||||
| _NL.newline: /(\r?\n[\t ]*)+/ | |||||
| """ | |||||
class TreeIndenter(Indenter):
    """Indenter configuration for the tree grammar above.

    Maps the grammar's token-type names onto the postlex machinery so that
    INDENT/DEDENT tokens are manufactured from the _NL token's trailing
    whitespace.
    """
    NL_type = '_NL'  # token whose value carries the newline plus following indentation
    OPEN_PAREN_types = []  # no bracket tokens in this grammar: indentation is always significant
    CLOSE_PAREN_types = []
    INDENT_type = '_INDENT'  # injected when indentation increases
    DEDENT_type = '_DEDENT'  # injected when indentation decreases
    tab_len = 0  # NOTE(review): tabs count as zero width here -- confirm this is intended (8 is the usual value)
| parser = Lark(tree_grammar, parser='lalr', postlex=TreeIndenter()) | |||||
| test_tree = """ | |||||
| a | |||||
| b | |||||
| c | |||||
| d | |||||
| e | |||||
| f | |||||
| g | |||||
| """ | |||||
def test():
    """Parse the sample tree and print its pretty-formatted parse tree."""
    # print(...) call form is valid on both Python 2 and Python 3;
    # the original bare `print expr` statement is a SyntaxError on Python 3.
    print(parser.parse(test_tree).pretty())
| if __name__ == '__main__': | |||||
| test() | |||||
| @@ -0,0 +1,47 @@ | |||||
| "Provides Indentation services for languages with indentation similar to Python" | |||||
| from .lexer import Token | |||||
class Indenter:
    """Postlex processor that turns significant whitespace into tokens.

    Subclasses configure the token-type names (``NL_type``, ``INDENT_type``,
    ``DEDENT_type``, ``OPEN_PAREN_types``, ``CLOSE_PAREN_types``) and
    ``tab_len``.  Feed a token stream through :meth:`process`; every token is
    re-emitted, and INDENT/DEDENT tokens are injected wherever a newline
    changes the indentation level, Python-style.
    """

    def __init__(self):
        # Depth of currently open bracket pairs; indentation is not
        # significant while inside any of them.
        self.paren_level = 0
        # Stack of active indentation widths; the implicit outermost
        # level is column 0 and is never popped.
        self.indent_level = [0]

    def handle_NL(self, token):
        """Yield the INDENT/DEDENT tokens implied by newline *token*.

        The token's value must include the spaces/tabs that follow the
        newline, so the new indentation width can be measured from it.
        """
        if self.paren_level > 0:
            # Newlines inside parentheses/brackets are not significant.
            return

        indent_str = token.rsplit('\n', 1)[1]  # tabs and spaces after the last newline
        indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len

        if indent > self.indent_level[-1]:
            self.indent_level.append(indent)
            yield Token(self.INDENT_type, indent_str)
        else:
            # Emit one DEDENT for every indentation level this line closes.
            while indent < self.indent_level[-1]:
                self.indent_level.pop()
                yield Token(self.DEDENT_type, indent_str)

            # A dedent must land exactly on some enclosing level.
            assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])

    def process(self, stream):
        """Return an iterator over *stream* with INDENT/DEDENT injected.

        Fix: state is reset on every call (eagerly, before any token is
        consumed), so one Indenter instance can be reused across parses.
        Previously a failed or abandoned parse left stale paren/indent
        state behind, corrupting the next parse.
        """
        self.paren_level = 0
        self.indent_level = [0]
        return self._process(stream)

    def _process(self, stream):
        # Generator doing the actual work; see process() for the contract.
        for token in stream:
            yield token

            if token.type == self.NL_type:
                for t in self.handle_NL(token):
                    yield t

            if token.type in self.OPEN_PAREN_types:
                self.paren_level += 1
            if token.type in self.CLOSE_PAREN_types:
                self.paren_level -= 1
            assert self.paren_level >= 0

        # End of input closes any indentation levels still open.
        while len(self.indent_level) > 1:
            self.indent_level.pop()
            yield Token(self.DEDENT_type, '')

        assert self.indent_level == [0], self.indent_level
| @@ -23,7 +23,7 @@ class LarkOptions(object): | |||||
| only_lex - Don't build a parser. Useful for debugging (default: False) | only_lex - Don't build a parser. Useful for debugging (default: False) | ||||
| keep_all_tokens - Don't automagically remove "punctuation" tokens (default: True) | keep_all_tokens - Don't automagically remove "punctuation" tokens (default: True) | ||||
| cache_grammar - Cache the Lark grammar (Default: False) | cache_grammar - Cache the Lark grammar (Default: False) | ||||
| ignore_postproc - Don't call the post-processing function (default: False) | |||||
| postlex - Lexer post-processing (Default: None) | |||||
| """ | """ | ||||
| __doc__ += OPTIONS_DOC | __doc__ += OPTIONS_DOC | ||||
| def __init__(self, options_dict): | def __init__(self, options_dict): | ||||
| @@ -34,7 +34,7 @@ class LarkOptions(object): | |||||
| self.keep_all_tokens = bool(o.pop('keep_all_tokens', False)) | self.keep_all_tokens = bool(o.pop('keep_all_tokens', False)) | ||||
| self.tree_class = o.pop('tree_class', Tree) | self.tree_class = o.pop('tree_class', Tree) | ||||
| self.cache_grammar = o.pop('cache_grammar', False) | self.cache_grammar = o.pop('cache_grammar', False) | ||||
| self.ignore_postproc = bool(o.pop('ignore_postproc', False)) | |||||
| self.postlex = o.pop('postlex', None) | |||||
| self.parser = o.pop('parser', 'earley') | self.parser = o.pop('parser', 'earley') | ||||
| self.transformer = o.pop('transformer', None) | self.transformer = o.pop('transformer', None) | ||||
| @@ -206,7 +206,11 @@ class Lark: | |||||
| return f | return f | ||||
| def lex(self, text): | def lex(self, text): | ||||
| return self.lexer.lex(text) | |||||
| stream = self.lexer.lex(text) | |||||
| if self.options.postlex: | |||||
| return self.options.postlex.process(stream) | |||||
| else: | |||||
| return stream | |||||
| def parse(self, text): | def parse(self, text): | ||||
| assert not self.options.only_lex | assert not self.options.only_lex | ||||
| @@ -33,6 +33,7 @@ LIMIT = 50 # Stupid named groups limit in python re | |||||
| class Lexer(object): | class Lexer(object): | ||||
| def __init__(self, tokens, callbacks, ignore=()): | def __init__(self, tokens, callbacks, ignore=()): | ||||
| self.ignore = ignore | self.ignore = ignore | ||||
| self.newline_char = '\n' | |||||
| # Sanitization | # Sanitization | ||||
| token_names = {t[0] for t in tokens} | token_names = {t[0] for t in tokens} | ||||
| @@ -60,6 +61,8 @@ class Lexer(object): | |||||
| def lex(self, stream): | def lex(self, stream): | ||||
| lex_pos = 0 | lex_pos = 0 | ||||
| line = 0 | |||||
| col_start_pos = 0 | |||||
| while True: | while True: | ||||
| i = 0 | i = 0 | ||||
| for mre in self.mres: | for mre in self.mres: | ||||
| @@ -67,11 +70,17 @@ class Lexer(object): | |||||
| if m: | if m: | ||||
| value = m.group(0) | value = m.group(0) | ||||
| type_ = self.name_from_index[i][m.lastindex] | type_ = self.name_from_index[i][m.lastindex] | ||||
| t = Token(type_, value, lex_pos) | |||||
| if t.type in self.callbacks: | |||||
| self.callbacks[t.type](t) | |||||
| if t.type not in self.ignore: | |||||
| if type_ not in self.ignore: | |||||
| t = Token(type_, value, lex_pos) | |||||
| t.line = line | |||||
| t.column = lex_pos - col_start_pos | |||||
| if t.type in self.callbacks: | |||||
| t = self.callbacks[t.type](t) | |||||
| yield t | yield t | ||||
| newlines = value.count(self.newline_char) | |||||
| if newlines: | |||||
| line += newlines | |||||
| col_start_pos = lex_pos + value.rindex(self.newline_char) | |||||
| lex_pos += len(value) | lex_pos += len(value) | ||||
| break | break | ||||
| i += 1 | i += 1 | ||||
| @@ -4,7 +4,7 @@ class ParseError(Exception): | |||||
| pass | pass | ||||
| class Parser(object): | class Parser(object): | ||||
| def __init__(self, ga, callback, temp=False): | |||||
| def __init__(self, ga, callback): | |||||
| self.ga = ga | self.ga = ga | ||||
| self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) | self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) | ||||
| for rule in ga.rules} | for rule in ga.rules} | ||||