Browse Source

Added indentation support (for python-like languages)

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.1
Erez Shinan 8 years ago
parent
commit
9e71f5a045
5 changed files with 114 additions and 8 deletions
  1. +46
    -0
      examples/indented_tree.py
  2. +47
    -0
      lark/indenter.py
  3. +7
    -3
      lark/lark.py
  4. +13
    -4
      lark/lexer.py
  5. +1
    -1
      lark/parser.py

+ 46
- 0
examples/indented_tree.py View File

@@ -0,0 +1,46 @@
"""This example demonstrates usage of the Indenter class.

Since indentation is context-sensitive, a postlex stage is introduced to manufacture INDENT/DEDENT tokens.
It is crucial for the indenter that the NL_type matches the spaces (and tabs) after the newline.
"""

from lark.lark import Lark
from lark.indenter import Indenter

# Grammar for the demo: a tree is a word on its own line, optionally
# followed by an indented block of child trees (framed by the synthetic
# _INDENT/_DEDENT tokens that TreeIndenter injects).
# NOTE(review): NAME is declared but never referenced by any rule —
# `tree` inlines /\w+/ directly; confirm NAME is intentional.
# NOTE(review): this is a non-raw string, so Python interprets the
# \r, \n and \t escapes before lark sees the grammar — confirm the
# grammar loader expects the resulting control characters; a raw
# string (r"""...""") would keep the escapes literal.
tree_grammar = """
?start: _NL* tree

tree: /\w+/ _NL [_INDENT tree+ _DEDENT]

NAME: /\w+/

WS.ignore: /\s+/
_NL.newline: /(\r?\n[\t ]*)+/
"""

class TreeIndenter(Indenter):
    """Configuration of the generic Indenter postlexer for tree_grammar."""
    NL_type = '_NL'            # token whose trailing whitespace carries the indentation
    OPEN_PAREN_types = []      # no bracket tokens suspend indentation tracking here
    CLOSE_PAREN_types = []
    INDENT_type = '_INDENT'    # synthetic token emitted when indentation grows
    DEDENT_type = '_DEDENT'    # synthetic token emitted when indentation shrinks
    tab_len = 0                # tabs contribute nothing to the indent width

# LALR parser whose token stream is post-processed by TreeIndenter
# to manufacture the _INDENT/_DEDENT tokens the grammar expects.
parser = Lark(tree_grammar, parser='lalr', postlex=TreeIndenter())

# Sample input for the demo: children are indented under their parent,
# exercising the _NL/_INDENT/_DEDENT structure of tree_grammar.
# (The indentation is essential — without it every name parses as a
# sibling root and the example demonstrates nothing.)
test_tree = """
a
    b
    c
        d
        e
    f
        g
"""

def test():
    """Parse the sample tree and print its pretty-formatted parse tree."""
    # print() call form: valid on Python 3, and on Python 2 a single
    # parenthesized argument prints identically. The original Python-2-only
    # `print expr` statement is a SyntaxError on Python 3.
    print(parser.parse(test_tree).pretty())

if __name__ == '__main__':
    test()


+ 47
- 0
lark/indenter.py View File

@@ -0,0 +1,47 @@
"Provides Indentation services for languages with indentation similar to Python"

from .lexer import Token

class Indenter:
    """Postlex stage that converts newline tokens into INDENT/DEDENT pairs.

    Subclasses supply the configuration attributes: NL_type,
    OPEN_PAREN_types, CLOSE_PAREN_types, INDENT_type, DEDENT_type
    and tab_len.
    """

    def __init__(self):
        # Depth of open parens/brackets; indentation is ignored inside them.
        self.paren_level = 0
        # Stack of currently-open indentation widths; level 0 is always open.
        self.indent_level = [0]

    def handle_NL(self, token):
        """Yield the INDENT/DEDENT tokens implied by the whitespace in *token*."""
        if self.paren_level > 0:
            # Inside brackets a newline is not structurally significant.
            return

        # Only the whitespace after the last newline defines the new width.
        padding = token.rsplit('\n', 1)[1]
        width = padding.count(' ') + padding.count('\t') * self.tab_len

        if width > self.indent_level[-1]:
            # Deeper than before: open one new level.
            self.indent_level.append(width)
            yield Token(self.INDENT_type, padding)
        else:
            # Shallower (or equal): close levels until we match one.
            while width < self.indent_level[-1]:
                self.indent_level.pop()
                yield Token(self.DEDENT_type, padding)

            # A dedent must land exactly on a previously opened level.
            assert width == self.indent_level[-1], '%s != %s' % (width, self.indent_level[-1])

    def process(self, stream):
        """Pass *stream* through, injecting INDENT/DEDENT tokens as needed."""
        for tok in stream:
            yield tok

            kind = tok.type
            if kind == self.NL_type:
                for synthetic in self.handle_NL(tok):
                    yield synthetic

            if kind in self.OPEN_PAREN_types:
                self.paren_level += 1
            if kind in self.CLOSE_PAREN_types:
                self.paren_level -= 1
                assert self.paren_level >= 0

        # End of input: close every indentation level still open.
        while len(self.indent_level) > 1:
            self.indent_level.pop()
            yield Token(self.DEDENT_type, '')

        assert self.indent_level == [0], self.indent_level


+ 7
- 3
lark/lark.py View File

@@ -23,7 +23,7 @@ class LarkOptions(object):
only_lex - Don't build a parser. Useful for debugging (default: False)
keep_all_tokens - Don't automagically remove "punctuation" tokens (default: True)
cache_grammar - Cache the Lark grammar (Default: False)
ignore_postproc - Don't call the post-processing function (default: False)
postlex - Lexer post-processing (Default: None)
"""
__doc__ += OPTIONS_DOC
def __init__(self, options_dict):
@@ -34,7 +34,7 @@ class LarkOptions(object):
self.keep_all_tokens = bool(o.pop('keep_all_tokens', False))
self.tree_class = o.pop('tree_class', Tree)
self.cache_grammar = o.pop('cache_grammar', False)
self.ignore_postproc = bool(o.pop('ignore_postproc', False))
self.postlex = o.pop('postlex', None)
self.parser = o.pop('parser', 'earley')
self.transformer = o.pop('transformer', None)

@@ -206,7 +206,11 @@ class Lark:
return f

def lex(self, text):
    """Tokenize *text* and return the token stream.

    If a postlex processor was configured (e.g. an Indenter), the raw
    lexer stream is piped through its process(); otherwise the raw
    stream is returned unchanged.

    (Fix: the pasted diff left the stale pre-change
    `return self.lexer.lex(text)` line before the new body, which made
    the postlex path unreachable dead code.)
    """
    stream = self.lexer.lex(text)
    if self.options.postlex:
        return self.options.postlex.process(stream)
    else:
        return stream

def parse(self, text):
assert not self.options.only_lex


+ 13
- 4
lark/lexer.py View File

@@ -33,6 +33,7 @@ LIMIT = 50 # Stupid named groups limit in python re
class Lexer(object):
def __init__(self, tokens, callbacks, ignore=()):
self.ignore = ignore
self.newline_char = '\n'

# Sanitization
token_names = {t[0] for t in tokens}
@@ -60,6 +61,8 @@ class Lexer(object):

def lex(self, stream):
lex_pos = 0
line = 0
col_start_pos = 0
while True:
i = 0
for mre in self.mres:
@@ -67,11 +70,17 @@ class Lexer(object):
if m:
value = m.group(0)
type_ = self.name_from_index[i][m.lastindex]
t = Token(type_, value, lex_pos)
if t.type in self.callbacks:
self.callbacks[t.type](t)
if t.type not in self.ignore:
if type_ not in self.ignore:
t = Token(type_, value, lex_pos)
t.line = line
t.column = lex_pos - col_start_pos
if t.type in self.callbacks:
t = self.callbacks[t.type](t)
yield t
newlines = value.count(self.newline_char)
if newlines:
line += newlines
col_start_pos = lex_pos + value.rindex(self.newline_char)
lex_pos += len(value)
break
i += 1


+ 1
- 1
lark/parser.py View File

@@ -4,7 +4,7 @@ class ParseError(Exception):
pass

class Parser(object):
    def __init__(self, ga, callback):
        """Bind each grammar rule to its callback.

        ga -- grammar-analysis object exposing `rules`; each rule has
              `alias` and `origin` attributes.
        callback -- object whose attributes, looked up by rule alias
              (falling back to rule origin), are the per-rule callbacks;
              rules with no matching attribute map to None.

        (Fix: the pasted diff left both the old `def __init__(self, ga,
        callback, temp=False)` line and the new one; only the new
        signature is kept.)
        """
        self.ga = ga
        self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
                          for rule in ga.rules}


Loading…
Cancel
Save