@@ -1,13 +1,13 @@
# Lark - a modern pure-Python parsing library
# Lark - a modern parsing library
Lark is a modern general-purpose Python parsing library that focuses on simplicity and power.
Lark is a modern general-purpose parsing library for Python.
Lark accepts grammars as EBNF and lets you choose between two parsing algorithms:
Lark focuses on simplicity and power. It lets you choose between two parsing algorithms:
- Earley : Parses all context-free grammars (even ambiguous ones)!
- Earley : Parses all context-free grammars (even ambiguous ones)! It is the default.
- LALR(1): Only LR grammars. Outperforms PLY and most, if not all, other pure-Python parsing libraries.
Both algorithms are pure-Python implementations and can be used interchangeably (aside from algorithmic restrictions).
Both algorithms are written in Python and can be used interchangeably with the same grammar (aside from algorithmic restrictions). See "Comparison to other parsers" for more details.
Lark can automagically build an AST from your grammar, without any more code on your part.
@@ -41,10 +41,12 @@ Tree(start, [Token(WORD, Hello), Token(WORD, World)])
Notice punctuation doesn't appear in the resulting tree. It's automatically filtered away by Lark.
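For context, here is a minimal sketch of the kind of code that produces the tree above. The `Lark(...)` and `.parse(...)` calls match the class in this changeset; the grammar syntax (quoted regex token definitions, anonymous `","`/`"!"` literals) is inferred from the test added further down and may differ slightly in this early version:

```python
from lark import Lark

# Two WORD tokens separated by literal punctuation; the quoted "," and "!"
# become anonymous tokens and are filtered out of the resulting tree.
parser = Lark(r'''start: WORD "," WORD "!"
                  WORD: "\w+"
               ''')

print(parser.parse("Hello,World!"))
# -> Tree(start, [Token(WORD, Hello), Token(WORD, World)])
```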
To learn more about Lark:
- Learn how to parse JSON at the [tutorial](/docs/json_tutorial.md)
## Learn more about using Lark
## Features
- Read the [tutorial](/docs/json_tutorial.md), which shows how to write a JSON parser in Lark.
- Browse the [examples](/examples), which include a calculator and a Python-code parser.
## List of Features
- EBNF grammar with a little extra
- Earley & LALR(1)
@@ -0,0 +1 @@
from .lark import Lark, Transformer
@@ -39,6 +39,7 @@ class LarkOptions(object):
        self.parser = o.pop('parser', 'earley')
        self.transformer = o.pop('transformer', None)
        self.start = o.pop('start', 'start')
        self.profile = o.pop('profile', False)  # XXX new
        assert self.parser in ENGINE_DICT
        if self.parser == 'earley' and self.transformer:
@@ -50,6 +51,30 @@ class LarkOptions(object):
            raise ValueError("Unknown options: %s" % o.keys())

import time
from collections import defaultdict

class Profiler:
    def __init__(self):
        self.total_time = defaultdict(float)
        self.cur_section = '__init__'
        self.last_enter_time = time.time()

    def enter_section(self, name):
        cur_time = time.time()
        self.total_time[self.cur_section] += cur_time - self.last_enter_time
        self.last_enter_time = cur_time
        self.cur_section = name

    def make_wrapper(self, name, f):
        def _f(*args, **kwargs):
            last_section = self.cur_section
            self.enter_section(name)
            try:
                return f(*args, **kwargs)
            finally:
                self.enter_section(last_section)
        return _f
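A small usage sketch of the `Profiler` above, outside of Lark, to make the timing model concrete (the import path assumes `Profiler` stays in `lark/lark.py` as in this hunk; it is not re-exported by `__init__.py`):

```python
import time
from lark.lark import Profiler  # assumed module path; not part of the public API

p = Profiler()              # timing starts in the '__init__' section
p.enter_section('lex')
time.sleep(0.01)            # pretend lexing work
p.enter_section('parse')
time.sleep(0.02)            # pretend parsing work
p.enter_section('outside_lark')

# make_wrapper charges a callable's runtime to the given section,
# then restores whichever section was active before the call.
timed_cb = p.make_wrapper('transformer', lambda: time.sleep(0.005))
timed_cb()

print(dict(p.total_time))
# e.g. {'__init__': ..., 'lex': ~0.01, 'parse': ~0.02, 'outside_lark': ..., 'transformer': ~0.005}
```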
class Lark:
@@ -82,6 +107,8 @@ class Lark:
        if self.options.cache_grammar:
            raise NotImplementedError("Not available yet")

        self.profiler = Profiler() if self.options.profile else None

        self.tokens, self.rules = load_grammar(grammar)
        self.lexer = self._build_lexer()
@@ -90,6 +117,9 @@ class Lark:
        self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
        self.parser = self._build_parser()

        if self.profiler: self.profiler.enter_section('outside_lark')

    def _create_unless_callback(self, strs):
        def f(t):
            if t in strs:
@@ -105,8 +135,6 @@ class Lark:
            for flag in flags:
                if flag == 'ignore':
                    ignore_tokens.append(name)
                elif flag == 'newline':
                    pass # TODO
                elif isinstance(flag, tuple) and flag[0] == 'unless':
                    _, strs = flag
                    callbacks[name] = self._create_unless_callback(strs)
@@ -119,6 +147,10 @@ class Lark:
    def _build_parser(self):
        rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
        if self.profiler:
            for f in dir(callback):
                if not f.startswith('__'):
                    setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
        return self.parser_engine.build_parser(rules, callback, self.options.start)
@@ -133,6 +165,16 @@ class Lark:
    def parse(self, text):
        assert not self.options.only_lex

        l = list(self.lex(text))
        return self.parser.parse(l)

        if self.profiler:
            self.profiler.enter_section('lex')
            l = list(self.lex(text))
            self.profiler.enter_section('parse')
            try:
                return self.parser.parse(l)
            finally:
                self.profiler.enter_section('outside_lark')
        else:
            l = list(self.lex(text))
            return self.parser.parse(l)
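A sketch of how the new `profile` option might be used by calling code, reading back the per-section totals collected above (the toy grammar reuses the syntax assumed in the README sketch):

```python
from lark import Lark

grammar = r'''start: WORD "," WORD "!"
              WORD: "\w+"
           '''

parser = Lark(grammar, profile=True)   # attaches a Profiler instance as parser.profiler

parser.parse("Hello,World!")           # time is charged to the 'lex' and 'parse' sections

# Profiler.total_time maps section name -> accumulated seconds
for section, seconds in sorted(parser.profiler.total_time.items()):
    print('%-15s %.6f' % (section, seconds))
```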
@@ -1,5 +1,7 @@
## Lexer Implementation

import re

from .utils import Str

class LexError(Exception):
@@ -13,13 +15,6 @@ class Token(Str):
        inst.value = value
        return inst

# class Token(object):
#     def __init__(self, type, value, lexpos):
#         self.type = type
#         self.value = value
#         self.lexpos = lexpos

    def __repr__(self):
        return 'Token(%s, %s)' % (self.type, self.value)
@@ -29,12 +24,11 @@ class Regex:
        self.flags = flags

import re

LIMIT = 50 # Stupid named groups limit in python re

class Lexer(object):
    def __init__(self, tokens, callbacks, ignore=()):
        self.ignore = ignore
        self.newline_char = '\n'
        tokens = list(tokens)

        # Sanitization
        token_names = {t[0] for t in tokens}
@@ -49,42 +43,57 @@ class Lexer(object):
        self.tokens = tokens
        self.callbacks = callbacks

        # self.tokens.sort(key=lambda x:len(x[1]), reverse=True)

        self.token_types = list(token_names)
        self.type_index = {name:i for i,name in enumerate(self.token_types)}

        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]]
        self.ignore_types = [self.type_index[t] for t in ignore]

        self.mres = []
        self.name_from_index = []
        x = list(tokens)
        while x:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in x[:LIMIT]))
            self.mres.append(mre)
            self.name_from_index.append( {i:n for n,i in mre.groupindex.items()} )
            x = x[LIMIT:]
        self.mres = self._build_mres(tokens, len(tokens))

    def _build_mres(self, tokens, max_size):
        # Python sets an unreasonable group limit (currently 100) in its re module
        # Worse, the only way to know we reached it is by catching an AssertionError!
        # This function recursively tries less and less groups until it's successful.
        mres = []
        while tokens:
            try:
                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in tokens[:max_size]))
            except AssertionError: # Yes, this is what Python provides us.. :/
                return self._build_mres(tokens, max_size/2)

            mres.append((mre, {i:self.type_index[n] for n,i in mre.groupindex.items()} ))
            tokens = tokens[max_size:]
        return mres

    def lex(self, stream):
        lex_pos = 0
        line = 1
        col_start_pos = 0
        newline_types = list(self.newline_types)
        ignore_types = list(self.ignore_types)
        while True:
            i = 0
            for mre in self.mres:
            for mre, type_from_index in self.mres:
                m = mre.match(stream, lex_pos)
                if m:
                    value = m.group(0)
                    type_ = self.name_from_index[i][m.lastindex]
                    if type_ not in self.ignore:
                        t = Token(type_, value, lex_pos)
                    type_num = type_from_index[m.lastindex]
                    if type_num not in ignore_types:
                        t = Token(self.token_types[type_num], value, lex_pos)
                        t.line = line
                        t.column = lex_pos - col_start_pos
                        if t.type in self.callbacks:
                            t = self.callbacks[t.type](t)
                        yield t

                    newlines = value.count(self.newline_char)
                    if newlines:
                        line += newlines
                        col_start_pos = lex_pos + value.rindex(self.newline_char)
                    if type_num in newline_types:
                        newlines = value.count(self.newline_char)
                        if newlines:
                            line += newlines
                            col_start_pos = lex_pos + value.rindex(self.newline_char)

                    lex_pos += len(value)
                    break
                i += 1
            else:
                if lex_pos < len(stream):
                    context = stream[lex_pos:lex_pos+5]
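To illustrate the reworked lexer interface in isolation: tokens are `(name, regex)` pairs, `ignore` names are resolved to numeric types up front, and only tokens whose pattern can contain a newline drive the line/column bookkeeping. A standalone sketch (assuming the module lives at `lark/lexer.py`; the sanitization code not shown in this hunk is assumed to accept these patterns):

```python
from lark.lexer import Lexer   # assumed module path

tokens = [
    ('NUMBER',  r'\d+'),
    ('PLUS',    r'\+'),
    ('NEWLINE', r'\n'),         # pattern contains '\n', so it ends up in newline_types
    ('WS',      r'[ \t]+'),
]

lexer = Lexer(tokens, callbacks={}, ignore=['WS'])

for tok in lexer.lex("1 + 2\n+ 30"):
    # Token is a Str subclass carrying .type, .line and .column
    print(tok.type, repr(str(tok)), tok.line, tok.column)
```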
@@ -334,6 +334,13 @@ def _make_parser_test(PARSER):
            x = g.parse('a')
            self.assertEqual(x.data, "b")

        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
            g = _Lark("""start: %s
                      %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))

    _NAME = "Test" + PARSER.capitalize()
    _TestParser.__name__ = _NAME
    globals()[_NAME] = _TestParser