@@ -1,13 +1,13 @@ | |||||
# Lark - a modern pure-Python parsing library | |||||
# Lark - a modern parsing library | |||||
Lark is a modern general-purpose Python parsing library, that focuses on simplicity and power. | |||||
Lark is a modern general-purpose parsing library for Python. | |||||
Lark accepts grammars as EBNF and lets you choose between two parsing algorithms: | |||||
Lark focuses on simplicity and power. It lets you choose between two parsing algorithms: | |||||
- Earley : Parses all context-free grammars (even ambiguous ones)! | |||||
- Earley : Parses all context-free grammars (even ambiguous ones)! It is the default. | |||||
- LALR(1): Only LR grammars. Outperforms PLY and most if not all other pure-python parsing libraries. | - LALR(1): Only LR grammars. Outperforms PLY and most if not all other pure-python parsing libraries. | ||||
Both algorithms are pure-python implementations and can be used interchangeably (aside from algorithmic restrictions). | |||||
Both algorithms are written in Python and can be used interchangeably with the same grammar (aside from algorithmic restrictions). See "Comparison to other parsers" for more details. | |||||
Lark can automagically build an AST from your grammar, without any more code on your part. | Lark can automagically build an AST from your grammar, without any more code on your part. | ||||
@@ -41,10 +41,12 @@ Tree(start, [Token(WORD, Hello), Token(WORD, World)]) | |||||
Notice punctuation doesn't appear in the resulting tree. It's automatically filtered away by Lark. | Notice punctuation doesn't appear in the resulting tree. It's automatically filtered away by Lark. | ||||
To learn more about Lark: | |||||
- Learn how to parse json at the [tutorial](/docs/json_tutorial.md) | |||||
## Learn more about using Lark | |||||
## Features | |||||
- Read the [tutorial](/docs/json_tutorial.md), which shows how to write a JSON parser in Lark. | |||||
- Browse the [examples](/examples), which include a calculator and a Python-code parser. | |||||
## List of Features | |||||
- EBNF grammar with a little extra | - EBNF grammar with a little extra | ||||
- Earley & LALR(1) | - Earley & LALR(1) | ||||
@@ -0,0 +1 @@ | |||||
from .lark import Lark, Transformer |
@@ -39,6 +39,7 @@ class LarkOptions(object): | |||||
self.parser = o.pop('parser', 'earley') | self.parser = o.pop('parser', 'earley') | ||||
self.transformer = o.pop('transformer', None) | self.transformer = o.pop('transformer', None) | ||||
self.start = o.pop('start', 'start') | self.start = o.pop('start', 'start') | ||||
self.profile = o.pop('profile', False) # XXX new | |||||
assert self.parser in ENGINE_DICT | assert self.parser in ENGINE_DICT | ||||
if self.parser == 'earley' and self.transformer: | if self.parser == 'earley' and self.transformer: | ||||
@@ -50,6 +51,30 @@ class LarkOptions(object): | |||||
raise ValueError("Unknown options: %s" % o.keys()) | raise ValueError("Unknown options: %s" % o.keys()) | ||||
import time | |||||
from collections import defaultdict | |||||
class Profiler(object):
    """Accumulate the time spent in named sections.

    Exactly one section is current at any moment. Switching sections charges
    the time elapsed since the previous switch to the section being left.

    Attributes:
        total_time: maps section name to accumulated seconds (float).
        cur_section: name of the section currently being timed.
        last_enter_time: clock reading taken at the most recent section switch.
    """

    # Only differences between readings are used, so prefer a monotonic,
    # high-resolution clock: time.time() can jump when the system clock is
    # adjusted. Fall back to time.time() where perf_counter does not exist.
    _clock = staticmethod(getattr(time, 'perf_counter', time.time))

    def __init__(self):
        self.total_time = defaultdict(float)
        self.cur_section = '__init__'
        self.last_enter_time = self._clock()

    def enter_section(self, name):
        """Charge the elapsed time to the current section, then make `name` current."""
        cur_time = self._clock()
        self.total_time[self.cur_section] += cur_time - self.last_enter_time
        self.last_enter_time = cur_time
        self.cur_section = name

    def make_wrapper(self, name, f):
        """Return a callable that runs `f` while section `name` is current.

        The section that was current when the wrapper was called is restored
        afterwards, even if `f` raises. The wrapper returns whatever `f` returns.
        """
        def _f(*args, **kwargs):
            last_section = self.cur_section
            self.enter_section(name)
            try:
                return f(*args, **kwargs)
            finally:
                # Always hand the clock back to the caller's section.
                self.enter_section(last_section)
        return _f
class Lark: | class Lark: | ||||
@@ -82,6 +107,8 @@ class Lark: | |||||
if self.options.cache_grammar: | if self.options.cache_grammar: | ||||
raise NotImplementedError("Not available yet") | raise NotImplementedError("Not available yet") | ||||
self.profiler = Profiler() if self.options.profile else None | |||||
self.tokens, self.rules = load_grammar(grammar) | self.tokens, self.rules = load_grammar(grammar) | ||||
self.lexer = self._build_lexer() | self.lexer = self._build_lexer() | ||||
@@ -90,6 +117,9 @@ class Lark: | |||||
self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class) | self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class) | ||||
self.parser = self._build_parser() | self.parser = self._build_parser() | ||||
if self.profiler: self.profiler.enter_section('outside_lark') | |||||
def _create_unless_callback(self, strs): | def _create_unless_callback(self, strs): | ||||
def f(t): | def f(t): | ||||
if t in strs: | if t in strs: | ||||
@@ -105,8 +135,6 @@ class Lark: | |||||
for flag in flags: | for flag in flags: | ||||
if flag == 'ignore': | if flag == 'ignore': | ||||
ignore_tokens.append(name) | ignore_tokens.append(name) | ||||
elif flag == 'newline': | |||||
pass # TODO | |||||
elif isinstance(flag, tuple) and flag[0] == 'unless': | elif isinstance(flag, tuple) and flag[0] == 'unless': | ||||
_, strs = flag | _, strs = flag | ||||
callbacks[name] = self._create_unless_callback(strs) | callbacks[name] = self._create_unless_callback(strs) | ||||
@@ -119,6 +147,10 @@ class Lark: | |||||
def _build_parser(self):
    """Build the parser for this grammar via the configured parser engine.

    Asks the parse-tree builder for the rules and the callback object, then
    passes them to `self.parser_engine.build_parser` together with the start
    option. When profiling is enabled, each callable non-dunder attribute of
    the callback object is wrapped so that its run time is charged to the
    'transformer' section.
    """
    rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
    if self.profiler:
        for attr_name in dir(callback):
            if attr_name.startswith('__'):
                continue
            attr = getattr(callback, attr_name)
            # Only callables can be timed; wrapping a plain data attribute
            # would replace its value with a function.
            if callable(attr):
                setattr(callback, attr_name, self.profiler.make_wrapper('transformer', attr))
    return self.parser_engine.build_parser(rules, callback, self.options.start)
@@ -133,6 +165,16 @@ class Lark: | |||||
def parse(self, text):
    """Lex `text` and run the parser over the resulting token list.

    Must not be called when the `only_lex` option is set. Returns whatever
    `self.parser.parse` returns. When a profiler is attached, lexing is
    charged to the 'lex' section and parsing to the 'parse' section; the
    profiler is switched back to 'outside_lark' on the way out, whether the
    call succeeds or raises.
    """
    assert not self.options.only_lex

    if not self.profiler:
        l = list(self.lex(text))
        return self.parser.parse(l)

    self.profiler.enter_section('lex')
    try:
        # Lexing sits inside the try block as well, so that a lexer error
        # does not leave the profiler stuck in the 'lex' section.
        l = list(self.lex(text))
        self.profiler.enter_section('parse')
        return self.parser.parse(l)
    finally:
        self.profiler.enter_section('outside_lark')
@@ -1,5 +1,7 @@ | |||||
## Lexer Implementation | ## Lexer Implementation | ||||
import re | |||||
from .utils import Str | from .utils import Str | ||||
class LexError(Exception): | class LexError(Exception): | ||||
@@ -13,13 +15,6 @@ class Token(Str): | |||||
inst.value = value | inst.value = value | ||||
return inst | return inst | ||||
# class Token(object): | |||||
# def __init__(self, type, value, lexpos): | |||||
# self.type = type | |||||
# self.value = value | |||||
# self.lexpos = lexpos | |||||
def __repr__(self): | def __repr__(self): | ||||
return 'Token(%s, %s)' % (self.type, self.value) | return 'Token(%s, %s)' % (self.type, self.value) | ||||
@@ -29,12 +24,11 @@ class Regex: | |||||
self.flags = flags | self.flags = flags | ||||
import re | |||||
LIMIT = 50 # Stupid named groups limit in python re | |||||
class Lexer(object): | class Lexer(object): | ||||
def __init__(self, tokens, callbacks, ignore=()): | def __init__(self, tokens, callbacks, ignore=()): | ||||
self.ignore = ignore | self.ignore = ignore | ||||
self.newline_char = '\n' | self.newline_char = '\n' | ||||
tokens = list(tokens) | |||||
# Sanitization | # Sanitization | ||||
token_names = {t[0] for t in tokens} | token_names = {t[0] for t in tokens} | ||||
@@ -49,42 +43,57 @@ class Lexer(object): | |||||
self.tokens = tokens | self.tokens = tokens | ||||
self.callbacks = callbacks | self.callbacks = callbacks | ||||
# self.tokens.sort(key=lambda x:len(x[1]), reverse=True) | |||||
self.token_types = list(token_names) | |||||
self.type_index = {name:i for i,name in enumerate(self.token_types)} | |||||
self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]] | |||||
self.ignore_types = [self.type_index[t] for t in ignore] | |||||
self.mres = [] | |||||
self.name_from_index = [] | |||||
x = list(tokens) | |||||
while x: | |||||
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in x[:LIMIT])) | |||||
self.mres.append(mre) | |||||
self.name_from_index.append( {i:n for n,i in mre.groupindex.items()} ) | |||||
x = x[LIMIT:] | |||||
self.mres = self._build_mres(tokens, len(tokens)) | |||||
def _build_mres(self, tokens, max_size):
    """Compile `tokens` into alternation regexps of at most `max_size` tokens each.

    tokens: sequence of (name, pattern) pairs; each becomes one named group.
    max_size: maximum number of tokens to join into a single regexp.

    Returns a list of (compiled_regexp, {group_index: token_type_index})
    pairs, in token order. The type indexes come from `self.type_index`.

    Raises AssertionError if even a single token cannot be compiled on its own.
    """
    # Python's re module may enforce a limit on the number of groups in one
    # pattern (100 in older versions), and the only way to know we reached it
    # is by catching an AssertionError!
    # On failure, retry the remaining tokens with chunks half the size.
    mres = []
    while tokens:
        try:
            mre = re.compile(u'|'.join(u'(?P<%s>%s)' % t for t in tokens[:max_size]))
        except AssertionError:  # Yes, this is what Python provides us.. :/
            if max_size <= 1:
                # Cannot split any further; a chunk size of 0 would loop forever.
                raise
            # Integer division keeps max_size usable as a slice bound, and the
            # regexps already built for earlier chunks must be kept.
            return mres + self._build_mres(tokens, max_size // 2)

        mres.append((mre, {i: self.type_index[n] for n, i in mre.groupindex.items()}))
        tokens = tokens[max_size:]
    return mres
def lex(self, stream): | def lex(self, stream): | ||||
lex_pos = 0 | lex_pos = 0 | ||||
line = 1 | line = 1 | ||||
col_start_pos = 0 | col_start_pos = 0 | ||||
newline_types = list(self.newline_types) | |||||
ignore_types = list(self.ignore_types) | |||||
while True: | while True: | ||||
i = 0 | |||||
for mre in self.mres: | |||||
for mre, type_from_index in self.mres: | |||||
m = mre.match(stream, lex_pos) | m = mre.match(stream, lex_pos) | ||||
if m: | if m: | ||||
value = m.group(0) | value = m.group(0) | ||||
type_ = self.name_from_index[i][m.lastindex] | |||||
if type_ not in self.ignore: | |||||
t = Token(type_, value, lex_pos) | |||||
type_num = type_from_index[m.lastindex] | |||||
if type_num not in ignore_types: | |||||
t = Token(self.token_types[type_num], value, lex_pos) | |||||
t.line = line | t.line = line | ||||
t.column = lex_pos - col_start_pos | t.column = lex_pos - col_start_pos | ||||
if t.type in self.callbacks: | if t.type in self.callbacks: | ||||
t = self.callbacks[t.type](t) | t = self.callbacks[t.type](t) | ||||
yield t | yield t | ||||
newlines = value.count(self.newline_char) | |||||
if newlines: | |||||
line += newlines | |||||
col_start_pos = lex_pos + value.rindex(self.newline_char) | |||||
if type_num in newline_types: | |||||
newlines = value.count(self.newline_char) | |||||
if newlines: | |||||
line += newlines | |||||
col_start_pos = lex_pos + value.rindex(self.newline_char) | |||||
lex_pos += len(value) | lex_pos += len(value) | ||||
break | break | ||||
i += 1 | |||||
else: | else: | ||||
if lex_pos < len(stream): | if lex_pos < len(stream): | ||||
context = stream[lex_pos:lex_pos+5] | context = stream[lex_pos:lex_pos+5] | ||||
@@ -334,6 +334,13 @@ def _make_parser_test(PARSER): | |||||
x = g.parse('a') | x = g.parse('a') | ||||
self.assertEqual(x.data, "b") | self.assertEqual(x.data, "b") | ||||
def test_lexer_token_limit(self):
    """Build a grammar with 300 tokens, more than a single regexp may be allowed to hold as groups."""
    tokens = dict(('A%d' % i, '"%d"' % i) for i in range(300))
    # The start rule references every token; one definition line per token follows it.
    start_body = ' '.join(tokens)
    token_defs = '\n'.join("%s: %s" % pair for pair in tokens.items())
    g = _Lark('start: %s\n%s' % (start_body, token_defs))
_NAME = "Test" + PARSER.capitalize() | _NAME = "Test" + PARSER.capitalize() | ||||
_TestParser.__name__ = _NAME | _TestParser.__name__ = _NAME | ||||
globals()[_NAME] = _TestParser | globals()[_NAME] = _TestParser | ||||