@@ -1,13 +1,13 @@
-# Lark - a modern pure-Python parsing library
+# Lark - a modern parsing library

-Lark is a modern general-purpose Python parsing library, that focuses on simplicity and power.
+Lark is a modern general-purpose parsing library for Python.

-Lark accepts grammars as EBNF and lets you choose between two parsing algorithms:
+Lark focuses on simplicity and power. It lets you choose between two parsing algorithms:

- - Earley : Parses all context-free grammars (even ambiguous ones)!
+ - Earley : Parses all context-free grammars (even ambiguous ones)! It is the default.
  - LALR(1): Only LR grammars. Outperforms PLY and most if not all other pure-python parsing libraries.
-Both algorithms are pure-python implementations and can be used interchangably (aside for algorithmic restrictions).
+Both algorithms are written in Python and can be used interchangeably with the same grammar (aside from algorithmic restrictions). See "Comparison to other parsers" for more details.

 Lark can automagically build an AST from your grammar, without any more code on your part.
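To make the interchangeability claim concrete, here is a minimal sketch. Only the `Lark` constructor and its `parser` option (default `'earley'`, per `LarkOptions` further down this diff) are confirmed by the code; the grammar text and the `'lalr'` engine name are illustrative assumptions.

```python
# Sketch: the same grammar under either algorithm. The grammar and the 'lalr'
# value are assumptions; the `parser` option and its 'earley' default appear
# in LarkOptions later in this diff.
from lark import Lark

grammar = '''start: WORD WORD
             WORD: "Hello" | "World"'''

earley_parser = Lark(grammar)               # Earley is the default
lalr_parser = Lark(grammar, parser='lalr')  # hypothetical LALR(1) engine name
```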
@@ -41,10 +41,12 @@ Tree(start, [Token(WORD, Hello), Token(WORD, World)])

 Notice punctuation doesn't appear in the resulting tree. It's automatically filtered away by Lark.

-To learn more about Lark:
+## Learn more about using Lark

- - Learn how to parse json at the [tutorial](/docs/json_tutorial.md)
+ - Read the [tutorial](/docs/json_tutorial.md), which shows how to write a JSON parser in Lark.
+ - Browse the [examples](/examples), which include a calculator and a Python-code parser.

-## Features
+## List of Features

 - EBNF grammar with a little extra
 - Earley & LALR(1)
@@ -0,0 +1 @@
+from .lark import Lark, Transformer
@@ -39,6 +39,7 @@ class LarkOptions(object):
         self.parser = o.pop('parser', 'earley')
         self.transformer = o.pop('transformer', None)
         self.start = o.pop('start', 'start')
+        self.profile = o.pop('profile', False)    # XXX new
         assert self.parser in ENGINE_DICT
         if self.parser == 'earley' and self.transformer:
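A sketch of passing the new flag in: the option name and its `False` default come from the line just added; the grammar is a placeholder.

```python
# Hypothetical use of the new 'profile' option (popped with a False default
# above). The grammar here is a placeholder, not from this diff.
from lark import Lark

parser = Lark('''start: WORD
                 WORD: "hi"''', profile=True)
```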
@@ -50,6 +51,30 @@ class LarkOptions(object):
             raise ValueError("Unknown options: %s" % o.keys())

+
+import time
+from collections import defaultdict
+
+class Profiler:
+    def __init__(self):
+        self.total_time = defaultdict(float)
+        self.cur_section = '__init__'
+        self.last_enter_time = time.time()
+
+    def enter_section(self, name):
+        cur_time = time.time()
+        self.total_time[self.cur_section] += cur_time - self.last_enter_time
+        self.last_enter_time = cur_time
+        self.cur_section = name
+
+    def make_wrapper(self, name, f):
+        def _f(*args, **kwargs):
+            last_section = self.cur_section
+            self.enter_section(name)
+            try:
+                return f(*args, **kwargs)
+            finally:
+                self.enter_section(last_section)
+        return _f
+
+
 class Lark:
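To clarify the accounting model: elapsed time is charged to whichever section is current when `enter_section` is next called, i.e. to the section being *left*. A self-contained sketch (the class body is copied from the hunk above):

```python
# Minimal sketch of the Profiler's accounting. Class body as in the diff above.
import time
from collections import defaultdict

class Profiler:
    def __init__(self):
        self.total_time = defaultdict(float)
        self.cur_section = '__init__'
        self.last_enter_time = time.time()

    def enter_section(self, name):
        # Charge the elapsed time to the section we are leaving.
        cur_time = time.time()
        self.total_time[self.cur_section] += cur_time - self.last_enter_time
        self.last_enter_time = cur_time
        self.cur_section = name

    def make_wrapper(self, name, f):
        # Wrap f so its runtime is charged to `name`, then restore the
        # previous section when f returns (or raises).
        def _f(*args, **kwargs):
            last_section = self.cur_section
            self.enter_section(name)
            try:
                return f(*args, **kwargs)
            finally:
                self.enter_section(last_section)
        return _f

p = Profiler()
p.enter_section('lex')            # time since construction goes to '__init__'
time.sleep(0.1)
p.enter_section('parse')          # the ~0.1s goes to 'lex'
time.sleep(0.2)
p.enter_section('outside_lark')   # the ~0.2s goes to 'parse'
print(dict(p.total_time))         # {'__init__': ..., 'lex': ~0.1, 'parse': ~0.2}
```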
@@ -82,6 +107,8 @@ class Lark:
         if self.options.cache_grammar:
             raise NotImplementedError("Not available yet")

+        self.profiler = Profiler() if self.options.profile else None
+
         self.tokens, self.rules = load_grammar(grammar)
         self.lexer = self._build_lexer()
@@ -90,6 +117,9 @@ class Lark:
         self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
         self.parser = self._build_parser()

+        if self.profiler: self.profiler.enter_section('outside_lark')
+
     def _create_unless_callback(self, strs):
         def f(t):
             if t in strs:
@@ -105,8 +135,6 @@ class Lark:
             for flag in flags:
                 if flag == 'ignore':
                     ignore_tokens.append(name)
-                elif flag == 'newline':
-                    pass    # TODO
                 elif isinstance(flag, tuple) and flag[0] == 'unless':
                     _, strs = flag
                     callbacks[name] = self._create_unless_callback(strs)
@@ -119,6 +147,10 @@ class Lark:
     def _build_parser(self):
         rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
+        if self.profiler:
+            for f in dir(callback):
+                if not f.startswith('__'):
+                    setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
         return self.parser_engine.build_parser(rules, callback, self.options.start)
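The loop above swaps every public attribute of the transformer callback object for a timed wrapper, so transformer time lands in its own section. For one callback it amounts to this (continuing the Profiler sketch above; `transform_rule` is a stand-in, not a real Lark callback):

```python
# What the wrapping amounts to for a single callback.
def transform_rule(items):      # stand-in for one transformer method
    return sum(items)

profiler = Profiler()           # Profiler as sketched above
timed = profiler.make_wrapper('transformer', transform_rule)
print(timed([1, 2, 3]))         # 6 -- runtime charged to the 'transformer' section
```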
@@ -133,6 +165,16 @@ class Lark:
     def parse(self, text):
         assert not self.options.only_lex

-        l = list(self.lex(text))
-        return self.parser.parse(l)
+        if self.profiler:
+            self.profiler.enter_section('lex')
+            l = list(self.lex(text))
+            self.profiler.enter_section('parse')
+            try:
+                return self.parser.parse(l)
+            finally:
+                self.profiler.enter_section('outside_lark')
+        else:
+            l = list(self.lex(text))
+            return self.parser.parse(l)
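After a profiled parse, the per-section totals can be read off the profiler. A sketch: the attribute names (`profiler`, `total_time`) follow this diff; `parser` is the hypothetical `profile=True` instance from the earlier example.

```python
# Sketch: inspecting the accumulated per-section times after parsing.
tree = parser.parse('hi')
if parser.profiler:
    for section, seconds in sorted(parser.profiler.total_time.items()):
        print('%-15s %.6fs' % (section, seconds))
# expected sections: __init__, lex, parse, outside_lark (plus 'transformer'
# when transformer callbacks are wrapped)
```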
@@ -1,5 +1,7 @@
 ## Lexer Implementation

+import re
+
 from .utils import Str

 class LexError(Exception):
@@ -13,13 +15,6 @@ class Token(Str):
         inst.value = value
         return inst

-# class Token(object):
-#     def __init__(self, type, value, lexpos):
-#         self.type = type
-#         self.value = value
-#         self.lexpos = lexpos
-
     def __repr__(self):
         return 'Token(%s, %s)' % (self.type, self.value)
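The deleted commented-out class shows the old design; the surviving `Token(Str)` behaves as the matched text itself while carrying metadata. A sketch of that contract, assuming only that `Str` is a `str` subclass (per `.utils`) and the `(type, value, lex_pos)` construction used by the lexer below:

```python
# Sketch: a Token compares like its underlying string but carries lexer metadata.
t = Token('WORD', 'Hello', 0)   # type, value, lex position
assert t == 'Hello'             # behaves like the plain string
assert t.type == 'WORD'
print(repr(t))                  # Token(WORD, Hello)
```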
@@ -29,12 +24,11 @@ class Regex:
         self.flags = flags

-import re
-
-LIMIT = 50 # Stupid named groups limit in python re
-
 class Lexer(object):
     def __init__(self, tokens, callbacks, ignore=()):
         self.ignore = ignore
         self.newline_char = '\n'
         tokens = list(tokens)

         # Sanitization
         token_names = {t[0] for t in tokens}
@@ -49,42 +43,57 @@ class Lexer(object):
         self.tokens = tokens
         self.callbacks = callbacks

-        # self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
+        self.token_types = list(token_names)
+        self.type_index = {name:i for i,name in enumerate(self.token_types)}

+        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]]
+        self.ignore_types = [self.type_index[t] for t in ignore]

-        self.mres = []
-        self.name_from_index = []
-        x = list(tokens)
-        while x:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in x[:LIMIT]))
-            self.mres.append(mre)
-            self.name_from_index.append( {i:n for n,i in mre.groupindex.items()} )
-            x = x[LIMIT:]
+        self.mres = self._build_mres(tokens, len(tokens))
+    def _build_mres(self, tokens, max_size):
+        # Python sets an unreasonable group limit (currently 100) in its re module
+        # Worse, the only way to know we reached it is by catching an AssertionError!
+        # This function recursively tries fewer and fewer groups until it's successful.
+        mres = []
+        while tokens:
+            try:
+                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in tokens[:max_size]))
+            except AssertionError:  # Yes, this is what Python provides us.. :/
+                return self._build_mres(tokens, max_size//2)
+            mres.append((mre, {i:self.type_index[n] for n,i in mre.groupindex.items()} ))
+            tokens = tokens[max_size:]
+        return mres
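A standalone illustration of the limitation `_build_mres` works around, runnable outside Lark. On the Pythons this diff targets, exceeding 100 named groups raises a bare `AssertionError` from `re.compile`; later Python versions lifted the limit, so the snippet may simply compile.

```python
# Demo of the named-group limit that _build_mres halves its way around.
import re

pattern = '|'.join('(?P<T%d>x%d)' % (i, i) for i in range(300))
try:
    re.compile(pattern)
    print('no group limit on this Python')
except AssertionError:
    print('group limit hit; _build_mres would retry with max_size // 2')
```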
     def lex(self, stream):
         lex_pos = 0
         line = 1
         col_start_pos = 0
+        newline_types = list(self.newline_types)
+        ignore_types = list(self.ignore_types)
         while True:
-            i = 0
-            for mre in self.mres:
+            for mre, type_from_index in self.mres:
                 m = mre.match(stream, lex_pos)
                 if m:
                     value = m.group(0)
-                    type_ = self.name_from_index[i][m.lastindex]
-                    if type_ not in self.ignore:
-                        t = Token(type_, value, lex_pos)
+                    type_num = type_from_index[m.lastindex]
+                    if type_num not in ignore_types:
+                        t = Token(self.token_types[type_num], value, lex_pos)
                         t.line = line
                         t.column = lex_pos - col_start_pos
                         if t.type in self.callbacks:
                             t = self.callbacks[t.type](t)
                         yield t
-                    newlines = value.count(self.newline_char)
-                    if newlines:
-                        line += newlines
-                        col_start_pos = lex_pos + value.rindex(self.newline_char)
+                    if type_num in newline_types:
+                        newlines = value.count(self.newline_char)
+                        if newlines:
+                            line += newlines
+                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                     lex_pos += len(value)
                     break
-                i += 1
             else:
                 if lex_pos < len(stream):
                     context = stream[lex_pos:lex_pos+5]
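Putting the reworked loop together, a usage sketch for driving the `Lexer` directly. The token definitions are invented; the `(name, regex)` pair format, the `callbacks` dict, and the `ignore` list of token names follow this file.

```python
# Sketch: ignored types are dropped, and only newline_types (patterns that can
# match '\n') trigger the line/column recount.
lexer = Lexer([('NL', r'\n'), ('WORD', r'\w+'), ('WS', r' +')],
              callbacks={}, ignore=['WS'])
for tok in lexer.lex('Hello World\nBye'):
    print(tok.type, repr(str(tok)), tok.line, tok.column)
# WS tokens are suppressed; NL is a newline_type, so 'Bye' reports line 2.
```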
@@ -334,6 +334,13 @@ def _make_parser_test(PARSER):
             x = g.parse('a')
             self.assertEqual(x.data, "b")

+        def test_lexer_token_limit(self):
+            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
+            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
+            g = _Lark("""start: %s
+                      %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))
+
     _NAME = "Test" + PARSER.capitalize()
     _TestParser.__name__ = _NAME
     globals()[_NAME] = _TestParser
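For reference, here is the grammar string that `test_lexer_token_limit` builds, shown for a small N instead of 300 (dict ordering as on modern Pythons):

```python
# The generated grammar: one start rule referencing 300 tokens, each defined
# as a quoted string, forcing the lexer past the 100-group regex limit.
tokens = {'A%d' % i: '"%d"' % i for i in range(3)}
print("""start: %s
%s""" % (' '.join(tokens), '\n'.join("%s: %s" % x for x in tokens.items())))
# start: A0 A1 A2
# A0: "0"
# A1: "1"
# A2: "2"
```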