From 8b9c5801daca8cf8db4e6d3783e556b02d1da2a0 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Fri, 10 Feb 2017 11:50:50 +0200
Subject: [PATCH] Improved lexer, added profiler option to Lark

---
 README.md                 | 18 ++++++-----
 lark/__init__.py          |  1 +
 lark/lark.py              | 50 +++++++++++++++++++++++++++---
 lark/lexer.py             | 65 ++++++++++++++++++++++-----------------
 lark/tests/test_parser.py |  7 +++++
 5 files changed, 101 insertions(+), 40 deletions(-)

diff --git a/README.md b/README.md
index faf3698..5a30750 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,13 @@
-# Lark - a modern pure-Python parsing library
+# Lark - a modern parsing library
 
-Lark is a modern general-purpose Python parsing library, that focuses on simplicity and power.
+Lark is a modern general-purpose parsing library for Python.
 
-Lark accepts grammars as EBNF and lets you choose between two parsing algorithms:
+Lark focuses on simplicity and power. It lets you choose between two parsing algorithms:
 
- - Earley : Parses all context-free grammars (even ambiguous ones)!
+ - Earley : Parses all context-free grammars (even ambiguous ones)! It is the default.
  - LALR(1): Only LR grammars. Outperforms PLY and most if not all other pure-python parsing libraries.
 
-Both algorithms are pure-python implementations and can be used interchangably (aside for algorithmic restrictions).
+Both algorithms are written in Python and can be used interchangeably with the same grammar (aside from algorithmic restrictions). See "Comparison to other parsers" for more details.
 
 Lark can automagically build an AST from your grammar, without any more code on your part.
 
@@ -41,10 +41,12 @@ Tree(start, [Token(WORD, Hello), Token(WORD, World)])
 
 Notice punctuation doesn't appear in the resulting tree. It's automatically filtered away by Lark.
 
-To learn more about Lark:
- - Learn how to parse json at the [tutorial](/docs/json_tutorial.md)
+## Learn more about using Lark
 
-## Features
+ - Read the [tutorial](/docs/json_tutorial.md), which shows how to write a JSON parser in Lark.
+ - Browse the [examples](/examples), which include a calculator and a Python-code parser.
+
+## List of Features
 
 - EBNF grammar with a little extra
 - Earley & LALR(1)
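A minimal sketch of how the two algorithms described in the README might be selected from user code. The grammar dialect (a rule plus quoted terminals) mirrors the test added later in this patch; the engine name 'lalr' is an assumption, since the lark.py hunks below only confirm 'earley' as the default value of the parser option.

    from lark import Lark

    # Tiny grammar in the style used by the tests in this patch:
    # one rule ("start") and two terminals defined by string literals.
    grammar = """start: GREETING NAME
                 GREETING: "hello "
                 NAME: "world"
              """

    # Earley is the default algorithm, so no option is needed.
    earley_parser = Lark(grammar)
    print(earley_parser.parse("hello world"))

    # Hypothetical: pick LALR(1) instead, assuming the engine is registered
    # in ENGINE_DICT under the name 'lalr'.
    lalr_parser = Lark(grammar, parser='lalr')
    print(lalr_parser.parse("hello world"))
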
diff --git a/lark/__init__.py b/lark/__init__.py
index e69de29..11fc59f 100644
--- a/lark/__init__.py
+++ b/lark/__init__.py
@@ -0,0 +1 @@
+from .lark import Lark, Transformer
diff --git a/lark/lark.py b/lark/lark.py
index badabe0..8d4ad96 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -39,6 +39,7 @@ class LarkOptions(object):
         self.parser = o.pop('parser', 'earley')
         self.transformer = o.pop('transformer', None)
         self.start = o.pop('start', 'start')
+        self.profile = o.pop('profile', False)      # XXX new
 
         assert self.parser in ENGINE_DICT
         if self.parser == 'earley' and self.transformer:
@@ -50,6 +51,30 @@ class LarkOptions(object):
             raise ValueError("Unknown options: %s" % o.keys())
 
 
+import time
+from collections import defaultdict
+class Profiler:
+    def __init__(self):
+        self.total_time = defaultdict(float)
+        self.cur_section = '__init__'
+        self.last_enter_time = time.time()
+
+    def enter_section(self, name):
+        cur_time = time.time()
+        self.total_time[self.cur_section] += cur_time - self.last_enter_time
+        self.last_enter_time = cur_time
+        self.cur_section = name
+
+    def make_wrapper(self, name, f):
+        def _f(*args, **kwargs):
+            last_section = self.cur_section
+            self.enter_section(name)
+            try:
+                return f(*args, **kwargs)
+            finally:
+                self.enter_section(last_section)
+
+        return _f
 
 
 class Lark:
@@ -82,6 +107,8 @@ class Lark:
         if self.options.cache_grammar:
             raise NotImplementedError("Not available yet")
 
+        self.profiler = Profiler() if self.options.profile else None
+
         self.tokens, self.rules = load_grammar(grammar)
 
         self.lexer = self._build_lexer()
@@ -90,6 +117,9 @@ class Lark:
         self.parse_tree_builder = ParseTreeBuilder(self.options.tree_class)
         self.parser = self._build_parser()
 
+        if self.profiler: self.profiler.enter_section('outside_lark')
+
+
     def _create_unless_callback(self, strs):
         def f(t):
             if t in strs:
@@ -105,8 +135,6 @@ class Lark:
             for flag in flags:
                 if flag == 'ignore':
                     ignore_tokens.append(name)
-                elif flag == 'newline':
-                    pass    # TODO
                 elif isinstance(flag, tuple) and flag[0] == 'unless':
                     _, strs = flag
                     callbacks[name] = self._create_unless_callback(strs)
@@ -119,6 +147,10 @@ class Lark:
 
     def _build_parser(self):
         rules, callback = self.parse_tree_builder.create_tree_builder(self.rules, self.options.transformer)
+        if self.profiler:
+            for f in dir(callback):
+                if not f.startswith('__'):
+                    setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
         return self.parser_engine.build_parser(rules, callback, self.options.start)
 
 
@@ -133,6 +165,16 @@ class Lark:
 
     def parse(self, text):
         assert not self.options.only_lex
-        l = list(self.lex(text))
-        return self.parser.parse(l)
+
+        if self.profiler:
+            self.profiler.enter_section('lex')
+            l = list(self.lex(text))
+            self.profiler.enter_section('parse')
+            try:
+                return self.parser.parse(l)
+            finally:
+                self.profiler.enter_section('outside_lark')
+        else:
+            l = list(self.lex(text))
+            return self.parser.parse(l)
 
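A hedged sketch of how the new profile option might be exercised, assuming Lark forwards its keyword arguments into LarkOptions (the constructor signature is not part of this diff). Profiler.total_time is a defaultdict of cumulative wall-clock seconds keyed by section name; the sections used in this patch are '__init__', 'lex', 'parse', 'transformer', and 'outside_lark'.

    from lark import Lark

    grammar = """start: A A A
                 A: "a"
              """

    # Assumption: options such as profile=True reach LarkOptions via **kwargs.
    parser = Lark(grammar, profile=True)
    parser.parse("aaa")

    # The profiler accumulates seconds per section.
    for section, seconds in parser.profiler.total_time.items():
        print("%-12s %.6f" % (section, seconds))

Wrapping every transformer callback with make_wrapper in _build_parser is what attributes tree-building time to the 'transformer' section, while enter_section calls around lex and parse split off the rest.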
diff --git a/lark/lexer.py b/lark/lexer.py
index 2691458..1fc6a1c 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -1,5 +1,7 @@
 ## Lexer Implementation
 
+import re
+
 from .utils import Str
 
 class LexError(Exception):
@@ -13,13 +15,6 @@ class Token(Str):
         inst.value = value
         return inst
 
-# class Token(object):
-#     def __init__(self, type, value, lexpos):
-#         self.type = type
-#         self.value = value
-#         self.lexpos = lexpos
-
-
     def __repr__(self):
         return 'Token(%s, %s)' % (self.type, self.value)
 
@@ -29,12 +24,11 @@ class Regex:
         self.flags = flags
 
 
-import re
-LIMIT = 50  # Stupid named groups limit in python re
 class Lexer(object):
     def __init__(self, tokens, callbacks, ignore=()):
         self.ignore = ignore
         self.newline_char = '\n'
+        tokens = list(tokens)
 
         # Sanitization
         token_names = {t[0] for t in tokens}
@@ -49,42 +43,57 @@ class Lexer(object):
         self.tokens = tokens
         self.callbacks = callbacks
 
-        # self.tokens.sort(key=lambda x:len(x[1]), reverse=True)
+        self.token_types = list(token_names)
+        self.type_index = {name:i for i,name in enumerate(self.token_types)}
+
+        self.newline_types = [self.type_index[t[0]] for t in tokens if '\n' in t[1] or '\\n' in t[1]]
+        self.ignore_types = [self.type_index[t] for t in ignore]
 
-        self.mres = []
-        self.name_from_index = []
-        x = list(tokens)
-        while x:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in x[:LIMIT]))
-            self.mres.append(mre)
-            self.name_from_index.append( {i:n for n,i in mre.groupindex.items()} )
-            x = x[LIMIT:]
+        self.mres = self._build_mres(tokens, len(tokens))
+
+
+    def _build_mres(self, tokens, max_size):
+        # Python sets an unreasonable group limit (currently 100) in its re module.
+        # Worse, the only way to know we reached it is by catching an AssertionError!
+        # This function recursively tries fewer and fewer groups until it succeeds.
+        mres = []
+        while tokens:
+            try:
+                mre = re.compile(u'|'.join(u'(?P<%s>%s)'%t for t in tokens[:max_size]))
+            except AssertionError:  # Yes, this is what Python provides us.. :/
+                return self._build_mres(tokens, max_size//2)
+
+            mres.append((mre, {i:self.type_index[n] for n,i in mre.groupindex.items()} ))
+            tokens = tokens[max_size:]
+        return mres
 
     def lex(self, stream):
         lex_pos = 0
         line = 1
         col_start_pos = 0
+        newline_types = list(self.newline_types)
+        ignore_types = list(self.ignore_types)
         while True:
-            i = 0
-            for mre in self.mres:
+            for mre, type_from_index in self.mres:
                 m = mre.match(stream, lex_pos)
                 if m:
                     value = m.group(0)
-                    type_ = self.name_from_index[i][m.lastindex]
-                    if type_ not in self.ignore:
-                        t = Token(type_, value, lex_pos)
+                    type_num = type_from_index[m.lastindex]
+                    if type_num not in ignore_types:
+                        t = Token(self.token_types[type_num], value, lex_pos)
                         t.line = line
                         t.column = lex_pos - col_start_pos
                         if t.type in self.callbacks:
                             t = self.callbacks[t.type](t)
                         yield t
-                    newlines = value.count(self.newline_char)
-                    if newlines:
-                        line += newlines
-                        col_start_pos = lex_pos + value.rindex(self.newline_char)
+
+                    if type_num in newline_types:
+                        newlines = value.count(self.newline_char)
+                        if newlines:
+                            line += newlines
+                            col_start_pos = lex_pos + value.rindex(self.newline_char)
                     lex_pos += len(value)
                     break
-                i += 1
             else:
                 if lex_pos < len(stream):
                     context = stream[lex_pos:lex_pos+5]
diff --git a/lark/tests/test_parser.py b/lark/tests/test_parser.py
index b4ba738..22d28d2 100644
--- a/lark/tests/test_parser.py
+++ b/lark/tests/test_parser.py
@@ -334,6 +334,13 @@ def _make_parser_test(PARSER):
         x = g.parse('a')
         self.assertEqual(x.data, "b")
 
+    def test_lexer_token_limit(self):
+        "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation."
+        tokens = {'A%d'%i:'"%d"'%i for i in range(300)}
+        g = _Lark("""start: %s
+                  %s""" % (' '.join(tokens), '\n'.join("%s: %s"%x for x in tokens.items())))
+
+
 _NAME = "Test" + PARSER.capitalize()
 _TestParser.__name__ = _NAME
 globals()[_NAME] = _TestParser
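For readers unfamiliar with the group-limit workaround in _build_mres above, here is a standalone sketch of the same halving strategy outside of Lark. The 100-group cap and the AssertionError are behaviour of older Python versions, as the comment in the patch notes; newer interpreters accept larger patterns, so this demo also catches re.error to stay safe across versions.

    import re

    def build_mres(patterns, max_size):
        # patterns: list of (name, regex) pairs, like the token list in Lexer._build_mres.
        mres = []
        while patterns:
            try:
                mre = re.compile(u'|'.join(u'(?P<%s>%s)' % p for p in patterns[:max_size]))
            except (AssertionError, re.error):
                # Too many (named) groups for this Python - retry with smaller batches.
                return build_mres(patterns, max_size // 2)
            mres.append(mre)
            patterns = patterns[max_size:]
        return mres

    # 300 single-character tokens, mirroring test_lexer_token_limit above.
    patterns = [('A%d' % i, str(i % 10)) for i in range(300)]
    print(len(build_mres(patterns, len(patterns))))  # 1 regex on new Pythons, several on old ones

Halving max_size on failure, rather than probing for the exact limit, keeps the number of compiled regexes small while converging in only a few retries.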