diff --git a/README.md b/README.md
index 46df014..062828e 100644
--- a/README.md
+++ b/README.md
@@ -154,10 +154,10 @@ These features may be implemented some day:
 
 | Code | CPython Time | PyPy Time | CPython Mem | PyPy Mem
 |:-----|:-------------|:------------|:----------|:---------
-| **Lark - LALR(1)** | 4.2s | 1.1s | 0.4M | 0.3M |
-| PyParsing | 32s | 4.1s | 0.4M | 0.2M |
-| funcparserlib | 11s | 1.9s | 0.5M | 0.3M |
-| Parsimonious | | 7s | | 1.4M |
+| **Lark - LALR(1)** | 4.7s | 1.2s | 70M | 134M |
+| PyParsing | 32s | 3.5s | 443M | 225M |
+| funcparserlib | 8.5s | 1.3s | 483M | 293M |
+| Parsimonious | | 5.7s | | 1545M |
 
 Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more details on how the comparison was made.
diff --git a/docs/json_tutorial.md b/docs/json_tutorial.md
index c83d6c7..9f3fbf1 100644
--- a/docs/json_tutorial.md
+++ b/docs/json_tutorial.md
@@ -327,7 +327,7 @@ class TreeToJson(Transformer):
     true = lambda self, _: True
     false = lambda self, _: False
 
-json_parser = Lark(json_grammar, start='value')
+json_parser = Lark(json_grammar, start='value', lexer='standard')
 
 if __name__ == '__main__':
     with open(sys.argv[1]) as f:
@@ -419,12 +419,13 @@ I measured memory consumption using a little script called [memusg](https://gist
 
 | Code | CPython Time | PyPy Time | CPython Mem | PyPy Mem
 |:-----|:-------------|:------------|:----------|:---------
-| Lark - Earley | 36s | 4.3s | 6.2M | 1.2M |
-| Lark - LALR(1) | 7s | 1.3s | 0.6M | 0.3M |
-| Lark - LALR(1) tree-less | 4.2s | 1.1s | 0.4M | 0.3M |
-| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 4.1s | 0.4M | 0.2M |
-| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 11s | 1.9s | 0.5M | 0.3M |
-| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 7s | ? | 1.4M |
+| Lark - Earley *(with lexer)* | 42s | 4s | 1167M | 608M |
+| Lark - LALR(1) | 8s | 1.53s | 453M | 266M |
+| Lark - LALR(1) tree-less | 4.76s | 1.23s | 70M | 134M |
+| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 3.53s | 443M | 225M |
+| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 8.5s | 1.3s | 483M | 293M |
+| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 5.7s | ? | 1545M |
+
 I added a few other parsers for comparison. PyParsing and funcparserlib fare pretty well in their memory usage (they don't build a tree), but they can't compete with the run-time speed of LALR(1).
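For context, the two Lark rows in the updated table correspond roughly to the setups below, reusing the `json_grammar` and `TreeToJson` transformer defined in the tutorial. This is only a sketch: the `parser='lalr'` / `transformer=` combination for the tree-less row is assumed from the tutorial's conclusion rather than shown in this diff.

```python
from lark import Lark

# json_grammar and TreeToJson are assumed to be the ones from docs/json_tutorial.md.

# "Lark - Earley (with lexer)": Earley parsing over a pre-tokenized input,
# which is what the newly added lexer='standard' argument selects.
earley_parser = Lark(json_grammar, start='value', lexer='standard')

# "Lark - LALR(1) tree-less" (assumed configuration): the transformer runs
# during parsing, so no full parse tree is ever held in memory.
treeless_parser = Lark(json_grammar, start='value', parser='lalr',
                       transformer=TreeToJson())
```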
diff --git a/examples/json_parser.py b/examples/json_parser.py
index a67ae4b..4f5feaf 100644
--- a/examples/json_parser.py
+++ b/examples/json_parser.py
@@ -47,7 +47,7 @@ class TreeToJson(Transformer):
     true = lambda self, _: True
     false = lambda self, _: False
 
-# json_parser = Lark(json_grammar, parser='earley')
+# json_parser = Lark(json_grammar, parser='earley', lexer='standard')
 # def parse(x):
 #     return TreeToJson().transform(json_parser.parse(x))
 
@@ -72,7 +72,7 @@ def test():
     assert j == json.loads(test_json)
 
 if __name__ == '__main__':
-    test()
+    # test()
     with open(sys.argv[1]) as f:
         print(parse(f.read()))
diff --git a/lark/__init__.py b/lark/__init__.py
index ea10a1e..1f60bc2 100644
--- a/lark/__init__.py
+++ b/lark/__init__.py
@@ -3,4 +3,4 @@ from .common import ParseError, GrammarError
 from .lark import Lark
 from .utils import inline_args
 
-__version__ = "0.2.2"
+__version__ = "0.2.3"
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index dc4cacb..4b8e7fe 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -324,7 +324,7 @@ class TokenTreeToPattern(Transformer):
     def expansion(self, items):
         if len(items) == 1:
             return items[0]
-        if len(set(i.flags for i in items)) > 1:
+        if len({i.flags for i in items}) > 1:
             raise GrammarError("Lark doesn't support joining tokens with conflicting flags!")
         return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags)
 
@@ -348,60 +348,64 @@ def _interleave(l, item):
         elif is_terminal(e):
             yield item
 
+def _choice_of_rules(rules):
+    return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])
+
 class Grammar:
     def __init__(self, rule_defs, token_defs, extra):
         self.token_defs = token_defs
         self.rule_defs = rule_defs
         self.extra = extra
 
-    def compile(self, lexer=False, start=None):
-        if not lexer:
-            rule_defs = deepcopy(self.rule_defs)
-
-            # XXX VERY HACKY!! There must be a better way..
-            ignore_tokens = [('_'+name, t) for name, t in self.token_defs if name in self.extra['ignore']]
-            if ignore_tokens:
-                self.token_defs = [('_'+name if name in self.extra['ignore'] else name,t) for name,t in self.token_defs]
-                ignore_names = [t[0] for t in ignore_tokens]
-                expr = Token('RULE', '__ignore')
-                for r, tree, _o in rule_defs:
-                    for exp in tree.find_data('expansion'):
-                        exp.children = list(_interleave(exp.children, expr))
-                        if r == start:
-                            exp.children = [expr] + exp.children
-                    for exp in tree.find_data('expr'):
-                        exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr)))
-
-                x = [T('expansion', [Token('RULE', x)]) for x in ignore_names]
-                _ignore_tree = T('expr', [T('expansions', x), Token('OP', '?')])
-                rule_defs.append(('__ignore', _ignore_tree, None))
-            # End of "ignore" section
-
-            rule_defs += [(name, tree, RuleOptions(keep_all_tokens=True)) for name, tree in self.token_defs]
-            token_defs = []
-
-            tokens_to_convert = {name: '__token_'+name for name, tree, _ in rule_defs if is_terminal(name)}
-            new_rule_defs = []
-            for name, tree, options in rule_defs:
-                if name in tokens_to_convert:
-                    if name.startswith('_'):
-                        options = RuleOptions.new_from(options, filter_out=True)
-                    else:
-                        options = RuleOptions.new_from(options, create_token=name)
-                    name = tokens_to_convert[name]
-                    inner = Token('RULE', name + '_inner')
-                    new_rule_defs.append((name, T('expansions', [T('expansion', [inner])]), None))
-                    name = inner
+    def _prepare_scanless_grammar(self, start):
+        # XXX Pretty hacky! There should be a better way to write this method..
+
+        rule_defs = deepcopy(self.rule_defs)
+        term_defs = self.token_defs
+
+        # Implement the "%ignore" feature without a lexer..
+        terms_to_ignore = {name:'__'+name for name in self.extra['ignore']}
+        if terms_to_ignore:
+            assert set(terms_to_ignore) <= {name for name, t in term_defs}
+            term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs]
+            expr = Token('RULE', '__ignore')
+            for r, tree, _o in rule_defs:
+                for exp in tree.find_data('expansion'):
+                    exp.children = list(_interleave(exp.children, expr))
+                    if r == start:
+                        exp.children = [expr] + exp.children
+                for exp in tree.find_data('expr'):
+                    exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr)))
+
+            _ignore_tree = T('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
+            rule_defs.append(('__ignore', _ignore_tree, None))
+
+        # Convert all tokens to rules
+        new_terminal_names = {name: '__token_'+name for name, tree in term_defs}
+
+        for name, tree, options in rule_defs:
+            for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
+                for i, sym in enumerate(exp.children):
+                    if sym in new_terminal_names:
+                        exp.children[i] = Token(sym.type, new_terminal_names[sym])
+
+        for name, tree in term_defs:
+            if name.startswith('_'):
+                options = RuleOptions(filter_out=True)
+            else:
+                options = RuleOptions(keep_all_tokens=True, create_token=name)
-                else:
-                    for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ):
-                        for i, sym in enumerate(exp.children):
-                            if sym in tokens_to_convert:
-                                exp.children[i] = Token(sym.type, tokens_to_convert[sym])
+            name = new_terminal_names[name]
+            inner_name = name + '_inner'
+            rule_defs.append((name, _choice_of_rules([inner_name]), None))
+            rule_defs.append((inner_name, tree, options))
-                new_rule_defs.append((name, tree, options))
+        return [], rule_defs
-            rule_defs = new_rule_defs
+
+    def compile(self, lexer=False, start=None):
+        if not lexer:
+            token_defs, rule_defs = self._prepare_scanless_grammar(start)
         else:
             token_defs = list(self.token_defs)
             rule_defs = self.rule_defs
@@ -473,14 +477,6 @@ class RuleOptions:
         self.filter_out = filter_out    # remove this rule from the tree
                                         # used for "token"-rules in scanless
-
-    @classmethod
-    def new_from(cls, options, **kw):
-        return cls(
-            keep_all_tokens=options and options.keep_all_tokens,
-            expand1=options and options.expand1,
-            **kw)
-
     @classmethod
     def from_rule(cls, name, expansions):
         keep_all_tokens = name.startswith('!')
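The new `_choice_of_rules` helper is used twice in the scanless preparation: once for the optional `__ignore` rule and once to wrap each converted terminal. It simply builds the grammar-tree form of a plain alternation. A rough standalone rendition is below; `T` is load_grammar.py's alias for `Tree`, and the `__WS` / `__COMMENT` names are made up for the example.

```python
from lark.tree import Tree as T
from lark.lexer import Token

def _choice_of_rules(rules):
    # "rule_a | rule_b | ..." expressed as a grammar tree
    return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])

# e.g. the alternation of ignored terminals that becomes the optional __ignore rule
print(_choice_of_rules(['__WS', '__COMMENT']))
```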
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 7043dbc..0b36a75 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -27,7 +27,7 @@ class LALR(WithLexer):
         self.parser = lalr_parser.Parser(parser_conf)
 
     def parse(self, text):
-        tokens = list(self.lex(text))
+        tokens = self.lex(text)
         return self.parser.parse(tokens)
 
 class LALR_ContextualLexer:
@@ -160,7 +160,7 @@ class Earley(WithLexer):
         return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]
 
     def parse(self, text):
-        tokens = list(self.lex(text))
+        tokens = self.lex(text)
        return self.parser.parse(tokens)
 
 def get_frontend(parser, lexer):
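Both frontends now pass the lexer's output straight to the parser instead of materializing it with `list()`, so tokens are produced and consumed lazily. The idea in isolation, as a minimal sketch in plain Python (not lark's actual classes):

```python
def lex(text):
    # stand-in lexer: yields tokens one at a time instead of building a list
    for word in text.split():
        yield word

def parse(tokens):
    # stand-in parser: pulls tokens on demand from whatever iterable it is given
    return [tok.upper() for tok in tokens]

tokens = lex("a b c")    # note: no list(...) around the generator
print(parse(tokens))     # ['A', 'B', 'C']
```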
diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py
index 49f9ed7..95061e1 100644
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -98,7 +98,7 @@ class Column:
 
         for item in items:
             if item.is_complete:
-                # XXX TODO Potential bug: What happens if there's ambiguity in an empty rule?
+                # XXX Potential bug: What happens if there's ambiguity in an empty rule?
                 if item.rule.expansion and item in self.completed:
                     old_tree = self.completed[item].tree
                     if old_tree.data != 'ambig':
@@ -110,7 +110,7 @@ class Column:
                     old_tree.children.append(item.tree)
                 else:
                     self.completed[item] = item
-                    self.to_reduce.append(item)
+                self.to_reduce.append(item)
             else:
                 if item not in added:
                     added.add(item)
@@ -168,7 +168,7 @@ class Parser:
             to_scan = cur_set.to_scan.get_news()
             for item in to_scan:
                 if item.expect.match(token):
-                    next_set.add([item.advance(stream[i])])
+                    next_set.add([item.advance(token)])
 
             if not next_set and token is not END_TOKEN:
                 expect = {i.expect for i in cur_set.to_scan}
@@ -181,10 +181,12 @@ class Parser:
         column0.add(predict(start, column0))
 
         cur_set = column0
-        for i, char in enumerate(stream):
-            _, cur_set = process_column(i, char, cur_set)
+        i = 0
+        for token in stream:
+            _, cur_set = process_column(i, token, cur_set)
+            i += 1
 
-        last_set, _ = process_column(len(stream), END_TOKEN, cur_set)
+        last_set, _ = process_column(i, END_TOKEN, cur_set)
 
         # Parse ended. Now build a parse tree
         solutions = [n.tree for n in last_set.to_reduce
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 83085e2..3e3ee14 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -19,6 +19,7 @@ logging.basicConfig(level=logging.INFO)
 from lark.lark import Lark
 from lark.common import GrammarError, ParseError
 from lark.lexer import LexError
+from lark.tree import Tree
 
 __path__ = os.path.dirname(__file__)
 def _read(n, *args):
@@ -104,6 +105,21 @@ class TestEarley(unittest.TestCase):
         res = l.parse("aaa")
         self.assertEqual(res.children, ['aaa'])
 
+    def test_earley_repeating_empty(self):
+        # This was a sneaky bug!
+
+        grammar = """
+        !start: "a" empty empty "b"
+        empty: empty2
+        empty2:
+        """
+
+        parser = Lark(grammar, parser='earley', lexer=None)
+        res = parser.parse('ab')
+
+        empty_tree = Tree('empty', [Tree('empty2', [])])
+        self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])
+
 def _make_parser_test(LEXER, PARSER):
     def _Lark(grammar, **kwargs):
         return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs)
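The added test pins down the scenario the "sneaky bug" comment refers to: a rule that expands to nothing, used twice in a row. Run standalone it looks roughly like this (same grammar and expected tree as the test; `lexer=None` selects scanless mode in this version):

```python
from lark import Lark
from lark.tree import Tree

grammar = """
!start: "a" empty empty "b"
empty: empty2
empty2:
"""

parser = Lark(grammar, parser='earley', lexer=None)
res = parser.parse('ab')

# Both zero-width `empty` subtrees should show up between "a" and "b"
empty_tree = Tree('empty', [Tree('empty2', [])])
assert list(res.children) == ['a', empty_tree, empty_tree, 'b']
print(res)
```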