@@ -154,10 +154,10 @@ These features may be implemented some day: | |||||
| Code | CPython Time | PyPy Time | CPython Mem | PyPy Mem | | Code | CPython Time | PyPy Time | CPython Mem | PyPy Mem | ||||
|:-----|:-------------|:------------|:----------|:--------- | |:-----|:-------------|:------------|:----------|:--------- | ||||
| **Lark - LALR(1)** | 4.2s | 1.1s | 0.4M | 0.3M | | |||||
| PyParsing | 32s | 4.1s | 0.4M | 0.2M | | |||||
| funcparserlib | 11s | 1.9s | 0.5M | 0.3M | | |||||
| Parsimonious | | 7s | | 1.4M | | |||||
| **Lark - LALR(1)** | 4.7s | 1.2s | 70M | 134M | | |||||
| PyParsing | 32s | 3.5s | 443M | 225M | | |||||
| funcparserlib | 8.5s | 1.3s | 483M | 293M | | |||||
| Parsimonious | | 5.7s | | 1545M | | |||||
Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more details on how the comparison was made. | Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more details on how the comparison was made. | ||||
@@ -327,7 +327,7 @@ class TreeToJson(Transformer): | |||||
true = lambda self, _: True | true = lambda self, _: True | ||||
false = lambda self, _: False | false = lambda self, _: False | ||||
json_parser = Lark(json_grammar, start='value') | |||||
json_parser = Lark(json_grammar, start='value', lexer='standard') | |||||
if __name__ == '__main__': | if __name__ == '__main__': | ||||
with open(sys.argv[1]) as f: | with open(sys.argv[1]) as f: | ||||
@@ -419,12 +419,13 @@ I measured memory consumption using a little script called [memusg](https://gist | |||||
| Code | CPython Time | PyPy Time | CPython Mem | PyPy Mem | | Code | CPython Time | PyPy Time | CPython Mem | PyPy Mem | ||||
|:-----|:-------------|:------------|:----------|:--------- | |:-----|:-------------|:------------|:----------|:--------- | ||||
| Lark - Earley | 36s | 4.3s | 6.2M | 1.2M | | |||||
| Lark - LALR(1) | 7s | 1.3s | 0.6M | 0.3M | | |||||
| Lark - LALR(1) tree-less | 4.2s | 1.1s | 0.4M | 0.3M | | |||||
| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 4.1s | 0.4M | 0.2M | | |||||
| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 11s | 1.9s | 0.5M | 0.3M | | |||||
| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 7s | ? | 1.4M | | |||||
| Lark - Earley *(with lexer)* | 42s | 4s | 1167M | 608M | | |||||
| Lark - LALR(1) | 8s | 1.53s | 453M | 266M | | |||||
| Lark - LALR(1) tree-less | 4.76s | 1.23s | 70M | 134M | | |||||
| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 3.53s | 443M | 225M | | |||||
| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 8.5s | 1.3s | 483M | 293M | | |||||
| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 5.7s | ? | 1545M | | |||||
I added a few other parsers for comparison. PyParsing and funcparserlib fare pretty well in their memory usage (they don't build a tree), but they can't compete with the run-time speed of LALR(1). | I added a few other parsers for comparison. PyParsing and funcparserlib fare pretty well in their memory usage (they don't build a tree), but they can't compete with the run-time speed of LALR(1). | ||||
@@ -47,7 +47,7 @@ class TreeToJson(Transformer): | |||||
true = lambda self, _: True | true = lambda self, _: True | ||||
false = lambda self, _: False | false = lambda self, _: False | ||||
# json_parser = Lark(json_grammar, parser='earley') | |||||
# json_parser = Lark(json_grammar, parser='earley', lexer='standard') | |||||
# def parse(x): | # def parse(x): | ||||
# return TreeToJson().transform(json_parser.parse(x)) | # return TreeToJson().transform(json_parser.parse(x)) | ||||
@@ -72,7 +72,7 @@ def test(): | |||||
assert j == json.loads(test_json) | assert j == json.loads(test_json) | ||||
if __name__ == '__main__': | if __name__ == '__main__': | ||||
test() | |||||
# test() | |||||
with open(sys.argv[1]) as f: | with open(sys.argv[1]) as f: | ||||
print(parse(f.read())) | print(parse(f.read())) | ||||
@@ -3,4 +3,4 @@ from .common import ParseError, GrammarError | |||||
from .lark import Lark | from .lark import Lark | ||||
from .utils import inline_args | from .utils import inline_args | ||||
__version__ = "0.2.2" | |||||
__version__ = "0.2.3" |
@@ -324,7 +324,7 @@ class TokenTreeToPattern(Transformer): | |||||
def expansion(self, items): | def expansion(self, items): | ||||
if len(items) == 1: | if len(items) == 1: | ||||
return items[0] | return items[0] | ||||
if len(set(i.flags for i in items)) > 1: | |||||
if len({i.flags for i in items}) > 1: | |||||
raise GrammarError("Lark doesn't support joining tokens with conflicting flags!") | raise GrammarError("Lark doesn't support joining tokens with conflicting flags!") | ||||
return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags) | return PatternRE(''.join(i.to_regexp() for i in items), items[0].flags) | ||||
@@ -348,60 +348,64 @@ def _interleave(l, item): | |||||
elif is_terminal(e): | elif is_terminal(e): | ||||
yield item | yield item | ||||
def _choice_of_rules(rules): | |||||
return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules]) | |||||
class Grammar: | class Grammar: | ||||
def __init__(self, rule_defs, token_defs, extra): | def __init__(self, rule_defs, token_defs, extra): | ||||
self.token_defs = token_defs | self.token_defs = token_defs | ||||
self.rule_defs = rule_defs | self.rule_defs = rule_defs | ||||
self.extra = extra | self.extra = extra | ||||
def compile(self, lexer=False, start=None): | |||||
if not lexer: | |||||
rule_defs = deepcopy(self.rule_defs) | |||||
# XXX VERY HACKY!! There must be a better way.. | |||||
ignore_tokens = [('_'+name, t) for name, t in self.token_defs if name in self.extra['ignore']] | |||||
if ignore_tokens: | |||||
self.token_defs = [('_'+name if name in self.extra['ignore'] else name,t) for name,t in self.token_defs] | |||||
ignore_names = [t[0] for t in ignore_tokens] | |||||
expr = Token('RULE', '__ignore') | |||||
for r, tree, _o in rule_defs: | |||||
for exp in tree.find_data('expansion'): | |||||
exp.children = list(_interleave(exp.children, expr)) | |||||
if r == start: | |||||
exp.children = [expr] + exp.children | |||||
for exp in tree.find_data('expr'): | |||||
exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr))) | |||||
x = [T('expansion', [Token('RULE', x)]) for x in ignore_names] | |||||
_ignore_tree = T('expr', [T('expansions', x), Token('OP', '?')]) | |||||
rule_defs.append(('__ignore', _ignore_tree, None)) | |||||
# End of "ignore" section | |||||
rule_defs += [(name, tree, RuleOptions(keep_all_tokens=True)) for name, tree in self.token_defs] | |||||
token_defs = [] | |||||
tokens_to_convert = {name: '__token_'+name for name, tree, _ in rule_defs if is_terminal(name)} | |||||
new_rule_defs = [] | |||||
for name, tree, options in rule_defs: | |||||
if name in tokens_to_convert: | |||||
if name.startswith('_'): | |||||
options = RuleOptions.new_from(options, filter_out=True) | |||||
else: | |||||
options = RuleOptions.new_from(options, create_token=name) | |||||
name = tokens_to_convert[name] | |||||
inner = Token('RULE', name + '_inner') | |||||
new_rule_defs.append((name, T('expansions', [T('expansion', [inner])]), None)) | |||||
name = inner | |||||
def _prepare_scanless_grammar(self, start): | |||||
# XXX Pretty hacky! There should be a better way to write this method.. | |||||
rule_defs = deepcopy(self.rule_defs) | |||||
term_defs = self.token_defs | |||||
# Implement the "%ignore" feature without a lexer.. | |||||
terms_to_ignore = {name:'__'+name for name in self.extra['ignore']} | |||||
if terms_to_ignore: | |||||
assert set(terms_to_ignore) <= {name for name, t in term_defs} | |||||
term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs] | |||||
expr = Token('RULE', '__ignore') | |||||
for r, tree, _o in rule_defs: | |||||
for exp in tree.find_data('expansion'): | |||||
exp.children = list(_interleave(exp.children, expr)) | |||||
if r == start: | |||||
exp.children = [expr] + exp.children | |||||
for exp in tree.find_data('expr'): | |||||
exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr))) | |||||
_ignore_tree = T('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')]) | |||||
rule_defs.append(('__ignore', _ignore_tree, None)) | |||||
# Convert all tokens to rules | |||||
new_terminal_names = {name: '__token_'+name for name, tree in term_defs} | |||||
for name, tree, options in rule_defs: | |||||
for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ): | |||||
for i, sym in enumerate(exp.children): | |||||
if sym in new_terminal_names: | |||||
exp.children[i] = Token(sym.type, new_terminal_names[sym]) | |||||
for name, tree in term_defs: | |||||
if name.startswith('_'): | |||||
options = RuleOptions(filter_out=True) | |||||
else: | |||||
options = RuleOptions(keep_all_tokens=True, create_token=name) | |||||
else: | |||||
for exp in chain( tree.find_data('expansion'), tree.find_data('expr') ): | |||||
for i, sym in enumerate(exp.children): | |||||
if sym in tokens_to_convert: | |||||
exp.children[i] = Token(sym.type, tokens_to_convert[sym]) | |||||
name = new_terminal_names[name] | |||||
inner_name = name + '_inner' | |||||
rule_defs.append((name, _choice_of_rules([inner_name]), None)) | |||||
rule_defs.append((inner_name, tree, options)) | |||||
new_rule_defs.append((name, tree, options)) | |||||
return [], rule_defs | |||||
rule_defs = new_rule_defs | |||||
def compile(self, lexer=False, start=None): | |||||
if not lexer: | |||||
token_defs, rule_defs = self._prepare_scanless_grammar(start) | |||||
else: | else: | ||||
token_defs = list(self.token_defs) | token_defs = list(self.token_defs) | ||||
rule_defs = self.rule_defs | rule_defs = self.rule_defs | ||||
@@ -473,14 +477,6 @@ class RuleOptions: | |||||
self.filter_out = filter_out # remove this rule from the tree | self.filter_out = filter_out # remove this rule from the tree | ||||
# used for "token"-rules in scanless | # used for "token"-rules in scanless | ||||
@classmethod | |||||
def new_from(cls, options, **kw): | |||||
return cls( | |||||
keep_all_tokens=options and options.keep_all_tokens, | |||||
expand1=options and options.expand1, | |||||
**kw) | |||||
@classmethod | @classmethod | ||||
def from_rule(cls, name, expansions): | def from_rule(cls, name, expansions): | ||||
keep_all_tokens = name.startswith('!') | keep_all_tokens = name.startswith('!') | ||||
@@ -27,7 +27,7 @@ class LALR(WithLexer): | |||||
self.parser = lalr_parser.Parser(parser_conf) | self.parser = lalr_parser.Parser(parser_conf) | ||||
def parse(self, text): | def parse(self, text): | ||||
tokens = list(self.lex(text)) | |||||
tokens = self.lex(text) | |||||
return self.parser.parse(tokens) | return self.parser.parse(tokens) | ||||
class LALR_ContextualLexer: | class LALR_ContextualLexer: | ||||
@@ -160,7 +160,7 @@ class Earley(WithLexer): | |||||
return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] | return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] | ||||
def parse(self, text): | def parse(self, text): | ||||
tokens = list(self.lex(text)) | |||||
tokens = self.lex(text) | |||||
return self.parser.parse(tokens) | return self.parser.parse(tokens) | ||||
def get_frontend(parser, lexer): | def get_frontend(parser, lexer): | ||||
@@ -98,7 +98,7 @@ class Column: | |||||
for item in items: | for item in items: | ||||
if item.is_complete: | if item.is_complete: | ||||
# XXX TODO Potential bug: What happens if there's ambiguity in an empty rule? | |||||
# XXX Potential bug: What happens if there's ambiguity in an empty rule? | |||||
if item.rule.expansion and item in self.completed: | if item.rule.expansion and item in self.completed: | ||||
old_tree = self.completed[item].tree | old_tree = self.completed[item].tree | ||||
if old_tree.data != 'ambig': | if old_tree.data != 'ambig': | ||||
@@ -110,7 +110,7 @@ class Column: | |||||
old_tree.children.append(item.tree) | old_tree.children.append(item.tree) | ||||
else: | else: | ||||
self.completed[item] = item | self.completed[item] = item | ||||
self.to_reduce.append(item) | |||||
self.to_reduce.append(item) | |||||
else: | else: | ||||
if item not in added: | if item not in added: | ||||
added.add(item) | added.add(item) | ||||
@@ -168,7 +168,7 @@ class Parser: | |||||
to_scan = cur_set.to_scan.get_news() | to_scan = cur_set.to_scan.get_news() | ||||
for item in to_scan: | for item in to_scan: | ||||
if item.expect.match(token): | if item.expect.match(token): | ||||
next_set.add([item.advance(stream[i])]) | |||||
next_set.add([item.advance(token)]) | |||||
if not next_set and token is not END_TOKEN: | if not next_set and token is not END_TOKEN: | ||||
expect = {i.expect for i in cur_set.to_scan} | expect = {i.expect for i in cur_set.to_scan} | ||||
@@ -181,10 +181,12 @@ class Parser: | |||||
column0.add(predict(start, column0)) | column0.add(predict(start, column0)) | ||||
cur_set = column0 | cur_set = column0 | ||||
for i, char in enumerate(stream): | |||||
_, cur_set = process_column(i, char, cur_set) | |||||
i = 0 | |||||
for token in stream: | |||||
_, cur_set = process_column(i, token, cur_set) | |||||
i += 1 | |||||
last_set, _ = process_column(len(stream), END_TOKEN, cur_set) | |||||
last_set, _ = process_column(i, END_TOKEN, cur_set) | |||||
# Parse ended. Now build a parse tree | # Parse ended. Now build a parse tree | ||||
solutions = [n.tree for n in last_set.to_reduce | solutions = [n.tree for n in last_set.to_reduce | ||||
@@ -19,6 +19,7 @@ logging.basicConfig(level=logging.INFO) | |||||
from lark.lark import Lark | from lark.lark import Lark | ||||
from lark.common import GrammarError, ParseError | from lark.common import GrammarError, ParseError | ||||
from lark.lexer import LexError | from lark.lexer import LexError | ||||
from lark.tree import Tree | |||||
__path__ = os.path.dirname(__file__) | __path__ = os.path.dirname(__file__) | ||||
def _read(n, *args): | def _read(n, *args): | ||||
@@ -104,6 +105,21 @@ class TestEarley(unittest.TestCase): | |||||
res = l.parse("aaa") | res = l.parse("aaa") | ||||
self.assertEqual(res.children, ['aaa']) | self.assertEqual(res.children, ['aaa']) | ||||
def test_earley_repeating_empty(self): | |||||
# This was a sneaky bug! | |||||
grammar = """ | |||||
!start: "a" empty empty "b" | |||||
empty: empty2 | |||||
empty2: | |||||
""" | |||||
parser = Lark(grammar, parser='earley', lexer=None) | |||||
res = parser.parse('ab') | |||||
empty_tree = Tree('empty', [Tree('empty2', [])]) | |||||
self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b']) | |||||
def _make_parser_test(LEXER, PARSER): | def _make_parser_test(LEXER, PARSER): | ||||
def _Lark(grammar, **kwargs): | def _Lark(grammar, **kwargs): | ||||
return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs) | return Lark(grammar, lexer=LEXER, parser=PARSER, **kwargs) | ||||