From c93140386d3e9cd16cad3f6a281497502aeac108 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 31 Oct 2017 10:52:00 +0200 Subject: [PATCH 1/3] BUGFIX: The Earley parser collected repeating derivations, and filtered them out much later than needed. This resulted in very long lists and many unnecessary comparisons. --- lark/lark.py | 5 ---- lark/parser_frontends.py | 6 ++--- lark/parsers/earley.py | 52 +++++++++++++++++++--------------------- lark/parsers/xearley.py | 9 +++++-- lark/tree.py | 3 +++ 5 files changed, 37 insertions(+), 38 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 1899229..b8c8efe 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -32,10 +32,6 @@ class LarkOptions(object): (it chooses consistently: greedy for tokens, non-greedy for rules) "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest). - earley__all_derivations - If True, try every possible derivation of each rule. If False, pick the first - correct derivation. Both will find a solution to every correct grammar & input, - but when False, some ambiguities won't appear (Default: True) - transformer - Applies the transformer to every parse tree debug - Affects verbosity (default: False) keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) @@ -61,7 +57,6 @@ class LarkOptions(object): self.profile = o.pop('profile', False) self.ambiguity = o.pop('ambiguity', 'auto') self.propagate_positions = o.pop('propagate_positions', False) - self.earley__all_derivations = o.pop('earley__all_derivations', True) assert self.parser in ('earley', 'lalr', None) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index c46352f..18264ce 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -77,8 +77,7 @@ class Earley_NoLex: self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback, - resolve_ambiguity=get_ambiguity_resolver(options), - all_derivations = options.earley__all_derivations if options else True) + resolve_ambiguity=get_ambiguity_resolver(options)) def _prepare_expansion(self, expansion): for sym in expansion: @@ -104,8 +103,7 @@ class Earley(WithLexer): self.parser = earley.Parser(rules, parser_conf.start, parser_conf.callback, - resolve_ambiguity=get_ambiguity_resolver(options), - all_derivations = options.earley__all_derivations if options else True) + resolve_ambiguity=get_ambiguity_resolver(options)) def _prepare_expansion(self, expansion): return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 7370a48..a4b469e 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -64,23 +64,16 @@ class Item(object): return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule def __eq__(self, other): - return self.similar(other) and (self.tree == other.tree) + return self.similar(other) #and (self.tree == other.tree) def __hash__(self): - return hash((self.rule, self.ptr, id(self.start), self.tree)) # Always runs Derivation.__hash__ + return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ def __repr__(self): before = list(map(str, self.rule.expansion[:self.ptr])) after = list(map(str, self.rule.expansion[self.ptr:])) return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after)) -class Item_JoinDerivations(Item): - __eq__ = Item.similar - - def __hash__(self): - return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ - - class NewsList(list): "Keeps track of newly added items (append-only)" @@ -97,12 +90,13 @@ class NewsList(list): class Column: "An entry in the table, aka Earley Chart. Contains lists of items." - def __init__(self, i): + def __init__(self, i, FIRST): self.i = i self.to_reduce = NewsList() self.to_predict = NewsList() self.to_scan = NewsList() self.item_count = 0 + self.FIRST = FIRST self.added = set() self.completed = {} @@ -112,13 +106,17 @@ class Column: Makes sure only unique items are added. """ - for item in items: if item.is_complete: # XXX Potential bug: What happens if there's ambiguity in an empty rule? if item.rule.expansion and item in self.completed: old_tree = self.completed[item].tree + if old_tree == item.tree: + is_empty = len(self.FIRST[item.rule.origin]) + if is_empty: + continue + if old_tree.data != '_ambig': new_tree = old_tree.copy() new_tree.rule = old_tree.rule @@ -128,16 +126,18 @@ class Column: if item.tree.children[0] is old_tree: # XXX a little hacky! raise ParseError("Infinite recursion in grammar! (Rule %s)" % item.rule) - old_tree.children.append(item.tree) + if item.tree not in old_tree.children: + old_tree.children.append(item.tree) + # old_tree.children.append(item.tree) else: self.completed[item] = item self.to_reduce.append(item) else: - if item not in self.added: - self.added.add(item) - if isinstance(item.expect, Terminal): - self.to_scan.append(item) - else: + if isinstance(item.expect, Terminal): + self.to_scan.append(item) + else: + if item not in self.added: + self.added.add(item) self.to_predict.append(item) self.item_count += 1 # Only count if actually added @@ -146,30 +146,28 @@ class Column: return bool(self.item_count) class Parser: - def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, all_derivations=True): - """ - all_derivations: - True = Try every rule combination, and every possible derivation of each rule. (default) - False = Try every rule combination, but not every derivation of the same rule. - """ + def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None): self.analysis = GrammarAnalyzer(rules, start_symbol) self.start_symbol = start_symbol self.resolve_ambiguity = resolve_ambiguity - self.all_derivations = all_derivations self.postprocess = {} self.predictions = {} + self.FIRST = {} for rule in self.analysis.rules: if rule.origin != '$root': # XXX kinda ugly a = rule.alias self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] + self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] + + def parse(self, stream, start_symbol=None): # Define parser functions start_symbol = start_symbol or self.start_symbol - _Item = Item if self.all_derivations else Item_JoinDerivations + _Item = Item def predict(nonterm, column): assert not isinstance(nonterm, Terminal), nonterm @@ -199,7 +197,7 @@ class Parser: def scan(i, token, column): to_scan = column.to_scan.get_news() - next_set = Column(i) + next_set = Column(i, self.FIRST) next_set.add(item.advance(token) for item in to_scan if item.expect.match(token)) if not next_set: @@ -209,7 +207,7 @@ class Parser: return next_set # Main loop starts - column0 = Column(0) + column0 = Column(0, self.FIRST) column0.add(predict(start_symbol, column0)) column = column0 diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 24e81f0..9f30b23 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -37,12 +37,17 @@ class Parser: self.postprocess = {} self.predictions = {} + self.FIRST = {} + for rule in self.analysis.rules: if rule.origin != '$root': # XXX kinda ugly a = rule.alias self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] + self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] + + def parse(self, stream, start_symbol=None): # Define parser functions start_symbol = start_symbol or self.start_symbol @@ -106,14 +111,14 @@ class Parser: if m: delayed_matches[m.end()].add(item.advance(m.group(0))) - next_set = Column(i+1) + next_set = Column(i+1, self.FIRST) next_set.add(delayed_matches[i+1]) del delayed_matches[i+1] # No longer needed, so unburden memory return next_set # Main loop starts - column0 = Column(0) + column0 = Column(0, self.FIRST) column0.add(predict(start_symbol, column0)) column = column0 diff --git a/lark/tree.py b/lark/tree.py index 290b9a7..ac06556 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -45,6 +45,9 @@ class Tree(object): except AttributeError: return False + def __ne__(self, other): + return not (self == other) + def __hash__(self): return hash((self.data, tuple(self.children))) From 93302d7ceb8dbf28c401d2e5523935715b241fa3 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 31 Oct 2017 12:27:17 +0200 Subject: [PATCH 2/3] Important optimization for fix --- lark/parsers/earley.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index a4b469e..0722fec 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -181,12 +181,13 @@ class Parser: while True: to_predict = {x.expect for x in column.to_predict.get_news() if x.ptr} # if not part of an already predicted batch - to_reduce = column.to_reduce.get_news() + to_reduce = set(column.to_reduce.get_news()) if not (to_predict or to_reduce): break for nonterm in to_predict: column.add( predict(nonterm, column) ) + for item in to_reduce: new_items = list(complete(item)) for new_item in new_items: From 0155d3d956368ed6a3bd6c63c07662d277d98c1e Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 1 Nov 2017 00:22:18 +0200 Subject: [PATCH 3/3] A few more fixes --- lark/parsers/earley.py | 7 ++++--- lark/parsers/xearley.py | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 0722fec..3c04ac1 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -110,8 +110,9 @@ class Column: if item.is_complete: # XXX Potential bug: What happens if there's ambiguity in an empty rule? - if item.rule.expansion and item in self.completed: - old_tree = self.completed[item].tree + item_key = item, item.tree # Elsewhere, tree is not part of the comparison + if item.rule.expansion and item_key in self.completed: + old_tree = self.completed[item_key].tree if old_tree == item.tree: is_empty = len(self.FIRST[item.rule.origin]) if is_empty: @@ -130,7 +131,7 @@ class Column: old_tree.children.append(item.tree) # old_tree.children.append(item.tree) else: - self.completed[item] = item + self.completed[item_key] = item self.to_reduce.append(item) else: if isinstance(item.expect, Terminal): diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 9f30b23..693e54f 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -51,7 +51,7 @@ class Parser: def parse(self, stream, start_symbol=None): # Define parser functions start_symbol = start_symbol or self.start_symbol - delayed_matches = defaultdict(set) + delayed_matches = defaultdict(list) match_after_ignore = set() text_line = 1 @@ -88,7 +88,7 @@ class Parser: for x in self.ignore: m = x.match(stream, i) if m: - delayed_matches[m.end()] |= set(to_scan) + delayed_matches[m.end()] += set(to_scan) if m.end() == len(stream): match_after_ignore.update(set(column.to_reduce)) @@ -103,13 +103,13 @@ class Parser: m = item.expect.match(stream, i) if m: t = Token(item.expect.name, m.group(0), i, text_line, text_column) - delayed_matches[m.end()].add(item.advance(t)) + delayed_matches[m.end()].append(item.advance(t)) s = m.group(0) for j in range(1, len(s)): m = item.expect.match(s[:-j]) if m: - delayed_matches[m.end()].add(item.advance(m.group(0))) + delayed_matches[m.end()].append(item.advance(m.group(0))) next_set = Column(i+1, self.FIRST) next_set.add(delayed_matches[i+1])