From bb22c84df37f3d30144c90f7a4d9416a76364a2d Mon Sep 17 00:00:00 2001
From: night199uk
Date: Tue, 4 Dec 2018 17:07:19 +0100
Subject: [PATCH 1/5] Speed up repetitive parsing using the same parser

When using the same parser repeatedly on small inputs, we incur
significant overhead by recreating the ForestVisitor on each parse.
We can cache the forest walker and re-use it by making it stateless.
We can also use __slots__ on all of the forest walkers to reduce
construction delay and function call overhead.
---
 lark/parsers/earley.py        |  4 ++--
 lark/parsers/earley_forest.py | 35 +++++++++++++++++++----------------
 lark/parsers/xearley.py       |  4 ++--
 tests/test_parser.py          |  6 +++---
 4 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py
index 7dfa26c..02bc5d4 100644
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -26,7 +26,6 @@ class Parser:
         analysis = GrammarAnalyzer(parser_conf)
         self.parser_conf = parser_conf
         self.resolve_ambiguity = resolve_ambiguity
-        self.forest_sum_visitor = forest_sum_visitor

         self.FIRST = analysis.FIRST
         self.callbacks = {}
@@ -41,6 +40,7 @@ class Parser:
             self.callbacks[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
             self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

+        self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks)
         self.term_matcher = term_matcher


@@ -203,7 +203,7 @@ class Parser:

         # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities
         # according to the rules.
-        return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go()
+        return self.forest_tree_visitor.go(solutions[0])

 class ApplyCallbacks(Transformer_InPlace):
     def __init__(self, postprocess):
diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py
index 0b1650f..e4e3e36 100644
--- a/lark/parsers/earley_forest.py
+++ b/lark/parsers/earley_forest.py
@@ -114,9 +114,7 @@ class ForestVisitor(object):

     Use this as a base when you need to walk the forest.
     """
-    def __init__(self, root):
-        self.root = root
-        self.result = None
+    __slots__ = ['result']

     def visit_token_node(self, node): pass
     def visit_symbol_node_in(self, node): pass
@@ -124,7 +122,8 @@ class ForestVisitor(object):
     def visit_packed_node_in(self, node): pass
     def visit_packed_node_out(self, node): pass

-    def go(self):
+    def go(self, root):
+        self.result = None
         # Visiting is a list of IDs of all symbol/intermediate nodes currently in
         # the stack. It serves two purposes: to detect when we 'recurse' in and out
         # of a symbol/intermediate so that we can process both up and down. Also,
@@ -134,7 +133,7 @@ class ForestVisitor(object):

         # We do not use recursion here to walk the Forest due to the limited
         # stack size in python. Therefore input_stack is essentially our stack.
-        input_stack = deque([self.root])
+        input_stack = deque([root])

         # It is much faster to cache these as locals since they are called
         # many times in large parses.
@@ -263,19 +262,21 @@ class ForestToTreeVisitor(ForestVisitor):
     implementation should be another ForestVisitor which sorts the children
     according to some priority mechanism.
""" - def __init__(self, root, forest_sum_visitor = ForestSumVisitor, callbacks = None): - super(ForestToTreeVisitor, self).__init__(root) - self.forest_sum_visitor = forest_sum_visitor - self.output_stack = deque() + __slots__ = ['forest_sum_visitor', 'output_stack', 'callbacks'] + def __init__(self, forest_sum_visitor = ForestSumVisitor, callbacks = None): + self.forest_sum_visitor = forest_sum_visitor() self.callbacks = callbacks - self.result = None + + def go(self, root): + self.output_stack = deque() + return super(ForestToTreeVisitor, self).go(root) def visit_token_node(self, node): self.output_stack[-1].append(node) def visit_symbol_node_in(self, node): if node.is_ambiguous and node.priority is None: - self.forest_sum_visitor(node).go() + self.forest_sum_visitor.go(node) return next(iter(node.children)) def visit_packed_node_in(self, node): @@ -311,11 +312,13 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor): This is mainly used by the test framework, to make it simpler to write tests ensuring the SPPF contains the right results. """ - def __init__(self, root, callbacks): - super(ForestToAmbiguousTreeVisitor, self).__init__(root) - self.output_stack = deque() + __slots__ = ['output_stack', 'callbacks'] + def __init__(self, callbacks): self.callbacks = callbacks - self.result = None + + def go(self, root): + self.output_stack = deque([]) + return super(ForestToAmbiguousTreeVisitor, self).go(root) def visit_token_node(self, node): self.output_stack[-1].children.append(node) @@ -347,4 +350,4 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor): if self.output_stack: self.output_stack[-1].children.append(result) else: - self.result = result \ No newline at end of file + self.result = result diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 4bdccc4..30729ef 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -33,7 +33,6 @@ class Parser: analysis = GrammarAnalyzer(parser_conf) self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity - self.forest_sum_visitor = forest_sum_visitor self.ignore = [Terminal(t) for t in ignore] self.complete_lex = complete_lex @@ -50,6 +49,7 @@ class Parser: self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] self.term_matcher = term_matcher + self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks) def parse(self, stream, start_symbol=None): start_symbol = NonTerminal(start_symbol or self.parser_conf.start) @@ -271,4 +271,4 @@ class Parser: # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities # according to the rules. 
- return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go() + return self.forest_tree_visitor.go(solutions[0]) diff --git a/tests/test_parser.py b/tests/test_parser.py index a54ecc4..e00041e 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -239,7 +239,7 @@ def _make_full_earley_test(LEXER): parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit') root_symbol = parser.parse('ab') - ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go() + ambig_tree = ForestToAmbiguousTreeVisitor(parser.parser.parser.callbacks).go(root_symbol) # print(ambig_tree.pretty()) self.assertEqual( ambig_tree.data, '_ambig') self.assertEqual( len(ambig_tree.children), 2) @@ -255,7 +255,7 @@ def _make_full_earley_test(LEXER): """ l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) root_symbol = l.parse('cde') - ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, l.parser.parser.callbacks).go() + ambig_tree = ForestToAmbiguousTreeVisitor(l.parser.parser.callbacks).go(root_symbol) # print(ambig_tree.pretty()) # tree = ApplyCallbacks(l.parser.parser.postprocess).transform(ambig_tree) @@ -302,7 +302,7 @@ def _make_full_earley_test(LEXER): """ parser = Lark(grammar, ambiguity='explicit', lexer=LEXER) root_symbol = parser.parse('fruit flies like bananas') - tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go() + tree = ForestToAmbiguousTreeVisitor(parser.parser.parser.callbacks).go(root_symbol) # tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree) expected = Tree('_ambig', [ From 8415fa26a3d3d2b81e64e4fe440faab15b53db49 Mon Sep 17 00:00:00 2001 From: night199uk Date: Tue, 4 Dec 2018 21:34:20 +0100 Subject: [PATCH 2/5] Add a pydot visualizer for the SPPF. 
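
A minimal usage sketch (illustrative, not part of the patch: it assumes
pydot is installed, and relies on parse() returning the SPPF root
symbol node when ambiguity='explicit', as the tests in PATCH 1/5 do):

    from lark import Lark
    from lark.parsers.earley_forest import ForestToPyDotVisitor

    parser = Lark('start: "a"+', parser='earley', ambiguity='explicit')
    root = parser.parse('aaa')                    # SPPF root SymbolNode
    ForestToPyDotVisitor().go(root, 'sppf.png')   # render the forest to a PNG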
--- lark/parsers/earley_forest.py | 95 +++++++++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 3 deletions(-) diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index e4e3e36..e5038d9 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -7,6 +7,7 @@ Full reference and more details is here: http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ """ +from random import randint from ..tree import Tree from ..exceptions import ParseError from ..lexer import Token @@ -15,6 +16,7 @@ from ..grammar import NonTerminal, Terminal from .earley_common import Column, Derivation from collections import deque +from importlib import import_module class ForestNode(object): pass @@ -61,7 +63,13 @@ class SymbolNode(ForestNode): return hash((self.s, self.start.i, self.end.i)) def __repr__(self): - symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name + if self.is_intermediate: + rule = self.s[0] + ptr = self.s[1] + names = [ "{}*".format(expansion.name) if index == ptr else expansion.name for index, expansion in enumerate(rule.expansion) ] + symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names)) + else: + symbol = self.s.name return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0) class PackedNode(ForestNode): @@ -105,8 +113,14 @@ class PackedNode(ForestNode): return self._hash def __repr__(self): - symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name - return "{%s, %d, %s, %s, %s}" % (symbol, self.start.i, self.left, self.right, self.priority if self.priority is not None else 0) + if isinstance(self.s, tuple): + rule = self.s[0] + ptr = self.s[1] + names = [ "{}*".format(expansion.name) if index == ptr else expansion.name for index, expansion in enumerate(rule.expansion) ] + symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names)) + else: + symbol = self.s.name + return "{%s, %d, %d}" % (symbol, self.start.i, self.priority if self.priority is not None else 0) class ForestVisitor(object): """ @@ -351,3 +365,78 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor): self.output_stack[-1].children.append(result) else: self.result = result + +class ForestToPyDotVisitor(ForestVisitor): + """ + A Forest visitor which writes the SPPF to a PNG. + + The SPPF can get really large, really quickly because + of the amount of meta-data it stores, so this is probably + only useful for trivial trees and learning how the SPPF + is structured. 
+ """ + def __init__(self, rankdir="TB"): + self.pydot = import_module('pydot') + self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir) + + def go(self, root, filename): + super(ForestToPyDotVisitor, self).go(root) + self.graph.write_png(filename) + + def visit_token_node(self, node): + graph_node_id = str(id(node)) + graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"')) + graph_node_color = 0x808080 + graph_node_style = "filled" + graph_node_shape = "polygon" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + + def visit_packed_node_in(self, node): + graph_node_id = str(id(node)) + graph_node_label = repr(node) + graph_node_color = 0x808080 + graph_node_style = "filled" + graph_node_shape = "diamond" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + return iter([node.left, node.right]) + + def visit_packed_node_out(self, node): + graph_node_id = str(id(node)) + graph_node = self.graph.get_node(graph_node_id)[0] + for child in [node.left, node.right]: + if child is not None: + child_graph_node_id = str(id(child)) + child_graph_node = self.graph.get_node(child_graph_node_id)[0] + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) + else: + #### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay. + child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890)) + child_graph_node_style = "invis" + child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None") + child_edge_style = "invis" + self.graph.add_node(child_graph_node) + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style)) + + def visit_symbol_node_in(self, node): + graph_node_id = str(id(node)) + graph_node_label = repr(node) + graph_node_color = 0x808080 + graph_node_style = "filled" + if node.is_intermediate: + graph_node_shape = "ellipse" + else: + graph_node_shape = "rectangle" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + return iter(node.children) + + def visit_symbol_node_out(self, node): + graph_node_id = str(id(node)) + graph_node = self.graph.get_node(graph_node_id)[0] + for child in node.children: + child_graph_node_id = str(id(child)) + child_graph_node = self.graph.get_node(child_graph_node_id)[0] + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) + From 8fa8ac36fc23c6a34f9af362405ca854b8e03e43 Mon Sep 17 00:00:00 2001 From: night199uk Date: Wed, 12 Dec 2018 14:45:52 +0100 Subject: [PATCH 3/5] Remove Earley Column We can replace Earley Columns with basic python sets for improved performance and simplicity. 
--- lark/parsers/earley.py | 61 ++++++++++++++++++-------------- lark/parsers/earley_common.py | 26 +++----------- lark/parsers/earley_forest.py | 10 +++--- lark/parsers/xearley.py | 66 +++++++++++++++++++---------------- 4 files changed, 80 insertions(+), 83 deletions(-) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 02bc5d4..7600915 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -16,7 +16,7 @@ from ..visitors import Transformer_InPlace, v_args from ..exceptions import ParseError, UnexpectedToken from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal -from .earley_common import Column, Item +from .earley_common import Item from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode from collections import deque, defaultdict @@ -48,19 +48,24 @@ class Parser: # Define parser functions start_symbol = NonTerminal(start_symbol or self.parser_conf.start) match = self.term_matcher - held_completions = defaultdict(list) + + # Held Completions (H in E.Scotts paper). + held_completions = {} + + # Cache for nodes & tokens created in a particular parse step. node_cache = {} token_cache = {} + columns = [] def make_symbol_node(s, start, end): - label = (s, start.i, end.i) + label = (s, start, end) if label in node_cache: node = node_cache[label] else: node = node_cache[label] = SymbolNode(s, start, end) return node - def predict_and_complete(column, to_scan): + def predict_and_complete(i, to_scan): """The core Earley Predictor and Completer. At each stage of the input, we handling any completed items (things @@ -70,15 +75,16 @@ class Parser: which can be added to the scan list for the next scanner cycle.""" held_completions.clear() + column = columns[i] # R (items) = Ei (column.items) - items = deque(column.items) + items = deque(column) while items: item = items.pop() # remove an element, A say, from R ### The Earley completer if item.is_complete: ### (item.s == string) if item.node is None: - item.node = make_symbol_node(item.s, item.start, column) + item.node = make_symbol_node(item.s, item.start, i) item.node.add_family(item.s, item.rule, item.start, None, None) # Empty has 0 length. If we complete an empty symbol in a particular @@ -86,19 +92,19 @@ class Parser: # any predictions that result, that themselves require empty. Avoids # infinite recursion on empty symbols. # held_completions is 'H' in E.Scott's paper. 
- is_empty_item = item.start.i == column.i + is_empty_item = item.start == i if is_empty_item: held_completions[item.rule.origin] = item.node - originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] + originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] for originator in originators: new_item = originator.advance() - new_item.node = make_symbol_node(new_item.s, originator.start, column) + new_item.node = make_symbol_node(new_item.s, originator.start, i) new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) if new_item.expect in self.TERMINALS: # Add (B :: aC.B, h, y) to Q to_scan.add(new_item) - elif new_item not in column.items: + elif new_item not in column: # Add (B :: aC.B, h, y) to Ei and R column.add(new_item) items.append(new_item) @@ -107,24 +113,24 @@ class Parser: elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) new_items = [] for rule in self.predictions[item.expect]: - new_item = Item(rule, 0, column) + new_item = Item(rule, 0, i) new_items.append(new_item) # Process any held completions (H). if item.expect in held_completions: new_item = item.advance() - new_item.node = make_symbol_node(new_item.s, item.start, column) + new_item.node = make_symbol_node(new_item.s, item.start, i) new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) new_items.append(new_item) for new_item in new_items: if new_item.expect in self.TERMINALS: to_scan.add(new_item) - elif new_item not in column.items: + elif new_item not in column: column.add(new_item) items.append(new_item) - def scan(i, token, column, to_scan): + def scan(i, token, to_scan): """The core Earley Scanner. This is a custom implementation of the scanner that uses the @@ -132,12 +138,14 @@ class Parser: Earley predictor, based on the previously completed tokens. This ensures that at each phase of the parse we have a custom lexer context, allowing for more complex ambiguities.""" - next_set = Column(i+1, self.FIRST) next_to_scan = set() + next_set = set() + columns.append(next_set) + for item in set(to_scan): if match(item.expect, token): new_item = item.advance() - new_item.node = make_symbol_node(new_item.s, new_item.start, column) + new_item.node = make_symbol_node(new_item.s, new_item.start, i) new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) if new_item.expect in self.TERMINALS: @@ -151,11 +159,10 @@ class Parser: expect = {i.expect.name for i in to_scan} raise UnexpectedToken(token, expect, considered_rules = set(to_scan)) - return next_set, next_to_scan + return next_to_scan # Main loop starts - column0 = Column(0, self.FIRST) - column = column0 + columns.append(set()) ## The scan buffer. 'Q' in E.Scott's paper. to_scan = set() @@ -164,32 +171,34 @@ class Parser: # Add predicted items to the first Earley set (for the predictor) if they # result in a non-terminal, or the scanner if they result in a terminal. for rule in self.predictions[start_symbol]: - item = Item(rule, 0, column0) + item = Item(rule, 0, 0) if item.expect in self.TERMINALS: to_scan.add(item) else: - column.add(item) + columns[0].add(item) ## The main Earley loop. # Run the Prediction/Completion cycle for any Items in the current Earley set. 
# Completions will be added to the SPPF tree, and predictions will be recursively # processed down to terminals/empty nodes to be added to the scanner for the next # step. - for i, token in enumerate(stream): - predict_and_complete(column, to_scan) + i = 0 + for token in stream: + predict_and_complete(i, to_scan) # Clear the node_cache and token_cache, which are only relevant for each # step in the Earley pass. node_cache.clear() token_cache.clear() - column, to_scan = scan(i, token, column, to_scan) + to_scan = scan(i, token, to_scan) + i += 1 - predict_and_complete(column, to_scan) + predict_and_complete(i, to_scan) ## Column is now the final column in the parse. If the parse was successful, the start # symbol should have been completed in the last step of the Earley cycle, and will be in # this column. Find the item for the start_symbol, which is the root of the SPPF tree. - solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] + solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if not solutions: raise ParseError('Incomplete parse: Could not find a solution to input') diff --git a/lark/parsers/earley_common.py b/lark/parsers/earley_common.py index d17abe4..74dd388 100644 --- a/lark/parsers/earley_common.py +++ b/lark/parsers/earley_common.py @@ -35,6 +35,7 @@ class Item(object): __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash') def __init__(self, rule, ptr, start): + assert isinstance(start, int), "start is not an int" self.is_complete = len(rule.expansion) == ptr self.rule = rule # rule self.ptr = ptr # ptr @@ -46,35 +47,16 @@ class Item(object): else: self.s = (rule, ptr) self.expect = rule.expansion[ptr] - self._hash = hash((self.s, self.start.i)) + self._hash = hash((self.s, self.start)) def advance(self): return self.__class__(self.rule, self.ptr + 1, self.start) def __eq__(self, other): - return self is other or (self.s == other.s and self.start.i == other.start.i) + return self is other or (self.s == other.s and self.start == other.start) def __hash__(self): return self._hash def __repr__(self): - return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i) - -class Column: - "An entry in the table, aka Earley Chart. Contains lists of items." - def __init__(self, i, FIRST): - self.i = i - self.items = set() - self.FIRST = FIRST - - def add(self, item): - """Sort items into scan/predict/reduce newslists - - Makes sure only unique items are added. 
- """ - self.items.add(item) - - def __bool__(self): - return bool(self.items) - - __nonzero__ = __bool__ # Py2 backwards-compatibility + return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start) diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index e5038d9..730ebe1 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -13,7 +13,7 @@ from ..exceptions import ParseError from ..lexer import Token from ..utils import Str from ..grammar import NonTerminal, Terminal -from .earley_common import Column, Derivation +from .earley_common import Derivation from collections import deque from importlib import import_module @@ -60,7 +60,7 @@ class SymbolNode(ForestNode): return self is other or (self.s == other.s and self.start == other.start and self.end is other.end) def __hash__(self): - return hash((self.s, self.start.i, self.end.i)) + return hash((self.s, self.start, self.end)) def __repr__(self): if self.is_intermediate: @@ -70,7 +70,7 @@ class SymbolNode(ForestNode): symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names)) else: symbol = self.s.name - return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0) + return "(%s, %d, %d, %d)" % (symbol, self.start, self.end, self.priority if self.priority is not None else 0) class PackedNode(ForestNode): """ @@ -85,7 +85,7 @@ class PackedNode(ForestNode): self.left = left self.right = right self.priority = None - self._hash = hash((self.s, self.start.i, self.left, self.right)) + self._hash = hash((self.s, self.start, self.left, self.right)) @property def is_empty(self): @@ -120,7 +120,7 @@ class PackedNode(ForestNode): symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names)) else: symbol = self.s.name - return "{%s, %d, %d}" % (symbol, self.start.i, self.priority if self.priority is not None else 0) + return "{%s, %d, %d}" % (symbol, self.start, self.priority if self.priority is not None else 0) class ForestVisitor(object): """ diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 30729ef..57aab61 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -24,7 +24,7 @@ from ..tree import Tree from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal, Terminal from .earley import ApplyCallbacks -from .earley_common import Column, Item +from .earley_common import Item from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode @@ -44,12 +44,13 @@ class Parser: # the slow 'isupper' in is_terminal. self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } + for rule in parser_conf.rules: self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None) self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] - self.term_matcher = term_matcher self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks) + self.term_matcher = term_matcher def parse(self, stream, start_symbol=None): start_symbol = NonTerminal(start_symbol or self.parser_conf.start) @@ -62,19 +63,20 @@ class Parser: # Cache for nodes & tokens created in a particular parse step. 
node_cache = {} token_cache = {} + columns = [] text_line = 1 text_column = 1 def make_symbol_node(s, start, end): - label = (s, start.i, end.i) + label = (s, start, end) if label in node_cache: node = node_cache[label] else: node = node_cache[label] = SymbolNode(s, start, end) return node - def predict_and_complete(column, to_scan): + def predict_and_complete(i, to_scan): """The core Earley Predictor and Completer. At each stage of the input, we handling any completed items (things @@ -84,15 +86,16 @@ class Parser: which can be added to the scan list for the next scanner cycle.""" held_completions.clear() + column = columns[i] # R (items) = Ei (column.items) - items = deque(column.items) + items = deque(column) while items: item = items.pop() # remove an element, A say, from R ### The Earley completer if item.is_complete: ### (item.s == string) if item.node is None: - item.node = make_symbol_node(item.s, item.start, column) + item.node = make_symbol_node(item.s, item.start, i) item.node.add_family(item.s, item.rule, item.start, None, None) # Empty has 0 length. If we complete an empty symbol in a particular @@ -100,19 +103,19 @@ class Parser: # any predictions that result, that themselves require empty. Avoids # infinite recursion on empty symbols. # held_completions is 'H' in E.Scott's paper. - is_empty_item = item.start.i == column.i + is_empty_item = item.start == i if is_empty_item: held_completions[item.rule.origin] = item.node - originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] + originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] for originator in originators: new_item = originator.advance() - new_item.node = make_symbol_node(new_item.s, originator.start, column) + new_item.node = make_symbol_node(new_item.s, originator.start, i) new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) if new_item.expect in self.TERMINALS: # Add (B :: aC.B, h, y) to Q to_scan.add(new_item) - elif new_item not in column.items: + elif new_item not in column: # Add (B :: aC.B, h, y) to Ei and R column.add(new_item) items.append(new_item) @@ -121,24 +124,24 @@ class Parser: elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) new_items = [] for rule in self.predictions[item.expect]: - new_item = Item(rule, 0, column) + new_item = Item(rule, 0, i) new_items.append(new_item) # Process any held completions (H). if item.expect in held_completions: new_item = item.advance() - new_item.node = make_symbol_node(new_item.s, item.start, column) + new_item.node = make_symbol_node(new_item.s, item.start, i) new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) new_items.append(new_item) for new_item in new_items: if new_item.expect in self.TERMINALS: to_scan.add(new_item) - elif new_item not in column.items: + elif new_item not in column: column.add(new_item) items.append(new_item) - def scan(i, column, to_scan): + def scan(i, to_scan): """The core Earley Scanner. 
This is a custom implementation of the scanner that uses the @@ -157,7 +160,7 @@ class Parser: m = match(item.expect, stream, i) if m: t = Token(item.expect.name, m.group(0), i, text_line, text_column) - delayed_matches[m.end()].append( (item, column, t) ) + delayed_matches[m.end()].append( (item, i, t) ) if self.complete_lex: s = m.group(0) @@ -165,7 +168,7 @@ class Parser: m = match(item.expect, s[:-j]) if m: t = Token(item.expect.name, m.group(0), i, text_line, text_column) - delayed_matches[i+m.end()].append( (item, column, t) ) + delayed_matches[i+m.end()].append( (item, i, t) ) # Remove any items that successfully matched in this pass from the to_scan buffer. # This ensures we don't carry over tokens that already matched, if we're ignoring below. @@ -179,13 +182,14 @@ class Parser: m = match(x, stream, i) if m: # Carry over any items still in the scan buffer, to past the end of the ignored items. - delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ]) + delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ]) # If we're ignoring up to the end of the file, # carry over the start symbol if it already completed. - delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol]) + delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol]) - next_set = Column(i + 1, self.FIRST) # Ei+1 next_to_scan = set() + next_set = set() + columns.append(next_set) ## 4) Process Tokens from delayed_matches. # This is the core of the Earley scanner. Create an SPPF node for each Token, @@ -195,7 +199,8 @@ class Parser: for item, start, token in delayed_matches[i+1]: if token is not None: new_item = item.advance() - new_item.node = make_symbol_node(new_item.s, new_item.start, column) +# new_item.start = start # Should we update this to account for gaps due to ignores? + new_item.node = make_symbol_node(new_item.s, new_item.start, i) new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) else: new_item = item @@ -212,11 +217,10 @@ class Parser: if not next_set and not delayed_matches and not next_to_scan: raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan)) - return next_set, next_to_scan + return next_to_scan # Main loop starts - column0 = Column(0, self.FIRST) - column = column0 + columns.append(set()) ## The scan buffer. 'Q' in E.Scott's paper. to_scan = set() @@ -225,38 +229,40 @@ class Parser: # Add predicted items to the first Earley set (for the predictor) if they # result in a non-terminal, or the scanner if they result in a terminal. for rule in self.predictions[start_symbol]: - item = Item(rule, 0, column0) + item = Item(rule, 0, 0) if item.expect in self.TERMINALS: to_scan.add(item) else: - column.add(item) + columns[0].add(item) ## The main Earley loop. # Run the Prediction/Completion cycle for any Items in the current Earley set. # Completions will be added to the SPPF tree, and predictions will be recursively # processed down to terminals/empty nodes to be added to the scanner for the next # step. - for i, token in enumerate(stream): - predict_and_complete(column, to_scan) + i = 0 + for token in stream: + predict_and_complete(i, to_scan) # Clear the node_cache and token_cache, which are only relevant for each # step in the Earley pass. 
node_cache.clear() token_cache.clear() - column, to_scan = scan(i, column, to_scan) + to_scan = scan(i, to_scan) if token == '\n': text_line += 1 text_column = 1 else: text_column += 1 + i += 1 - predict_and_complete(column, to_scan) + predict_and_complete(i, to_scan) ## Column is now the final column in the parse. If the parse was successful, the start # symbol should have been completed in the last step of the Earley cycle, and will be in # this column. Find the item for the start_symbol, which is the root of the SPPF tree. - solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] + solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if not solutions: expected_tokens = [t.expect for t in to_scan] From 637f12110979999afc3bab3afbd514ce5e651bd2 Mon Sep 17 00:00:00 2001 From: night199uk Date: Fri, 14 Dec 2018 12:49:29 +0100 Subject: [PATCH 4/5] Cleanup unused Derivation --- lark/parsers/earley_common.py | 17 ----------------- lark/parsers/earley_forest.py | 1 - 2 files changed, 18 deletions(-) diff --git a/lark/parsers/earley_common.py b/lark/parsers/earley_common.py index 74dd388..e13c7c3 100644 --- a/lark/parsers/earley_common.py +++ b/lark/parsers/earley_common.py @@ -13,23 +13,6 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -## for recursive repr -from ..tree import Tree - -class Derivation(Tree): - def __init__(self, rule, children = None): - Tree.__init__(self, 'drv', children if children is not None else []) - self.meta.rule = rule - self._hash = None - - def __repr__(self, indent = 0): - return 'Derivation(%s, %s, %s)' % (self.data, self.rule.origin, '...') - - def __hash__(self): - if self._hash is None: - self._hash = Tree.__hash__(self) - return self._hash - class Item(object): "An Earley Item, the atom of the algorithm." 
diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index 730ebe1..f6bea75 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -13,7 +13,6 @@ from ..exceptions import ParseError from ..lexer import Token from ..utils import Str from ..grammar import NonTerminal, Terminal -from .earley_common import Derivation from collections import deque from importlib import import_module From 04d90fa9165741333c2e394592a64e1c3966aa14 Mon Sep 17 00:00:00 2001 From: night199uk Date: Tue, 18 Dec 2018 05:57:57 +0100 Subject: [PATCH 5/5] Implement Joop Leo's optimizations for right recursion performance --- lark/parsers/earley.py | 137 +++++++++++++++++++++++++++------- lark/parsers/earley_common.py | 38 +++++++++- lark/parsers/earley_forest.py | 67 +++++++++++------ lark/parsers/xearley.py | 137 +++++++++++++++++++++++++++------- 4 files changed, 303 insertions(+), 76 deletions(-) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 7600915..66ab903 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -16,7 +16,7 @@ from ..visitors import Transformer_InPlace, v_args from ..exceptions import ParseError, UnexpectedToken from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal -from .earley_common import Item +from .earley_common import Item, TransitiveItem from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode from collections import deque, defaultdict @@ -28,6 +28,7 @@ class Parser: self.resolve_ambiguity = resolve_ambiguity self.FIRST = analysis.FIRST + self.NULLABLE = analysis.NULLABLE self.callbacks = {} self.predictions = {} @@ -56,14 +57,68 @@ class Parser: node_cache = {} token_cache = {} columns = [] - - def make_symbol_node(s, start, end): - label = (s, start, end) - if label in node_cache: - node = node_cache[label] + transitives = [] + + def is_quasi_complete(item): + if item.is_complete: + return True + + quasi = item.advance() + while not quasi.is_complete: + symbol = quasi.expect + if symbol not in self.NULLABLE: + return False + if quasi.rule.origin == start_symbol and symbol == start_symbol: + return False + quasi = quasi.advance() + return True + + def create_leo_transitives(item, trule, previous, visited = None): + if visited is None: + visited = set() + + if item.rule.origin in transitives[item.start]: + previous = trule = transitives[item.start][item.rule.origin] + return trule, previous + + is_empty_rule = not self.FIRST[item.rule.origin] + if is_empty_rule: + return trule, previous + + originator = None + for key in columns[item.start]: + if key.expect is not None and key.expect == item.rule.origin: + if originator is not None: + return trule, previous + originator = key + + if originator is None: + return trule, previous + + if originator in visited: + return trule, previous + + visited.add(originator) + if not is_quasi_complete(originator): + return trule, previous + + trule = originator.advance() + if originator.start != item.start: + visited.clear() + + trule, previous = create_leo_transitives(originator, trule, previous, visited) + if trule is None: + return trule, previous + + titem = None + if previous is not None: + titem = TransitiveItem(item.rule.origin, trule, originator, previous.column) + previous.next_titem = titem else: - node = node_cache[label] = SymbolNode(s, start, end) - return node + titem = TransitiveItem(item.rule.origin, trule, originator, item.start) + + previous = transitives[item.start][item.rule.origin] = titem + return trule, previous 
def predict_and_complete(i, to_scan): """The core Earley Predictor and Completer. @@ -84,23 +139,26 @@ class Parser: ### The Earley completer if item.is_complete: ### (item.s == string) if item.node is None: - item.node = make_symbol_node(item.s, item.start, i) + label = (item.s, item.start, i) + item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) item.node.add_family(item.s, item.rule, item.start, None, None) - # Empty has 0 length. If we complete an empty symbol in a particular - # parse step, we need to be able to use that same empty symbol to complete - # any predictions that result, that themselves require empty. Avoids - # infinite recursion on empty symbols. - # held_completions is 'H' in E.Scott's paper. - is_empty_item = item.start == i - if is_empty_item: - held_completions[item.rule.origin] = item.node - - originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] - for originator in originators: - new_item = originator.advance() - new_item.node = make_symbol_node(new_item.s, originator.start, i) - new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) + create_leo_transitives(item, None, None) + + ###R Joop Leo right recursion Completer + if item.rule.origin in transitives[item.start]: + transitive = transitives[item.start][item.s] + if transitive.previous in transitives[transitive.column]: + root_transitive = transitives[transitive.column][transitive.previous] + else: + root_transitive = transitive + + label = (root_transitive.s, root_transitive.start, i) + node = vn = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + vn.add_path(root_transitive, item.node) + + new_item = Item(transitive.rule, transitive.ptr, transitive.start) + new_item.node = vn if new_item.expect in self.TERMINALS: # Add (B :: aC.B, h, y) to Q to_scan.add(new_item) @@ -108,6 +166,30 @@ class Parser: # Add (B :: aC.B, h, y) to Ei and R column.add(new_item) items.append(new_item) + ###R Regular Earley completer + else: + # Empty has 0 length. If we complete an empty symbol in a particular + # parse step, we need to be able to use that same empty symbol to complete + # any predictions that result, that themselves require empty. Avoids + # infinite recursion on empty symbols. + # held_completions is 'H' in E.Scott's paper. + is_empty_item = item.start == i + if is_empty_item: + held_completions[item.rule.origin] = item.node + + originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] + for originator in originators: + new_item = originator.advance() + label = (new_item.s, originator.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node) + if new_item.expect in self.TERMINALS: + # Add (B :: aC.B, h, y) to Q + to_scan.add(new_item) + elif new_item not in column: + # Add (B :: aC.B, h, y) to Ei and R + column.add(new_item) + items.append(new_item) ### The Earley predictor elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) @@ -119,7 +201,8 @@ class Parser: # Process any held completions (H). 
if item.expect in held_completions: new_item = item.advance() - new_item.node = make_symbol_node(new_item.s, item.start, i) + label = (new_item.s, item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) new_items.append(new_item) @@ -141,11 +224,14 @@ class Parser: next_to_scan = set() next_set = set() columns.append(next_set) + next_transitives = dict() + transitives.append(next_transitives) for item in set(to_scan): if match(item.expect, token): new_item = item.advance() - new_item.node = make_symbol_node(new_item.s, new_item.start, i) + label = (new_item.s, new_item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) if new_item.expect in self.TERMINALS: @@ -163,6 +249,7 @@ class Parser: # Main loop starts columns.append(set()) + transitives.append(dict()) ## The scan buffer. 'Q' in E.Scott's paper. to_scan = set() diff --git a/lark/parsers/earley_common.py b/lark/parsers/earley_common.py index e13c7c3..6bd614b 100644 --- a/lark/parsers/earley_common.py +++ b/lark/parsers/earley_common.py @@ -13,12 +13,13 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com +from ..grammar import NonTerminal, Terminal + class Item(object): "An Earley Item, the atom of the algorithm." - __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash') + __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash') def __init__(self, rule, ptr, start): - assert isinstance(start, int), "start is not an int" self.is_complete = len(rule.expansion) == ptr self.rule = rule # rule self.ptr = ptr # ptr @@ -27,13 +28,15 @@ class Item(object): if self.is_complete: self.s = rule.origin self.expect = None + self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None else: self.s = (rule, ptr) self.expect = rule.expansion[ptr] + self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None self._hash = hash((self.s, self.start)) def advance(self): - return self.__class__(self.rule, self.ptr + 1, self.start) + return Item(self.rule, self.ptr + 1, self.start) def __eq__(self, other): return self is other or (self.s == other.s and self.start == other.start) @@ -42,4 +45,31 @@ class Item(object): return self._hash def __repr__(self): - return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start) + before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] ) + after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] ) + symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after)) + return '%s (%d)' % (symbol, self.start) + + +class TransitiveItem(Item): + __slots__ = ('recognized', 'reduction', 'column', 'next_titem') + def __init__(self, recognized, trule, originator, start): + super(TransitiveItem, self).__init__(trule.rule, trule.ptr, trule.start) + self.recognized = recognized + self.reduction = originator + self.column = start + self.next_titem = None + self._hash = hash((self.s, self.start, self.recognized)) + + def __eq__(self, other): + if not isinstance(other, TransitiveItem): + return False + return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start 
and self.recognized == other.recognized) + + def __hash__(self): + return self._hash + + def __repr__(self): + before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] ) + after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] ) + return '{} : {} -> {}* {} ({}, {})'.format(self.recognized.name, self.rule.origin.name, ' '.join(before), ' '.join(after), self.column, self.start) diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index f6bea75..dda2dcb 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -12,7 +12,7 @@ from ..tree import Tree from ..exceptions import ParseError from ..lexer import Token from ..utils import Str -from ..grammar import NonTerminal, Terminal +from ..grammar import NonTerminal, Terminal, Symbol from collections import deque from importlib import import_module @@ -34,42 +34,65 @@ class SymbolNode(ForestNode): Hence a Symbol Node with a single child is unambiguous. """ - __slots__ = ('s', 'start', 'end', 'children', 'priority', 'is_intermediate') + __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash') def __init__(self, s, start, end): self.s = s self.start = start self.end = end - self.children = set() + self._children = set() + self.paths = set() + self.paths_loaded = False self.priority = None self.is_intermediate = isinstance(s, tuple) + self._hash = hash((self.s, self.start, self.end)) def add_family(self, lr0, rule, start, left, right): - self.children.add(PackedNode(self, lr0, rule, start, left, right)) + self._children.add(PackedNode(self, lr0, rule, start, left, right)) + + def add_path(self, transitive, node): + self.paths.add((transitive, node)) + + def load_paths(self): + for transitive, node in self.paths: + if transitive.next_titem is not None: + vn = SymbolNode(transitive.next_titem.s, transitive.next_titem.start, self.end) + vn.add_path(transitive.next_titem, node) + self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn) + else: + self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node) + self.paths_loaded = True @property def is_ambiguous(self): return len(self.children) > 1 + @property + def children(self): + if not self.paths_loaded: + self.load_paths() + return self._children + def __iter__(self): - return iter(self.children) + return iter(self._children) def __eq__(self, other): if not isinstance(other, SymbolNode): return False - return self is other or (self.s == other.s and self.start == other.start and self.end is other.end) + return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.end is other.end) def __hash__(self): - return hash((self.s, self.start, self.end)) + return self._hash def __repr__(self): if self.is_intermediate: rule = self.s[0] ptr = self.s[1] - names = [ "{}*".format(expansion.name) if index == ptr else expansion.name for index, expansion in enumerate(rule.expansion) ] - symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names)) + before = ( expansion.name for expansion in rule.expansion[:ptr] ) + after = ( expansion.name for expansion in rule.expansion[ptr:] ) + symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after)) else: symbol = self.s.name - return "(%s, %d, %d, %d)" % (symbol, self.start, self.end, self.priority if 
self.priority is not None else 0) + return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority if self.priority is not None else 0) class PackedNode(ForestNode): """ @@ -115,11 +138,12 @@ class PackedNode(ForestNode): if isinstance(self.s, tuple): rule = self.s[0] ptr = self.s[1] - names = [ "{}*".format(expansion.name) if index == ptr else expansion.name for index, expansion in enumerate(rule.expansion) ] - symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names)) + before = ( expansion.name for expansion in rule.expansion[:ptr] ) + after = ( expansion.name for expansion in rule.expansion[ptr:] ) + symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after)) else: symbol = self.s.name - return "{%s, %d, %d}" % (symbol, self.start, self.priority if self.priority is not None else 0) + return "({}, {}, {})".format(symbol, self.start, self.priority) class ForestVisitor(object): """ @@ -182,8 +206,8 @@ class ForestVisitor(object): current_id = id(current) if current_id in visiting: - if isinstance(current, PackedNode): vpno(current) - else: vsno(current) + if isinstance(current, PackedNode): vpno(current) + else: vsno(current) input_stack.pop() visiting.remove(current_id) continue @@ -226,7 +250,7 @@ class ForestSumVisitor(ForestVisitor): def visit_symbol_node_out(self, node): node.priority = max(child.priority for child in node.children) - node.children = sorted(node.children, reverse = True) + node._children = sorted(node.children, reverse = True) class ForestAntiscoreSumVisitor(ForestSumVisitor): """ @@ -240,7 +264,7 @@ class ForestAntiscoreSumVisitor(ForestSumVisitor): """ def visit_symbol_node_out(self, node): node.priority = min(child.priority for child in node.children) - node.children = sorted(node.children, key=AntiscoreSumComparator, reverse = True) + node._children = sorted(node.children, key=AntiscoreSumComparator, reverse = True) class AntiscoreSumComparator(object): """ @@ -342,7 +366,7 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor): return iter(node.children) def visit_symbol_node_out(self, node): - if node.is_ambiguous: + if not node.is_intermediate and node.is_ambiguous: result = self.output_stack.pop() if self.output_stack: self.output_stack[-1].children.append(result) @@ -386,8 +410,8 @@ class ForestToPyDotVisitor(ForestVisitor): graph_node_id = str(id(node)) graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"')) graph_node_color = 0x808080 - graph_node_style = "filled" - graph_node_shape = "polygon" + graph_node_style = "\"filled,rounded\"" + graph_node_shape = "diamond" graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) self.graph.add_node(graph_node) @@ -422,7 +446,7 @@ class ForestToPyDotVisitor(ForestVisitor): graph_node_id = str(id(node)) graph_node_label = repr(node) graph_node_color = 0x808080 - graph_node_style = "filled" + graph_node_style = "\"filled\"" if node.is_intermediate: graph_node_shape = "ellipse" else: @@ -438,4 +462,3 @@ class ForestToPyDotVisitor(ForestVisitor): child_graph_node_id = str(id(child)) child_graph_node = self.graph.get_node(child_graph_node_id)[0] self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) - diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 57aab61..acd5d25 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -24,7 +24,7 @@ from ..tree import Tree from .grammar_analysis import GrammarAnalyzer from ..grammar 
import NonTerminal, Terminal from .earley import ApplyCallbacks -from .earley_common import Item +from .earley_common import Item, TransitiveItem from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode @@ -37,6 +37,7 @@ class Parser: self.complete_lex = complete_lex self.FIRST = analysis.FIRST + self.NULLABLE = analysis.NULLABLE self.callbacks = {} self.predictions = {} @@ -64,17 +65,71 @@ class Parser: node_cache = {} token_cache = {} columns = [] + transitives = [] text_line = 1 text_column = 1 - def make_symbol_node(s, start, end): - label = (s, start, end) - if label in node_cache: - node = node_cache[label] + def is_quasi_complete(item): + if item.is_complete: + return True + + quasi = item.advance() + while not quasi.is_complete: + symbol = quasi.expect + if symbol not in self.NULLABLE: + return False + if quasi.rule.origin == start_symbol and symbol == start_symbol: + return False + quasi = quasi.advance() + return True + + def create_leo_transitives(item, trule, previous, visited = None): + if visited is None: + visited = set() + + if item.rule.origin in transitives[item.start]: + previous = trule = transitives[item.start][item.rule.origin] + return trule, previous + + is_empty_rule = not self.FIRST[item.rule.origin] + if is_empty_rule: + return trule, previous + + originator = None + for key in columns[item.start]: + if key.expect is not None and key.expect == item.rule.origin: + if originator is not None: + return trule, previous + originator = key + + if originator is None: + return trule, previous + + if originator in visited: + return trule, previous + + visited.add(originator) + if not is_quasi_complete(originator): + return trule, previous + + trule = originator.advance() + if originator.start != item.start: + visited.clear() + + trule, previous = create_leo_transitives(originator, trule, previous, visited) + if trule is None: + return trule, previous + + titem = None + if previous is not None: + titem = TransitiveItem(item.rule.origin, trule, originator, previous.column) + previous.next_titem = titem else: - node = node_cache[label] = SymbolNode(s, start, end) - return node + titem = TransitiveItem(item.rule.origin, trule, originator, item.start) + + previous = transitives[item.start][item.rule.origin] = titem + return trule, previous def predict_and_complete(i, to_scan): """The core Earley Predictor and Completer. @@ -95,23 +150,26 @@ class Parser: ### The Earley completer if item.is_complete: ### (item.s == string) if item.node is None: - item.node = make_symbol_node(item.s, item.start, i) + label = (item.s, item.start, i) + item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) item.node.add_family(item.s, item.rule, item.start, None, None) - # Empty has 0 length. If we complete an empty symbol in a particular - # parse step, we need to be able to use that same empty symbol to complete - # any predictions that result, that themselves require empty. Avoids - # infinite recursion on empty symbols. - # held_completions is 'H' in E.Scott's paper. 
- is_empty_item = item.start == i - if is_empty_item: - held_completions[item.rule.origin] = item.node - - originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] - for originator in originators: - new_item = originator.advance() - new_item.node = make_symbol_node(new_item.s, originator.start, i) - new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) + create_leo_transitives(item, None, None) + + ###R Joop Leo right recursion Completer + if item.rule.origin in transitives[item.start]: + transitive = transitives[item.start][item.s] + if transitive.previous in transitives[transitive.column]: + root_transitive = transitives[transitive.column][transitive.previous] + else: + root_transitive = transitive + + label = (root_transitive.s, root_transitive.start, i) + node = vn = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + vn.add_path(root_transitive, item.node) + + new_item = Item(transitive.rule, transitive.ptr, transitive.start) + new_item.node = vn if new_item.expect in self.TERMINALS: # Add (B :: aC.B, h, y) to Q to_scan.add(new_item) @@ -119,6 +177,30 @@ class Parser: # Add (B :: aC.B, h, y) to Ei and R column.add(new_item) items.append(new_item) + ###R Regular Earley completer + else: + # Empty has 0 length. If we complete an empty symbol in a particular + # parse step, we need to be able to use that same empty symbol to complete + # any predictions that result, that themselves require empty. Avoids + # infinite recursion on empty symbols. + # held_completions is 'H' in E.Scott's paper. + is_empty_item = item.start == i + if is_empty_item: + held_completions[item.rule.origin] = item.node + + originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] + for originator in originators: + new_item = originator.advance() + label = (new_item.s, originator.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node) + if new_item.expect in self.TERMINALS: + # Add (B :: aC.B, h, y) to Q + to_scan.add(new_item) + elif new_item not in column: + # Add (B :: aC.B, h, y) to Ei and R + column.add(new_item) + items.append(new_item) ### The Earley predictor elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) @@ -130,7 +212,8 @@ class Parser: # Process any held completions (H). if item.expect in held_completions: new_item = item.advance() - new_item.node = make_symbol_node(new_item.s, item.start, i) + label = (new_item.s, item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) new_items.append(new_item) @@ -190,6 +273,8 @@ class Parser: next_to_scan = set() next_set = set() columns.append(next_set) + next_transitives = dict() + transitives.append(next_transitives) ## 4) Process Tokens from delayed_matches. # This is the core of the Earley scanner. Create an SPPF node for each Token, @@ -199,8 +284,8 @@ class Parser: for item, start, token in delayed_matches[i+1]: if token is not None: new_item = item.advance() -# new_item.start = start # Should we update this to account for gaps due to ignores? 
-                    new_item.node = make_symbol_node(new_item.s, new_item.start, i)
+                    label = (new_item.s, new_item.start, i)
+                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                     new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)
                 else:
                     new_item = item
@@ -221,6 +306,7 @@ class Parser:

         # Main loop starts
         columns.append(set())
+        transitives.append(dict())

         ## The scan buffer. 'Q' in E.Scott's paper.
         to_scan = set()
@@ -248,6 +334,6 @@ class Parser:
             # step in the Earley pass.
             node_cache.clear()
             token_cache.clear()
             to_scan = scan(i, to_scan)

             if token == '\n':
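
A quick way to see what PATCH 5/5 targets (an illustrative timing
sketch, not part of the patch; absolute numbers vary by machine):
Leo's transitive items make right-recursive grammars parse in roughly
linear time, where the plain completer re-walks every earlier Earley
set on each completion:

    import time
    from lark import Lark

    # Right-recursive rule: without Leo items, each completion cascades
    # back through all previous columns.
    parser = Lark('start: "a" start | "a"', parser='earley')

    t0 = time.time()
    parser.parse('a' * 2000)
    print('parsed in %.3fs' % (time.time() - t0))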