diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py
index c514f13..cb33afa 100644
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -16,17 +16,19 @@ from ..visitors import Transformer_InPlace, v_args
 from ..exceptions import ParseError, UnexpectedToken
 from .grammar_analysis import GrammarAnalyzer
 from ..grammar import NonTerminal
-from .earley_common import Column, Item
-from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor
+from .earley_common import Item, TransitiveItem
+from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode
+
+from collections import deque, defaultdict

 class Parser:
     def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor):
         analysis = GrammarAnalyzer(parser_conf)
         self.parser_conf = parser_conf
         self.resolve_ambiguity = resolve_ambiguity
-        self.forest_sum_visitor = forest_sum_visitor
         self.FIRST = analysis.FIRST
+        self.NULLABLE = analysis.NULLABLE
         self.callbacks = {}
         self.predictions = {}
@@ -39,6 +41,7 @@ class Parser:
             self.callbacks[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
             self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

+        self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks)
         self.term_matcher = term_matcher

@@ -46,19 +49,78 @@
         # Define parser functions
         start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
         match = self.term_matcher
-        held_completions = defaultdict(list)
+
+        # Held Completions (H in E.Scott's paper).
+        held_completions = {}
+
+        # Cache for nodes & tokens created in a particular parse step.
         node_cache = {}
         token_cache = {}
-
-        def make_symbol_node(s, start, end):
-            label = (s, start.i, end.i)
-            if label in node_cache:
-                node = node_cache[label]
+        columns = []
+        transitives = []
+
+        def is_quasi_complete(item):
+            if item.is_complete:
+                return True
+
+            quasi = item.advance()
+            while not quasi.is_complete:
+                symbol = quasi.expect
+                if symbol not in self.NULLABLE:
+                    return False
+                if quasi.rule.origin == start_symbol and symbol == start_symbol:
+                    return False
+                quasi = quasi.advance()
+            return True
+
+        def create_leo_transitives(item, trule, previous, visited = None):
+            if visited is None:
+                visited = set()
+
+            if item.rule.origin in transitives[item.start]:
+                previous = trule = transitives[item.start][item.rule.origin]
+                return trule, previous
+
+            is_empty_rule = not self.FIRST[item.rule.origin]
+            if is_empty_rule:
+                return trule, previous
+
+            originator = None
+            for key in columns[item.start]:
+                if key.expect is not None and key.expect == item.rule.origin:
+                    if originator is not None:
+                        return trule, previous
+                    originator = key
+
+            if originator is None:
+                return trule, previous
+
+            if originator in visited:
+                return trule, previous
+
+            visited.add(originator)
+            if not is_quasi_complete(originator):
+                return trule, previous
+
+            trule = originator.advance()
+            if originator.start != item.start:
+                visited.clear()
+
+            trule, previous = create_leo_transitives(originator, trule, previous, visited)
+            if trule is None:
+                return trule, previous
+
+            titem = None
+            if previous is not None:
+                titem = TransitiveItem(item.rule.origin, trule, originator, previous.column)
+                previous.next_titem = titem
             else:
-                node = node_cache[label] = SymbolNode(s, start, end)
-            return node
+                titem = TransitiveItem(item.rule.origin, trule, originator, item.start)
+
+            previous = transitives[item.start][item.rule.origin] = titem
+            return trule, previous

-        def predict_and_complete(column, to_scan):
+        def predict_and_complete(i, to_scan):
             """The core Earley Predictor and Completer.

             At each stage of the input, we handle any completed items (things
@@ -68,61 +130,90 @@ class Parser:
                which can be added to the scan list for the next scanner cycle."""
             held_completions.clear()
+            column = columns[i]

             # R (items) = Ei (column.items)
-            items = deque(column.items)
+            items = deque(column)
             while items:
                 item = items.pop()    # remove an element, A say, from R

                 ### The Earley completer
                 if item.is_complete:   ### (item.s == string)
                     if item.node is None:
-                        item.node = make_symbol_node(item.s, item.start, column)
+                        label = (item.s, item.start, i)
+                        item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                     item.node.add_family(item.s, item.rule, item.start, None, None)

-                    # Empty has 0 length. If we complete an empty symbol in a particular
-                    # parse step, we need to be able to use that same empty symbol to complete
-                    # any predictions that result, that themselves require empty. Avoids
-                    # infinite recursion on empty symbols.
-                    # held_completions is 'H' in E.Scott's paper.
-                    is_empty_item = item.start.i == column.i
-                    if is_empty_item:
-                        held_completions[item.rule.origin] = item.node
-
-                    originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
-                    for originator in originators:
-                        new_item = originator.advance()
-                        new_item.node = make_symbol_node(new_item.s, originator.start, column)
-                        new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
+                    create_leo_transitives(item, None, None)
+
+                    ###R Joop Leo right recursion Completer
+                    if item.rule.origin in transitives[item.start]:
+                        transitive = transitives[item.start][item.s]
+                        if transitive.previous in transitives[transitive.column]:
+                            root_transitive = transitives[transitive.column][transitive.previous]
+                        else:
+                            root_transitive = transitive
+
+                        label = (root_transitive.s, root_transitive.start, i)
+                        vn = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
+                        vn.add_path(root_transitive, item.node)
+
+                        new_item = Item(transitive.rule, transitive.ptr, transitive.start)
+                        new_item.node = vn
                         if new_item.expect in self.TERMINALS:
                             # Add (B :: aC.B, h, y) to Q
                             to_scan.add(new_item)
-                        elif new_item not in column.items:
+                        elif new_item not in column:
                             # Add (B :: aC.B, h, y) to Ei and R
                             column.add(new_item)
                             items.append(new_item)
+                    ###R Regular Earley completer
+                    else:
+                        # Empty has 0 length. If we complete an empty symbol in a particular
+                        # parse step, we need to be able to use that same empty symbol to complete
+                        # any predictions that result, that themselves require empty. Avoids
+                        # infinite recursion on empty symbols.
+                        # held_completions is 'H' in E.Scott's paper.
+                        is_empty_item = item.start == i
+                        if is_empty_item:
+                            held_completions[item.rule.origin] = item.node
+
+                        originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
+                        for originator in originators:
+                            new_item = originator.advance()
+                            label = (new_item.s, originator.start, i)
+                            new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
+                            new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
+                            if new_item.expect in self.TERMINALS:
+                                # Add (B :: aC.B, h, y) to Q
+                                to_scan.add(new_item)
+                            elif new_item not in column:
+                                # Add (B :: aC.B, h, y) to Ei and R
+                                column.add(new_item)
+                                items.append(new_item)

                 ### The Earley predictor
                 elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                     new_items = []
                     for rule in self.predictions[item.expect]:
-                        new_item = Item(rule, 0, column)
+                        new_item = Item(rule, 0, i)
                         new_items.append(new_item)

                     # Process any held completions (H).
                     if item.expect in held_completions:
                         new_item = item.advance()
-                        new_item.node = make_symbol_node(new_item.s, item.start, column)
+                        label = (new_item.s, item.start, i)
+                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                         new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                         new_items.append(new_item)

                     for new_item in new_items:
                         if new_item.expect in self.TERMINALS:
                             to_scan.add(new_item)
-                        elif new_item not in column.items:
+                        elif new_item not in column:
                             column.add(new_item)
                             items.append(new_item)

-        def scan(i, token, column, to_scan):
+        def scan(i, token, to_scan):
             """The core Earley Scanner.

             This is a custom implementation of the scanner that uses the
@@ -130,12 +221,17 @@
             Earley predictor, based on the previously completed tokens.
             This ensures that at each phase of the parse we have a custom
             lexer context, allowing for more complex ambiguities."""
-            next_set = Column(i+1, self.FIRST)
             next_to_scan = set()
+            next_set = set()
+            columns.append(next_set)
+            next_transitives = dict()
+            transitives.append(next_transitives)
+
             for item in set(to_scan):
                 if match(item.expect, token):
                     new_item = item.advance()
-                    new_item.node = make_symbol_node(new_item.s, new_item.start, column)
+                    label = (new_item.s, new_item.start, i)
+                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                     new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)

                     if new_item.expect in self.TERMINALS:
@@ -149,11 +245,11 @@
                 expect = {i.expect.name for i in to_scan}
                 raise UnexpectedToken(token, expect, considered_rules = set(to_scan))

-            return next_set, next_to_scan
+            return next_to_scan

         # Main loop starts
-        column0 = Column(0, self.FIRST)
-        column = column0
+        columns.append(set())
+        transitives.append(dict())

         ## The scan buffer. 'Q' in E.Scott's paper.
         to_scan = set()
@@ -162,32 +258,34 @@
         # Add predicted items to the first Earley set (for the predictor) if they
         # result in a non-terminal, or the scanner if they result in a terminal.
         for rule in self.predictions[start_symbol]:
-            item = Item(rule, 0, column0)
+            item = Item(rule, 0, 0)
             if item.expect in self.TERMINALS:
                 to_scan.add(item)
             else:
-                column.add(item)
+                columns[0].add(item)

         ## The main Earley loop.
         # Run the Prediction/Completion cycle for any Items in the current Earley set.
         # Completions will be added to the SPPF tree, and predictions will be recursively
         # processed down to terminals/empty nodes to be added to the scanner for the next
         # step.
-        for i, token in enumerate(stream):
-            predict_and_complete(column, to_scan)
+        i = 0
+        for token in stream:
+            predict_and_complete(i, to_scan)

             # Clear the node_cache and token_cache, which are only relevant for each
             # step in the Earley pass.
             node_cache.clear()
             token_cache.clear()
-            column, to_scan = scan(i, token, column, to_scan)
+            to_scan = scan(i, token, to_scan)
+            i += 1

-        predict_and_complete(column, to_scan)
+        predict_and_complete(i, to_scan)

         ## Column is now the final column in the parse. If the parse was successful, the start
         # symbol should have been completed in the last step of the Earley cycle, and will be in
         # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
-        solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]
+        solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]

         if not solutions:
             raise ParseError('Incomplete parse: Could not find a solution to input')
@@ -201,7 +299,7 @@

         # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities
         # according to the rules.
-        return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go()
+        return self.forest_tree_visitor.go(solutions[0])

 class ApplyCallbacks(Transformer_InPlace):
     def __init__(self, postprocess):
diff --git a/lark/parsers/earley_common.py b/lark/parsers/earley_common.py
index d17abe4..6bd614b 100644
--- a/lark/parsers/earley_common.py
+++ b/lark/parsers/earley_common.py
@@ -13,27 +13,12 @@
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com

-## for recursive repr
-from ..tree import Tree
-
-class Derivation(Tree):
-    def __init__(self, rule, children = None):
-        Tree.__init__(self, 'drv', children if children is not None else [])
-        self.meta.rule = rule
-        self._hash = None
-
-    def __repr__(self, indent = 0):
-        return 'Derivation(%s, %s, %s)' % (self.data, self.rule.origin, '...')
-
-    def __hash__(self):
-        if self._hash is None:
-            self._hash = Tree.__hash__(self)
-        return self._hash
+from ..grammar import NonTerminal, Terminal

 class Item(object):
     "An Earley Item, the atom of the algorithm."
-    __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash')
+    __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash')
     def __init__(self, rule, ptr, start):
         self.is_complete = len(rule.expansion) == ptr
         self.rule = rule    # rule
@@ -43,38 +28,48 @@ class Item(object):
         if self.is_complete:
             self.s = rule.origin
             self.expect = None
         else:
             self.s = (rule, ptr)
             self.expect = rule.expansion[ptr]
-        self._hash = hash((self.s, self.start.i))
+        self.previous = rule.expansion[ptr - 1] if ptr > 0 else None
+        self._hash = hash((self.s, self.start))

     def advance(self):
-        return self.__class__(self.rule, self.ptr + 1, self.start)
+        return Item(self.rule, self.ptr + 1, self.start)

     def __eq__(self, other):
-        return self is other or (self.s == other.s and self.start.i == other.start.i)
+        return self is other or (self.s == other.s and self.start == other.start)

     def __hash__(self):
         return self._hash

     def __repr__(self):
-        return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i)
-
-class Column:
-    "An entry in the table, aka Earley Chart. Contains lists of items."
-    def __init__(self, i, FIRST):
-        self.i = i
-        self.items = set()
-        self.FIRST = FIRST
+        before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] )
+        after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] )
+        symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after))
+        return '%s (%d)' % (symbol, self.start)
+
+
+class TransitiveItem(Item):
+    __slots__ = ('recognized', 'reduction', 'column', 'next_titem')
+    def __init__(self, recognized, trule, originator, start):
+        super(TransitiveItem, self).__init__(trule.rule, trule.ptr, trule.start)
+        self.recognized = recognized
+        self.reduction = originator
+        self.column = start
+        self.next_titem = None
+        self._hash = hash((self.s, self.start, self.recognized))

-    def add(self, item):
-        """Sort items into scan/predict/reduce newslists
-
-        Makes sure only unique items are added.
-        """
-        self.items.add(item)
+    def __eq__(self, other):
+        if not isinstance(other, TransitiveItem):
+            return False
+        return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.recognized == other.recognized)

-    def __bool__(self):
-        return bool(self.items)
+    def __hash__(self):
+        return self._hash

-    __nonzero__ = __bool__  # Py2 backwards-compatibility
+    def __repr__(self):
+        before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] )
+        after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] )
+        return '{} : {} -> {}* {} ({}, {})'.format(self.recognized.name, self.rule.origin.name, ' '.join(before), ' '.join(after), self.column, self.start)
diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py
index 0b1650f..dda2dcb 100644
--- a/lark/parsers/earley_forest.py
+++ b/lark/parsers/earley_forest.py
@@ -7,14 +7,15 @@
 Full reference and more details are here:
 http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
 """
+from random import randint
 from ..tree import Tree
 from ..exceptions import ParseError
 from ..lexer import Token
 from ..utils import Str
-from ..grammar import NonTerminal, Terminal
-from .earley_common import Column, Derivation
+from ..grammar import NonTerminal, Terminal, Symbol
 from collections import deque
+from importlib import import_module

 class ForestNode(object):
     pass
@@ -33,36 +34,65 @@ class SymbolNode(ForestNode):
     Hence a Symbol Node with a single child is unambiguous.
     """
-    __slots__ = ('s', 'start', 'end', 'children', 'priority', 'is_intermediate')
+    __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash')
     def __init__(self, s, start, end):
         self.s = s
         self.start = start
         self.end = end
-        self.children = set()
+        self._children = set()
+        self.paths = set()
+        self.paths_loaded = False
         self.priority = None
         self.is_intermediate = isinstance(s, tuple)
+        self._hash = hash((self.s, self.start, self.end))

     def add_family(self, lr0, rule, start, left, right):
-        self.children.add(PackedNode(self, lr0, rule, start, left, right))
+        self._children.add(PackedNode(self, lr0, rule, start, left, right))
+
+    def add_path(self, transitive, node):
+        self.paths.add((transitive, node))
+
+    def load_paths(self):
+        for transitive, node in self.paths:
+            if transitive.next_titem is not None:
+                vn = SymbolNode(transitive.next_titem.s, transitive.next_titem.start, self.end)
+                vn.add_path(transitive.next_titem, node)
+                self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn)
+            else:
+                self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node)
+        self.paths_loaded = True

     @property
     def is_ambiguous(self):
         return len(self.children) > 1

+    @property
+    def children(self):
+        if not self.paths_loaded:
+            self.load_paths()
+        return self._children
+
     def __iter__(self):
-        return iter(self.children)
+        return iter(self._children)

     def __eq__(self, other):
         if not isinstance(other, SymbolNode):
             return False
-        return self is other or (self.s == other.s and self.start == other.start and self.end is other.end)
+        return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.end == other.end)

     def __hash__(self):
-        return hash((self.s, self.start.i, self.end.i))
+        return self._hash

     def __repr__(self):
-        symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name
-        return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0)
+        if self.is_intermediate:
+            rule = self.s[0]
+            ptr = self.s[1]
+            before = ( expansion.name for expansion in rule.expansion[:ptr] )
+            after = ( expansion.name for expansion in rule.expansion[ptr:] )
+            symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
+        else:
+            symbol = self.s.name
+        return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority if self.priority is not None else 0)

 class PackedNode(ForestNode):
     """
@@ -77,7 +107,7 @@
         self.left = left
         self.right = right
         self.priority = None
-        self._hash = hash((self.s, self.start.i, self.left, self.right))
+        self._hash = hash((self.s, self.start, self.left, self.right))

     @property
     def is_empty(self):
@@ -105,8 +135,15 @@
         return self._hash

     def __repr__(self):
-        symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name
-        return "{%s, %d, %s, %s, %s}" % (symbol, self.start.i, self.left, self.right, self.priority if self.priority is not None else 0)
+        if isinstance(self.s, tuple):
+            rule = self.s[0]
+            ptr = self.s[1]
+            before = ( expansion.name for expansion in rule.expansion[:ptr] )
+            after = ( expansion.name for expansion in rule.expansion[ptr:] )
+            symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
+        else:
+            symbol = self.s.name
+        return "({}, {}, {})".format(symbol, self.start, self.priority)

 class ForestVisitor(object):
     """
@@ -114,9 +151,7 @@

     Use this as a base when you need to walk the forest.
     """
-    def __init__(self, root):
-        self.root = root
-        self.result = None
+    __slots__ = ['result']

     def visit_token_node(self, node): pass
     def visit_symbol_node_in(self, node): pass
     def visit_symbol_node_out(self, node): pass
     def visit_packed_node_in(self, node): pass
     def visit_packed_node_out(self, node): pass

-    def go(self):
+    def go(self, root):
+        self.result = None
         # Visiting is a list of IDs of all symbol/intermediate nodes currently in
         # the stack. It serves two purposes: to detect when we 'recurse' in and out
         # of a symbol/intermediate so that we can process both up and down. Also,
@@ -134,7 +170,7 @@

         # We do not use recursion here to walk the Forest due to the limited
         # stack size in python. Therefore input_stack is essentially our stack.
-        input_stack = deque([self.root])
+        input_stack = deque([root])

         # It is much faster to cache these as locals since they are called
         # many times in large parses.
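
For orientation, here is a minimal sketch of a custom visitor written against the reworked API above. The `AmbiguityCounter` class and its usage are illustrative, not part of this patch; `ForestVisitor`, the `visit_*` hooks, `go(root)`, and the convention of returning an iterator of children from the `*_in` hooks (as `ForestToPyDotVisitor` does below) are what the patch provides:

```python
from lark.parsers.earley_forest import ForestVisitor

class AmbiguityCounter(ForestVisitor):
    """Hypothetical visitor: count ambiguous symbol nodes in an SPPF.

    The subclass has no __slots__, so instance attributes like
    `count` are allowed even though the base class declares slots.
    """

    def go(self, root):
        self.count = 0
        super(AmbiguityCounter, self).go(root)
        return self.count

    def visit_symbol_node_in(self, node):
        # A symbol node with more than one packed child is ambiguous.
        if node.is_ambiguous:
            self.count += 1
        # Returning an iterator tells go() which children to push onto
        # its explicit (non-recursive) traversal stack.
        return iter(node.children)

    def visit_packed_node_in(self, node):
        # left/right may be None for empty derivations; skip those.
        return iter([n for n in (node.left, node.right) if n is not None])

# Usage, with sppf_root being the SymbolNode for an ambiguous parse:
#     n_ambiguous = AmbiguityCounter().go(sppf_root)
```

Since the SPPF is a DAG, shared nodes may be reached more than once, so this gives a rough count; a precise version would de-duplicate on `id(node)`.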
@@ -170,8 +206,8 @@
             current_id = id(current)
             if current_id in visiting:
-                if isinstance(current, PackedNode): vpno(current)
-                else: vsno(current)
+                if isinstance(current, PackedNode): vpno(current)
+                else: vsno(current)
                 input_stack.pop()
                 visiting.remove(current_id)
                 continue
@@ -214,7 +250,7 @@

     def visit_symbol_node_out(self, node):
         node.priority = max(child.priority for child in node.children)
-        node.children = sorted(node.children, reverse = True)
+        node._children = sorted(node.children, reverse = True)

 class ForestAntiscoreSumVisitor(ForestSumVisitor):
     """
@@ -228,7 +264,7 @@
     """
     def visit_symbol_node_out(self, node):
         node.priority = min(child.priority for child in node.children)
-        node.children = sorted(node.children, key=AntiscoreSumComparator, reverse = True)
+        node._children = sorted(node.children, key=AntiscoreSumComparator, reverse = True)

 class AntiscoreSumComparator(object):
     """
@@ -263,19 +299,21 @@
     implementation should be another ForestVisitor which sorts the children
     according to some priority mechanism.
     """
-    def __init__(self, root, forest_sum_visitor = ForestSumVisitor, callbacks = None):
-        super(ForestToTreeVisitor, self).__init__(root)
-        self.forest_sum_visitor = forest_sum_visitor
-        self.output_stack = deque()
+    __slots__ = ['forest_sum_visitor', 'output_stack', 'callbacks']
+    def __init__(self, forest_sum_visitor = ForestSumVisitor, callbacks = None):
+        self.forest_sum_visitor = forest_sum_visitor()
         self.callbacks = callbacks
-        self.result = None
+
+    def go(self, root):
+        self.output_stack = deque()
+        return super(ForestToTreeVisitor, self).go(root)

     def visit_token_node(self, node):
         self.output_stack[-1].append(node)

     def visit_symbol_node_in(self, node):
         if node.is_ambiguous and node.priority is None:
-            self.forest_sum_visitor(node).go()
+            self.forest_sum_visitor.go(node)
         return next(iter(node.children))

     def visit_packed_node_in(self, node):
@@ -311,11 +349,13 @@
     This is mainly used by the test framework, to make it simpler to
     write tests ensuring the SPPF contains the right results.
     """
-    def __init__(self, root, callbacks):
-        super(ForestToAmbiguousTreeVisitor, self).__init__(root)
-        self.output_stack = deque()
+    __slots__ = ['output_stack', 'callbacks']
+    def __init__(self, callbacks):
         self.callbacks = callbacks
-        self.result = None
+
+    def go(self, root):
+        self.output_stack = deque([])
+        return super(ForestToAmbiguousTreeVisitor, self).go(root)

     def visit_token_node(self, node):
         self.output_stack[-1].children.append(node)
@@ -326,7 +366,7 @@
         return iter(node.children)

     def visit_symbol_node_out(self, node):
-        if node.is_ambiguous:
+        if not node.is_intermediate and node.is_ambiguous:
             result = self.output_stack.pop()
             if self.output_stack:
                 self.output_stack[-1].children.append(result)
@@ -347,4 +387,78 @@
             if self.output_stack:
                 self.output_stack[-1].children.append(result)
             else:
-                self.result = result
\ No newline at end of file
+                self.result = result
+
+class ForestToPyDotVisitor(ForestVisitor):
+    """
+    A Forest visitor which writes the SPPF to a PNG.
+
+    The SPPF can get really large, really quickly because
+    of the amount of meta-data it stores, so this is probably
+    only useful for trivial trees and learning how the SPPF
+    is structured.
+ """ + def __init__(self, rankdir="TB"): + self.pydot = import_module('pydot') + self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir) + + def go(self, root, filename): + super(ForestToPyDotVisitor, self).go(root) + self.graph.write_png(filename) + + def visit_token_node(self, node): + graph_node_id = str(id(node)) + graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"')) + graph_node_color = 0x808080 + graph_node_style = "\"filled,rounded\"" + graph_node_shape = "diamond" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + + def visit_packed_node_in(self, node): + graph_node_id = str(id(node)) + graph_node_label = repr(node) + graph_node_color = 0x808080 + graph_node_style = "filled" + graph_node_shape = "diamond" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + return iter([node.left, node.right]) + + def visit_packed_node_out(self, node): + graph_node_id = str(id(node)) + graph_node = self.graph.get_node(graph_node_id)[0] + for child in [node.left, node.right]: + if child is not None: + child_graph_node_id = str(id(child)) + child_graph_node = self.graph.get_node(child_graph_node_id)[0] + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) + else: + #### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay. + child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890)) + child_graph_node_style = "invis" + child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None") + child_edge_style = "invis" + self.graph.add_node(child_graph_node) + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style)) + + def visit_symbol_node_in(self, node): + graph_node_id = str(id(node)) + graph_node_label = repr(node) + graph_node_color = 0x808080 + graph_node_style = "\"filled\"" + if node.is_intermediate: + graph_node_shape = "ellipse" + else: + graph_node_shape = "rectangle" + graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label) + self.graph.add_node(graph_node) + return iter(node.children) + + def visit_symbol_node_out(self, node): + graph_node_id = str(id(node)) + graph_node = self.graph.get_node(graph_node_id)[0] + for child in node.children: + child_graph_node_id = str(id(child)) + child_graph_node = self.graph.get_node(child_graph_node_id)[0] + self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node)) diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index b42a305..1d801e7 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -22,7 +22,8 @@ from ..exceptions import ParseError, UnexpectedCharacters from ..lexer import Token from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal, Terminal -from .earley_common import Column, Item +from .earley import ApplyCallbacks +from .earley_common import Item, TransitiveItem from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor @@ -31,11 +32,11 @@ class Parser: analysis = GrammarAnalyzer(parser_conf) self.parser_conf = parser_conf self.resolve_ambiguity = 
-        self.forest_sum_visitor = forest_sum_visitor
         self.ignore = [Terminal(t) for t in ignore]
         self.complete_lex = complete_lex

         self.FIRST = analysis.FIRST
+        self.NULLABLE = analysis.NULLABLE
         self.callbacks = {}
         self.predictions = {}
@@ -43,10 +44,12 @@
         # the slow 'isupper' in is_terminal.
         self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
         self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }
+
         for rule in parser_conf.rules:
             self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None)
             self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

+        self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks)
         self.term_matcher = term_matcher

     def parse(self, stream, start_symbol=None):
@@ -60,19 +63,74 @@
         # Cache for nodes & tokens created in a particular parse step.
         node_cache = {}
         token_cache = {}
+        columns = []
+        transitives = []

         text_line = 1
         text_column = 1

-        def make_symbol_node(s, start, end):
-            label = (s, start.i, end.i)
-            if label in node_cache:
-                node = node_cache[label]
+        def is_quasi_complete(item):
+            if item.is_complete:
+                return True
+
+            quasi = item.advance()
+            while not quasi.is_complete:
+                symbol = quasi.expect
+                if symbol not in self.NULLABLE:
+                    return False
+                if quasi.rule.origin == start_symbol and symbol == start_symbol:
+                    return False
+                quasi = quasi.advance()
+            return True
+
+        def create_leo_transitives(item, trule, previous, visited = None):
+            if visited is None:
+                visited = set()
+
+            if item.rule.origin in transitives[item.start]:
+                previous = trule = transitives[item.start][item.rule.origin]
+                return trule, previous
+
+            is_empty_rule = not self.FIRST[item.rule.origin]
+            if is_empty_rule:
+                return trule, previous
+
+            originator = None
+            for key in columns[item.start]:
+                if key.expect is not None and key.expect == item.rule.origin:
+                    if originator is not None:
+                        return trule, previous
+                    originator = key
+
+            if originator is None:
+                return trule, previous
+
+            if originator in visited:
+                return trule, previous
+
+            visited.add(originator)
+            if not is_quasi_complete(originator):
+                return trule, previous
+
+            trule = originator.advance()
+            if originator.start != item.start:
+                visited.clear()
+
+            trule, previous = create_leo_transitives(originator, trule, previous, visited)
+            if trule is None:
+                return trule, previous
+
+            titem = None
+            if previous is not None:
+                titem = TransitiveItem(item.rule.origin, trule, originator, previous.column)
+                previous.next_titem = titem
             else:
-                node = node_cache[label] = SymbolNode(s, start, end)
-            return node
+                titem = TransitiveItem(item.rule.origin, trule, originator, item.start)
+
+            previous = transitives[item.start][item.rule.origin] = titem
+            return trule, previous

-        def predict_and_complete(column, to_scan):
+        def predict_and_complete(i, to_scan):
             """The core Earley Predictor and Completer.

             At each stage of the input, we handle any completed items (things
@@ -82,61 +140,90 @@ class Parser:
                which can be added to the scan list for the next scanner cycle."""
             held_completions.clear()
+            column = columns[i]

             # R (items) = Ei (column.items)
-            items = deque(column.items)
+            items = deque(column)
             while items:
                 item = items.pop()    # remove an element, A say, from R

                 ### The Earley completer
                 if item.is_complete:   ### (item.s == string)
                     if item.node is None:
-                        item.node = make_symbol_node(item.s, item.start, column)
+                        label = (item.s, item.start, i)
+                        item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                     item.node.add_family(item.s, item.rule, item.start, None, None)

-                    # Empty has 0 length. If we complete an empty symbol in a particular
-                    # parse step, we need to be able to use that same empty symbol to complete
-                    # any predictions that result, that themselves require empty. Avoids
-                    # infinite recursion on empty symbols.
-                    # held_completions is 'H' in E.Scott's paper.
-                    is_empty_item = item.start.i == column.i
-                    if is_empty_item:
-                        held_completions[item.rule.origin] = item.node
-
-                    originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
-                    for originator in originators:
-                        new_item = originator.advance()
-                        new_item.node = make_symbol_node(new_item.s, originator.start, column)
-                        new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
+                    create_leo_transitives(item, None, None)
+
+                    ###R Joop Leo right recursion Completer
+                    if item.rule.origin in transitives[item.start]:
+                        transitive = transitives[item.start][item.s]
+                        if transitive.previous in transitives[transitive.column]:
+                            root_transitive = transitives[transitive.column][transitive.previous]
+                        else:
+                            root_transitive = transitive
+
+                        label = (root_transitive.s, root_transitive.start, i)
+                        vn = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
+                        vn.add_path(root_transitive, item.node)
+
+                        new_item = Item(transitive.rule, transitive.ptr, transitive.start)
+                        new_item.node = vn
                         if new_item.expect in self.TERMINALS:
                             # Add (B :: aC.B, h, y) to Q
                             to_scan.add(new_item)
-                        elif new_item not in column.items:
+                        elif new_item not in column:
                             # Add (B :: aC.B, h, y) to Ei and R
                             column.add(new_item)
                             items.append(new_item)
+                    ###R Regular Earley completer
+                    else:
+                        # Empty has 0 length. If we complete an empty symbol in a particular
+                        # parse step, we need to be able to use that same empty symbol to complete
+                        # any predictions that result, that themselves require empty. Avoids
+                        # infinite recursion on empty symbols.
+                        # held_completions is 'H' in E.Scott's paper.
+                        is_empty_item = item.start == i
+                        if is_empty_item:
+                            held_completions[item.rule.origin] = item.node
+
+                        originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
+                        for originator in originators:
+                            new_item = originator.advance()
+                            label = (new_item.s, originator.start, i)
+                            new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
+                            new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
+                            if new_item.expect in self.TERMINALS:
+                                # Add (B :: aC.B, h, y) to Q
+                                to_scan.add(new_item)
+                            elif new_item not in column:
+                                # Add (B :: aC.B, h, y) to Ei and R
+                                column.add(new_item)
+                                items.append(new_item)

                 ### The Earley predictor
                 elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                     new_items = []
                     for rule in self.predictions[item.expect]:
-                        new_item = Item(rule, 0, column)
+                        new_item = Item(rule, 0, i)
                         new_items.append(new_item)

                     # Process any held completions (H).
                     if item.expect in held_completions:
                         new_item = item.advance()
-                        new_item.node = make_symbol_node(new_item.s, item.start, column)
+                        label = (new_item.s, item.start, i)
+                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                         new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                         new_items.append(new_item)

                     for new_item in new_items:
                         if new_item.expect in self.TERMINALS:
                             to_scan.add(new_item)
-                        elif new_item not in column.items:
+                        elif new_item not in column:
                             column.add(new_item)
                             items.append(new_item)

-        def scan(i, column, to_scan):
+        def scan(i, to_scan):
             """The core Earley Scanner.

             This is a custom implementation of the scanner that uses the
@@ -155,7 +242,7 @@
                 m = match(item.expect, stream, i)
                 if m:
                     t = Token(item.expect.name, m.group(0), i, text_line, text_column)
-                    delayed_matches[m.end()].append( (item, column, t) )
+                    delayed_matches[m.end()].append( (item, i, t) )

                     if self.complete_lex:
                         s = m.group(0)
@@ -163,7 +250,7 @@
                             m = match(item.expect, s[:-j])
                             if m:
                                 t = Token(item.expect.name, m.group(0), i, text_line, text_column)
-                                delayed_matches[i+m.end()].append( (item, column, t) )
+                                delayed_matches[i+m.end()].append( (item, i, t) )

             # Remove any items that successfully matched in this pass from the to_scan buffer.
             # This ensures we don't carry over tokens that already matched, if we're ignoring below.
@@ -177,13 +264,16 @@
                 m = match(x, stream, i)
                 if m:
                     # Carry over any items still in the scan buffer, to past the end of the ignored items.
-                    delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ])
+                    delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ])

                     # If we're ignoring up to the end of the file,
                     # carry over the start symbol if it already completed.
-                    delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol])
+                    delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])

-            next_set = Column(i + 1, self.FIRST)   # Ei+1
             next_to_scan = set()
+            next_set = set()
+            columns.append(next_set)
+            next_transitives = dict()
+            transitives.append(next_transitives)

             ## 4) Process Tokens from delayed_matches.
             # This is the core of the Earley scanner. Create an SPPF node for each Token,
@@ -193,7 +283,8 @@
             for item, start, token in delayed_matches[i+1]:
                 if token is not None:
                     new_item = item.advance()
-                    new_item.node = make_symbol_node(new_item.s, new_item.start, column)
+                    label = (new_item.s, new_item.start, i)
+                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                     new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)
                 else:
                     new_item = item
@@ -210,11 +301,11 @@
             if not next_set and not delayed_matches and not next_to_scan:
                 raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan))

-            return next_set, next_to_scan
+            return next_to_scan

         # Main loop starts
-        column0 = Column(0, self.FIRST)
-        column = column0
+        columns.append(set())
+        transitives.append(dict())

         ## The scan buffer. 'Q' in E.Scott's paper.
         to_scan = set()
@@ -223,38 +314,41 @@
         # Add predicted items to the first Earley set (for the predictor) if they
         # result in a non-terminal, or the scanner if they result in a terminal.
         for rule in self.predictions[start_symbol]:
-            item = Item(rule, 0, column0)
+            item = Item(rule, 0, 0)
             if item.expect in self.TERMINALS:
                 to_scan.add(item)
             else:
-                column.add(item)
+                columns[0].add(item)

         ## The main Earley loop.
         # Run the Prediction/Completion cycle for any Items in the current Earley set.
         # Completions will be added to the SPPF tree, and predictions will be recursively
         # processed down to terminals/empty nodes to be added to the scanner for the next
         # step.
-        for i, token in enumerate(stream):
-            predict_and_complete(column, to_scan)
+        i = 0
+        for token in stream:
+            predict_and_complete(i, to_scan)

             # Clear the node_cache and token_cache, which are only relevant for each
             # step in the Earley pass.
             node_cache.clear()
             token_cache.clear()
-            column, to_scan = scan(i, column, to_scan)
+            to_scan = scan(i, to_scan)

             if token == '\n':
                 text_line += 1
                 text_column = 1
             else:
                 text_column += 1
+            i += 1

-        predict_and_complete(column, to_scan)
+        predict_and_complete(i, to_scan)

         ## Column is now the final column in the parse. If the parse was successful, the start
         # symbol should have been completed in the last step of the Earley cycle, and will be in
         # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
-        solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]
+        solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]

         if not solutions:
             expected_tokens = [t.expect for t in to_scan]
@@ -265,9 +359,8 @@

         ## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller.
         # This means the caller can work directly with the SPPF tree.
         if not self.resolve_ambiguity:
-            return ForestToAmbiguousTreeVisitor(solutions[0], self.callbacks).go()
+            return ForestToAmbiguousTreeVisitor(self.callbacks).go(solutions[0])

         # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities
         # according to the rules.
-        return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go()
-
+        return self.forest_tree_visitor.go(solutions[0])
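
To close, a quick end-to-end way to exercise the Leo fast path that this patch introduces. This is a sketch: the grammar and input size are illustrative, not part of the patch, and exact timings are machine-dependent.

```python
from lark import Lark

# Right recursion: without transitive (Leo) items, each completion
# re-walks the whole chain of pending items, making the completer's
# work quadratic in input length; with them each completion becomes
# a single transitive-item lookup.
parser = Lark('start: "a" start | "b"', parser='earley')

tree = parser.parse('a' * 2000 + 'b')
print(tree.data)  # -> start
```

Running the same call on a branch without this change should take visibly longer as the input grows, which makes it a convenient smoke test for the transitive-item machinery in `predict_and_complete` and `create_leo_transitives`.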