Added Chris' changes, Dec 2018

Merge remote-tracking branch 'origin/0.7b' into 0.7b
6 years ago · c968e212ff
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -16,17 +16,19 @@ from ..visitors import Transformer_InPlace, v_args
 from ..exceptions import ParseError, UnexpectedToken
 from .grammar_analysis import GrammarAnalyzer
 from ..grammar import NonTerminal
 from .earley_common import Column, Item
 from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor
 from .earley_common import Item, TransitiveItem
 from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode

 from collections import deque, defaultdict

 class Parser:
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor):
        analysis = GrammarAnalyzer(parser_conf)
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.forest_sum_visitor = forest_sum_visitor

        self.FIRST = analysis.FIRST
        self.NULLABLE = analysis.NULLABLE
        self.callbacks = {}
        self.predictions = {}

@@ -39,6 +41,7 @@ class Parser:
            self.callbacks[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
            self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

        self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks)
        self.term_matcher = term_matcher


@@ -46,19 +49,78 @@ class Parser:
        # Define parser functions
        start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
        match = self.term_matcher
        held_completions = defaultdict(list)

        # Held Completions (H in E. Scott's paper).
        held_completions = {}

        # Cache for nodes & tokens created in a particular parse step.
        node_cache = {}
        token_cache = {}

        def make_symbol_node(s, start, end):
            label = (s, start.i, end.i)
            if label in node_cache:
                node = node_cache[label]
        columns = []
        transitives = []

        def is_quasi_complete(item):
            """Tell whether ``item`` is complete, or completes over nullable symbols only.

            A complete item is trivially quasi-complete. Otherwise the item is
            advanced once (stepping over the symbol it currently expects) and
            then repeatedly advanced for as long as every remaining expected
            symbol is in ``self.NULLABLE``.

            Returns False as soon as a non-nullable symbol is expected, or when
            a rule whose origin is the start symbol expects the start symbol
            itself.
            """
            if item.is_complete:
                return True

            candidate = item.advance()
            while True:
                if candidate.is_complete:
                    # Ran off the end of the rule using nullable symbols only.
                    return True
                expected = candidate.expect
                if expected not in self.NULLABLE:
                    return False
                # A start rule that still expects the start symbol is rejected.
                if candidate.rule.origin == start_symbol and expected == start_symbol:
                    return False
                candidate = candidate.advance()

        def create_leo_transitives(item, trule, previous, visited = None):
            if visited is None:
                visited = set()

            if item.rule.origin in transitives[item.start]:
                previous = trule = transitives[item.start][item.rule.origin]
                return trule, previous

            is_empty_rule = not self.FIRST[item.rule.origin]
            if is_empty_rule:
                return trule, previous

            originator = None
            for key in columns[item.start]:
                if key.expect is not None and key.expect == item.rule.origin:
                    if originator is not None:
                        return trule, previous
                    originator = key

            if originator is None:
                return trule, previous

            if originator in visited:
                return trule, previous

            visited.add(originator)
            if not is_quasi_complete(originator):
                return trule, previous

            trule = originator.advance()
            if originator.start != item.start:
                visited.clear()

            trule, previous = create_leo_transitives(originator, trule, previous, visited)
            if trule is None:
                return trule, previous

            titem = None
            if previous is not None:
                titem = TransitiveItem(item.rule.origin, trule, originator, previous.column)
                previous.next_titem = titem
            else:
                node = node_cache[label] = SymbolNode(s, start, end)
            return node
                titem = TransitiveItem(item.rule.origin, trule, originator, item.start)

            previous = transitives[item.start][item.rule.origin] = titem
            return trule, previous

        def predict_and_complete(column, to_scan):
        def predict_and_complete(i, to_scan):
            """The core Earley Predictor and Completer.

            At each stage of the input, we handle any completed items (things
@@ -68,61 +130,90 @@ class Parser:
            which can be added to the scan list for the next scanner cycle."""
            held_completions.clear()

            column = columns[i]
            # R (items) = Ei (column.items)
            items = deque(column.items)
            items = deque(column)
            while items:
                item = items.pop()    # remove an element, A say, from R

                ### The Earley completer
                if item.is_complete:   ### (item.s == string)
                    if item.node is None:
                        item.node = make_symbol_node(item.s, item.start, column)
                        label = (item.s, item.start, i)
                        item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                        item.node.add_family(item.s, item.rule, item.start, None, None)

                    # Empty has 0 length. If we complete an empty symbol in a particular
                    # parse step, we need to be able to use that same empty symbol to complete
                    # any predictions that result, that themselves require empty. Avoids
                    # infinite recursion on empty symbols.
                    # held_completions is 'H' in E.Scott's paper.
                    is_empty_item = item.start.i == column.i
                    if is_empty_item:
                        held_completions[item.rule.origin] = item.node

                    originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
                    for originator in originators:
                        new_item = originator.advance()
                        new_item.node = make_symbol_node(new_item.s, originator.start, column)
                        new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
                    create_leo_transitives(item, None, None)

                    ###R Joop Leo right recursion Completer
                    if item.rule.origin in transitives[item.start]:
                        transitive = transitives[item.start][item.s]
                        if transitive.previous in transitives[transitive.column]:
                            root_transitive = transitives[transitive.column][transitive.previous]
                        else:
                            root_transitive = transitive

                        label = (root_transitive.s, root_transitive.start, i)
                        node = vn = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                        vn.add_path(root_transitive, item.node)

                        new_item = Item(transitive.rule, transitive.ptr, transitive.start)
                        new_item.node = vn
                        if new_item.expect in self.TERMINALS:
                            # Add (B :: aC.B, h, y) to Q
                            to_scan.add(new_item)
                        elif new_item not in column.items:
                        elif new_item not in column:
                            # Add (B :: aC.B, h, y) to Ei and R
                            column.add(new_item)
                            items.append(new_item)
                    ###R Regular Earley completer
                    else:
                        # Empty has 0 length. If we complete an empty symbol in a particular
                        # parse step, we need to be able to use that same empty symbol to complete
                        # any predictions that result, that themselves require empty. Avoids
                        # infinite recursion on empty symbols.
                        # held_completions is 'H' in E.Scott's paper.
                        is_empty_item = item.start == i
                        if is_empty_item:
                            held_completions[item.rule.origin] = item.node

                        originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
                        for originator in originators:
                            new_item = originator.advance()
                            label = (new_item.s, originator.start, i)
                            new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                            new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
                            if new_item.expect in self.TERMINALS:
                                # Add (B :: aC.B, h, y) to Q
                                to_scan.add(new_item)
                            elif new_item not in column:
                                # Add (B :: aC.B, h, y) to Ei and R
                                column.add(new_item)
                                items.append(new_item)

                ### The Earley predictor
                elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                    new_items = []
                    for rule in self.predictions[item.expect]:
                        new_item = Item(rule, 0, column)
                        new_item = Item(rule, 0, i)
                        new_items.append(new_item)

                    # Process any held completions (H).
                    if item.expect in held_completions:
                        new_item = item.advance()
                        new_item.node = make_symbol_node(new_item.s, item.start, column)
                        label = (new_item.s, item.start, i)
                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                        new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                        new_items.append(new_item)

                    for new_item in new_items:
                        if new_item.expect in self.TERMINALS:
                            to_scan.add(new_item)
                        elif new_item not in column.items:
                        elif new_item not in column:
                            column.add(new_item)
                            items.append(new_item)

        def scan(i, token, column, to_scan):
        def scan(i, token, to_scan):
            """The core Earley Scanner.

            This is a custom implementation of the scanner that uses the
@@ -130,12 +221,17 @@ class Parser:
            Earley predictor, based on the previously completed tokens.
            This ensures that at each phase of the parse we have a custom
            lexer context, allowing for more complex ambiguities."""
            next_set = Column(i+1, self.FIRST)
            next_to_scan = set()
            next_set = set()
            columns.append(next_set)
            next_transitives = dict()
            transitives.append(next_transitives)

            for item in set(to_scan):
                if match(item.expect, token):
                    new_item = item.advance()
                    new_item.node = make_symbol_node(new_item.s, new_item.start, column)
                    label = (new_item.s, new_item.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)

                    if new_item.expect in self.TERMINALS:
@@ -149,11 +245,11 @@ class Parser:
                expect = {i.expect.name for i in to_scan}
                raise UnexpectedToken(token, expect, considered_rules = set(to_scan))

            return next_set, next_to_scan
            return next_to_scan

        # Main loop starts
        column0 = Column(0, self.FIRST)
        column = column0
        columns.append(set())
        transitives.append(dict())

        ## The scan buffer. 'Q' in E.Scott's paper.
        to_scan = set()
@@ -162,32 +258,34 @@ class Parser:
        # Add predicted items to the first Earley set (for the predictor) if they
        # result in a non-terminal, or the scanner if they result in a terminal.
        for rule in self.predictions[start_symbol]:
            item = Item(rule, 0, column0)
            item = Item(rule, 0, 0)
            if item.expect in self.TERMINALS:
                to_scan.add(item)
            else:
                column.add(item)
                columns[0].add(item)

        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
        for i, token in enumerate(stream):
            predict_and_complete(column, to_scan)
        i = 0
        for token in stream:
            predict_and_complete(i, to_scan)

            # Clear the node_cache and token_cache, which are only relevant for each
            # step in the Earley pass.
            node_cache.clear()
            token_cache.clear()
            column, to_scan = scan(i, token, column, to_scan)
            to_scan = scan(i, token, to_scan)
            i += 1

        predict_and_complete(column, to_scan)
        predict_and_complete(i, to_scan)

        ## Column is now the final column in the parse. If the parse was successful, the start
        # symbol should have been completed in the last step of the Earley cycle, and will be in
        # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
        solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]
        solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]

        if not solutions:
            raise ParseError('Incomplete parse: Could not find a solution to input')
@@ -201,7 +299,7 @@ class Parser:

        # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities
        # according to the rules.
        return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go()
        return self.forest_tree_visitor.go(solutions[0])

 class ApplyCallbacks(Transformer_InPlace):
    def __init__(self, postprocess):
--- a/lark/parsers/earley_common.py
+++ b/lark/parsers/earley_common.py
@@ -13,27 +13,12 @@
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com

 ## for recursive repr
 from ..tree import Tree

 class Derivation(Tree):
    def __init__(self, rule, children = None):
        Tree.__init__(self, 'drv', children if children is not None else [])
        self.meta.rule = rule
        self._hash = None

    def __repr__(self, indent = 0):
        return 'Derivation(%s, %s, %s)' % (self.data, self.rule.origin, '...')

    def __hash__(self):
        if self._hash is None:
            self._hash = Tree.__hash__(self)
        return self._hash
 from ..grammar import NonTerminal, Terminal

 class Item(object):
    "An Earley Item, the atom of the algorithm."

    __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash')
    __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash')
    def __init__(self, rule, ptr, start):
        self.is_complete = len(rule.expansion) == ptr
        self.rule = rule    # rule
@@ -43,38 +28,48 @@ class Item(object):
        if self.is_complete:
            self.s = rule.origin
            self.expect = None
            self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
        else:
            self.s = (rule, ptr)
            self.expect = rule.expansion[ptr]
        self._hash = hash((self.s, self.start.i))
            self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
        self._hash = hash((self.s, self.start))

    def advance(self):
        return self.__class__(self.rule, self.ptr + 1, self.start)
        return Item(self.rule, self.ptr + 1, self.start)

    def __eq__(self, other):
        return self is other or (self.s == other.s and self.start.i == other.start.i)
        return self is other or (self.s == other.s and self.start == other.start)

    def __hash__(self):
        return self._hash

    def __repr__(self):
        return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i)

 class Column:
    "An entry in the table, aka Earley Chart. Contains lists of items."
    def __init__(self, i, FIRST):
        self.i = i
        self.items = set()
        self.FIRST = FIRST
        before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] )
        after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] )
        symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after))
        return '%s (%d)' % (symbol, self.start)


 class TransitiveItem(Item):
    __slots__ = ('recognized', 'reduction', 'column', 'next_titem')
    def __init__(self, recognized, trule, originator, start):
        super(TransitiveItem, self).__init__(trule.rule, trule.ptr, trule.start)
        self.recognized = recognized
        self.reduction = originator
        self.column = start
        self.next_titem = None
        self._hash = hash((self.s, self.start, self.recognized))

    def add(self, item):
        """Sort items into scan/predict/reduce newslists

        Makes sure only unique items are added.
        """
        self.items.add(item)
    def __eq__(self, other):
        if not isinstance(other, TransitiveItem):
            return False
        return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.recognized == other.recognized)

    def __bool__(self):
        return bool(self.items)
    def __hash__(self):
        return self._hash

    __nonzero__ = __bool__  # Py2 backwards-compatibility
    def __repr__(self):
        before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] )
        after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] )
        return '{} : {} -> {}* {} ({}, {})'.format(self.recognized.name, self.rule.origin.name, ' '.join(before), ' '.join(after), self.column, self.start)
--- a/lark/parsers/earley_forest.py
+++ b/lark/parsers/earley_forest.py
@@ -7,14 +7,15 @@ Full reference and more details is here:
 http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
 """

 from random import randint
 from ..tree import Tree
 from ..exceptions import ParseError
 from ..lexer import Token
 from ..utils import Str
 from ..grammar import NonTerminal, Terminal
 from .earley_common import Column, Derivation
 from ..grammar import NonTerminal, Terminal, Symbol

 from collections import deque
 from importlib import import_module

 class ForestNode(object):
    pass
@@ -33,36 +34,65 @@ class SymbolNode(ForestNode):

    Hence a Symbol Node with a single child is unambiguous.
    """
    __slots__ = ('s', 'start', 'end', 'children', 'priority', 'is_intermediate')
    __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash')
    def __init__(self, s, start, end):
        self.s = s
        self.start = start
        self.end = end
        self.children = set()
        self._children = set()
        self.paths = set()
        self.paths_loaded = False
        self.priority = None
        self.is_intermediate = isinstance(s, tuple)
        self._hash = hash((self.s, self.start, self.end))

    def add_family(self, lr0, rule, start, left, right):
        self.children.add(PackedNode(self, lr0, rule, start, left, right))
        self._children.add(PackedNode(self, lr0, rule, start, left, right))

    def add_path(self, transitive, node):
        self.paths.add((transitive, node))

    def load_paths(self):
        """Turn every recorded (transitive, node) path into a packed family.

        For each pair in ``self.paths`` a family is added from the
        transitive item's ``reduction``. When the transitive item has a
        ``next_titem``, the right-hand child is a fresh SymbolNode for that
        next item (ending at ``self.end``) carrying the path onwards;
        otherwise the right-hand child is the recorded node itself.
        Sets ``self.paths_loaded`` once all paths have been processed.
        """
        for titem, leaf in self.paths:
            successor = titem.next_titem
            if successor is None:
                right = leaf
            else:
                # Defer the rest of the chain to a new node holding the next link.
                right = SymbolNode(successor.s, successor.start, self.end)
                right.add_path(successor, leaf)
            reduction = titem.reduction
            self.add_family(reduction.rule.origin, reduction.rule, reduction.start, reduction.node, right)
        self.paths_loaded = True

    @property
    def is_ambiguous(self):
        return len(self.children) > 1

    @property
    def children(self):
        if not self.paths_loaded:
            self.load_paths()
        return self._children

    def __iter__(self):
        return iter(self.children)
        return iter(self._children)

    def __eq__(self, other):
        if not isinstance(other, SymbolNode):
            return False
        return self is other or (self.s == other.s and self.start == other.start and self.end is other.end)
        return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.end is other.end)

    def __hash__(self):
        return hash((self.s, self.start.i, self.end.i))
        return self._hash

    def __repr__(self):
        symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name
        return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0)
        if self.is_intermediate:
            rule = self.s[0]
            ptr = self.s[1]
            before = ( expansion.name for expansion in rule.expansion[:ptr] )
            after = ( expansion.name for expansion in rule.expansion[ptr:] )
            symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
        else:
            symbol = self.s.name
        return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority if self.priority is not None else 0)

 class PackedNode(ForestNode):
    """
@@ -77,7 +107,7 @@ class PackedNode(ForestNode):
        self.left = left
        self.right = right
        self.priority = None
        self._hash = hash((self.s, self.start.i, self.left, self.right))
        self._hash = hash((self.s, self.start, self.left, self.right))

    @property
    def is_empty(self):
@@ -105,8 +135,15 @@ class PackedNode(ForestNode):
        return self._hash

    def __repr__(self):
        symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name
        return "{%s, %d, %s, %s, %s}" % (symbol, self.start.i, self.left, self.right, self.priority if self.priority is not None else 0)
        if isinstance(self.s, tuple):
            rule = self.s[0]
            ptr = self.s[1]
            before = ( expansion.name for expansion in rule.expansion[:ptr] )
            after = ( expansion.name for expansion in rule.expansion[ptr:] )
            symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
        else:
            symbol = self.s.name
        return "({}, {}, {})".format(symbol, self.start, self.priority)

 class ForestVisitor(object):
    """
@@ -114,9 +151,7 @@ class ForestVisitor(object):

    Use this as a base when you need to walk the forest.
    """
    def __init__(self, root):
        self.root = root
        self.result = None
    __slots__ = ['result']

    def visit_token_node(self, node): pass
    def visit_symbol_node_in(self, node): pass
@@ -124,7 +159,8 @@ class ForestVisitor(object):
    def visit_packed_node_in(self, node): pass
    def visit_packed_node_out(self, node): pass

    def go(self):
    def go(self, root):
        self.result = None
        # Visiting is a list of IDs of all symbol/intermediate nodes currently in
        # the stack. It serves two purposes: to detect when we 'recurse' in and out
        # of a symbol/intermediate so that we can process both up and down. Also,
@@ -134,7 +170,7 @@ class ForestVisitor(object):

        # We do not use recursion here to walk the Forest due to the limited
        # stack size in python. Therefore input_stack is essentially our stack.
        input_stack = deque([self.root])
        input_stack = deque([root])

        # It is much faster to cache these as locals since they are called
        # many times in large parses.
@@ -170,8 +206,8 @@ class ForestVisitor(object):

            current_id = id(current)
            if current_id in visiting:
                if isinstance(current, PackedNode): vpno(current)
                else:                               vsno(current)
                if isinstance(current, PackedNode):    vpno(current)
                else:                                  vsno(current)
                input_stack.pop()
                visiting.remove(current_id)
                continue
@@ -214,7 +250,7 @@ class ForestSumVisitor(ForestVisitor):

    def visit_symbol_node_out(self, node):
        node.priority = max(child.priority for child in node.children)
        node.children = sorted(node.children, reverse = True)
        node._children = sorted(node.children, reverse = True)

 class ForestAntiscoreSumVisitor(ForestSumVisitor):
    """
@@ -228,7 +264,7 @@ class ForestAntiscoreSumVisitor(ForestSumVisitor):
    """
    def visit_symbol_node_out(self, node):
        node.priority = min(child.priority for child in node.children)
        node.children = sorted(node.children, key=AntiscoreSumComparator, reverse = True)
        node._children = sorted(node.children, key=AntiscoreSumComparator, reverse = True)

 class AntiscoreSumComparator(object):
    """
@@ -263,19 +299,21 @@ class ForestToTreeVisitor(ForestVisitor):
    implementation should be another ForestVisitor which sorts the children
    according to some priority mechanism.
    """
    def __init__(self, root, forest_sum_visitor = ForestSumVisitor, callbacks = None):
        super(ForestToTreeVisitor, self).__init__(root)
        self.forest_sum_visitor = forest_sum_visitor
        self.output_stack = deque()
    __slots__ = ['forest_sum_visitor', 'output_stack', 'callbacks']
    def __init__(self, forest_sum_visitor = ForestSumVisitor, callbacks = None):
        self.forest_sum_visitor = forest_sum_visitor()
        self.callbacks = callbacks
        self.result = None

    def go(self, root):
        self.output_stack = deque()
        return super(ForestToTreeVisitor, self).go(root)

    def visit_token_node(self, node):
        self.output_stack[-1].append(node)

    def visit_symbol_node_in(self, node):
        if node.is_ambiguous and node.priority is None:
            self.forest_sum_visitor(node).go()
            self.forest_sum_visitor.go(node)
        return next(iter(node.children))

    def visit_packed_node_in(self, node):
@@ -311,11 +349,13 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor):
    This is mainly used by the test framework, to make it simpler to write
    tests ensuring the SPPF contains the right results.
    """
    def __init__(self, root, callbacks):
        super(ForestToAmbiguousTreeVisitor, self).__init__(root)
        self.output_stack = deque()
    __slots__ = ['output_stack', 'callbacks']
    def __init__(self, callbacks):
        self.callbacks = callbacks
        self.result = None

    def go(self, root):
        self.output_stack = deque([])
        return super(ForestToAmbiguousTreeVisitor, self).go(root)

    def visit_token_node(self, node):
        self.output_stack[-1].children.append(node)
@@ -326,7 +366,7 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor):
        return iter(node.children)

    def visit_symbol_node_out(self, node):
        if node.is_ambiguous:
        if not node.is_intermediate and node.is_ambiguous:
            result = self.output_stack.pop()
            if self.output_stack:
                self.output_stack[-1].children.append(result)
@@ -347,4 +387,78 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor):
            if self.output_stack:
                self.output_stack[-1].children.append(result)
            else:
                self.result = result
                self.result = result

 class ForestToPyDotVisitor(ForestVisitor):
    """
    A Forest visitor which writes the SPPF to a PNG.

    The SPPF can get really large, really quickly because
    of the amount of meta-data it stores, so this is probably
    only useful for trivial trees and learning how the SPPF
    is structured.

    ``pydot`` is imported lazily in the constructor, so it is only
    required when this visitor is actually instantiated.
    """
    def __init__(self, rankdir="TB"):
        """Create an empty directed graph laid out in direction ``rankdir``."""
        self.pydot = import_module('pydot')
        self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir)

    def go(self, root, filename):
        """Walk the forest from ``root`` and write the graph to ``filename`` as PNG."""
        super(ForestToPyDotVisitor, self).go(root)
        self.graph.write_png(filename)

    def _add_graph_node(self, node, label, style, shape):
        """Add a grey-filled graph node for ``node``, named after ``id(node)``."""
        graph_node = self.pydot.Node(str(id(node)), style=style, fillcolor="#{:06x}".format(0x808080), shape=shape, label=label)
        self.graph.add_node(graph_node)

    def _get_graph_node(self, node):
        """Return the graph node previously added for ``node``."""
        return self.graph.get_node(str(id(node)))[0]

    def visit_token_node(self, node):
        # Escape backslashes before quotes: an unescaped trailing backslash would
        # swallow the closing quote of the DOT string, and sequences such as \n
        # or \N would be interpreted by Graphviz instead of shown literally.
        escaped = node.value.replace('\\', '\\\\').replace('"', '\\"')
        self._add_graph_node(node, "\"{}\"".format(escaped), "\"filled,rounded\"", "diamond")

    def visit_packed_node_in(self, node):
        self._add_graph_node(node, repr(node), "filled", "diamond")
        return iter([node.left, node.right])

    def visit_packed_node_out(self, node):
        graph_node = self._get_graph_node(node)
        for index, child in enumerate((node.left, node.right)):
            if child is not None:
                self.graph.add_edge(self.pydot.Edge(graph_node, self._get_graph_node(child)))
            else:
                # Missing children get an invisible placeholder so the layout keeps
                # left/right positions. The name is derived from the parent id and
                # the child slot: deterministic, unique per slot, and it can never
                # clash with the purely numeric names used for real nodes.
                placeholder_id = "none_{}_{}".format(id(node), index)
                placeholder = self.pydot.Node(placeholder_id, style="invis", label="None")
                self.graph.add_node(placeholder)
                self.graph.add_edge(self.pydot.Edge(graph_node, placeholder, style="invis"))

    def visit_symbol_node_in(self, node):
        # Intermediate nodes are drawn as ellipses, symbol nodes as rectangles.
        shape = "ellipse" if node.is_intermediate else "rectangle"
        self._add_graph_node(node, repr(node), "\"filled\"", shape)
        return iter(node.children)

    def visit_symbol_node_out(self, node):
        graph_node = self._get_graph_node(node)
        for child in node.children:
            self.graph.add_edge(self.pydot.Edge(graph_node, self._get_graph_node(child)))
--- a/lark/parsers/xearley.py
+++ b/lark/parsers/xearley.py
@@ -22,7 +22,8 @@ from ..exceptions import ParseError, UnexpectedCharacters
 from ..lexer import Token
 from .grammar_analysis import GrammarAnalyzer
 from ..grammar import NonTerminal, Terminal
 from .earley_common import Column, Item
 from .earley import ApplyCallbacks
 from .earley_common import Item, TransitiveItem
 from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor


@@ -31,11 +32,11 @@ class Parser:
        analysis = GrammarAnalyzer(parser_conf)
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.forest_sum_visitor = forest_sum_visitor
        self.ignore = [Terminal(t) for t in ignore]
        self.complete_lex = complete_lex

        self.FIRST = analysis.FIRST
        self.NULLABLE = analysis.NULLABLE
        self.callbacks = {}
        self.predictions = {}

@@ -43,10 +44,12 @@ class Parser:
        #  the slow 'isupper' in is_terminal.
        self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
        self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }

        for rule in parser_conf.rules:
            self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None)
            self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

        self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks)
        self.term_matcher = term_matcher

    def parse(self, stream, start_symbol=None):
@@ -60,19 +63,74 @@ class Parser:
        # Cache for nodes & tokens created in a particular parse step.
        node_cache = {}
        token_cache = {}
        columns = []
        transitives = []

        text_line = 1
        text_column = 1

        def make_symbol_node(s, start, end):
            label = (s, start.i, end.i)
            if label in node_cache:
                node = node_cache[label]
        def is_quasi_complete(item):
            """Tell whether ``item`` is complete, or becomes complete once the
            symbol it currently expects has been consumed and every symbol
            after that is nullable.

            Returns False as soon as a later expected symbol is not in
            ``self.NULLABLE``, or when a rule of the start symbol expects the
            start symbol itself.
            """
            if item.is_complete:
                return True

            cursor = item.advance()
            while True:
                if cursor.is_complete:
                    return True
                expected = cursor.expect
                blocked = expected not in self.NULLABLE or (
                    cursor.rule.origin == start_symbol and expected == start_symbol
                )
                if blocked:
                    return False
                cursor = cursor.advance()

        def create_leo_transitives(item, trule, previous, visited = None):
            """Try to create (or reuse) the transitive item for ``item.rule.origin``
            in the column ``item.start``, recursing through the chain of
            originating items.

            Presumably this implements Joop Leo's right-recursion optimisation
            (the name suggests so) -- confirm against the paper.

            item     -- the item whose rule origin is being looked up.
            trule    -- candidate transitive rule carried through the recursion
                        (the top-level caller passes None).
            previous -- transitive item produced by the deeper recursion step,
                        or None.
            visited  -- set of originators already examined; a fresh set is
                        created when omitted.

            Returns a ``(trule, previous)`` tuple; both are returned unchanged
            whenever no transitive item can be created at this step.
            """
            if visited is None:
                visited = set()

            # A transitive item already recorded for this origin in this column: reuse it.
            if item.rule.origin in transitives[item.start]:
                previous = trule = transitives[item.start][item.rule.origin]
                return trule, previous

            # Nothing to do when the FIRST set of the origin is empty.
            is_empty_rule = not self.FIRST[item.rule.origin]
            if is_empty_rule:
                return trule, previous

            # Look for the one item in the start column that expects this origin.
            # If a second such item turns up, give up.
            originator = None
            for key in columns[item.start]:
                if key.expect is not None and key.expect == item.rule.origin:
                    if originator is not None:
                        return trule, previous
                    originator = key

            if originator is None:
                return trule, previous

            # Already examined during this walk: stop, so the recursion terminates.
            if originator in visited:
                return trule, previous

            visited.add(originator)
            if not is_quasi_complete(originator):
                return trule, previous

            trule = originator.advance()
            # The originator began in a different column, so the visited set is reset.
            if originator.start != item.start:
                visited.clear()

            # Continue up the chain from the originator.
            trule, previous = create_leo_transitives(originator, trule, previous, visited)
            if trule is None:
                return trule, previous

            titem = None
            if previous is not None:
                # Link the new transitive item after the one found further up the chain.
                titem = TransitiveItem(item.rule.origin, trule, originator, previous.column)
                previous.next_titem = titem
            else:
                # NOTE(review): the next two lines use `label`, `s`, `start`, `end`
                # and `node`, none of which are bound in this function, and the
                # `titem = ...` line after them is over-indented. They look like
                # stray lines from a different function (probably diff residue);
                # presumably only the `titem = TransitiveItem(..., item.start)`
                # assignment belongs in this branch -- confirm and remove.
                node = node_cache[label] = SymbolNode(s, start, end)
            return node
                titem = TransitiveItem(item.rule.origin, trule, originator, item.start)

            # Record the new transitive item for this origin and column.
            previous = transitives[item.start][item.rule.origin] = titem
            return trule, previous

        def predict_and_complete(column, to_scan):
        def predict_and_complete(i, to_scan):
            """The core Earley Predictor and Completer.

            At each stage of the input, we handle any completed items (things
@@ -82,61 +140,90 @@ class Parser:
            which can be added to the scan list for the next scanner cycle."""
            held_completions.clear()

            column = columns[i]
            # R (items) = Ei (column.items)
            items = deque(column.items)
            items = deque(column)
            while items:
                item = items.pop()    # remove an element, A say, from R

                ### The Earley completer
                if item.is_complete:   ### (item.s == string)
                    if item.node is None:
                        item.node = make_symbol_node(item.s, item.start, column)
                        label = (item.s, item.start, i)
                        item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                        item.node.add_family(item.s, item.rule, item.start, None, None)

                    # Empty has 0 length. If we complete an empty symbol in a particular
                    # parse step, we need to be able to use that same empty symbol to complete
                    # any predictions that result, that themselves require empty. Avoids
                    # infinite recursion on empty symbols.
                    # held_completions is 'H' in E.Scott's paper.
                    is_empty_item = item.start.i == column.i
                    if is_empty_item:
                        held_completions[item.rule.origin] = item.node

                    originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
                    for originator in originators:
                        new_item = originator.advance()
                        new_item.node = make_symbol_node(new_item.s, originator.start, column)
                        new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
                    create_leo_transitives(item, None, None)

                    ###R Joop Leo right recursion Completer
                    if item.rule.origin in transitives[item.start]:
                        transitive = transitives[item.start][item.s]
                        if transitive.previous in transitives[transitive.column]:
                            root_transitive = transitives[transitive.column][transitive.previous]
                        else:
                            root_transitive = transitive

                        label = (root_transitive.s, root_transitive.start, i)
                        node = vn = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                        vn.add_path(root_transitive, item.node)

                        new_item = Item(transitive.rule, transitive.ptr, transitive.start)
                        new_item.node = vn
                        if new_item.expect in self.TERMINALS:
                            # Add (B :: aC.B, h, y) to Q
                            to_scan.add(new_item)
                        elif new_item not in column.items:
                        elif new_item not in column:
                            # Add (B :: aC.B, h, y) to Ei and R
                            column.add(new_item)
                            items.append(new_item)
                    ###R Regular Earley completer
                    else:
                        # Empty has 0 length. If we complete an empty symbol in a particular
                        # parse step, we need to be able to use that same empty symbol to complete
                        # any predictions that result, that themselves require empty. Avoids
                        # infinite recursion on empty symbols.
                        # held_completions is 'H' in E.Scott's paper.
                        is_empty_item = item.start == i
                        if is_empty_item:
                            held_completions[item.rule.origin] = item.node

                        originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
                        for originator in originators:
                            new_item = originator.advance()
                            label = (new_item.s, originator.start, i)
                            new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                            new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
                            if new_item.expect in self.TERMINALS:
                                # Add (B :: aC.B, h, y) to Q
                                to_scan.add(new_item)
                            elif new_item not in column:
                                # Add (B :: aC.B, h, y) to Ei and R
                                column.add(new_item)
                                items.append(new_item)

                ### The Earley predictor
                elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                    new_items = []
                    for rule in self.predictions[item.expect]:
                        new_item = Item(rule, 0, column)
                        new_item = Item(rule, 0, i)
                        new_items.append(new_item)

                    # Process any held completions (H).
                    if item.expect in held_completions:
                        new_item = item.advance()
                        new_item.node = make_symbol_node(new_item.s, item.start, column)
                        label = (new_item.s, item.start, i)
                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                        new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                        new_items.append(new_item)

                    for new_item in new_items:
                        if new_item.expect in self.TERMINALS:
                            to_scan.add(new_item)
                        elif new_item not in column.items:
                        elif new_item not in column:
                            column.add(new_item)
                            items.append(new_item)

        def scan(i, column, to_scan):
        def scan(i, to_scan):
            """The core Earley Scanner.

            This is a custom implementation of the scanner that uses the
@@ -155,7 +242,7 @@ class Parser:
                m = match(item.expect, stream, i)
                if m:
                    t = Token(item.expect.name, m.group(0), i, text_line, text_column)
                    delayed_matches[m.end()].append( (item, column, t) )
                    delayed_matches[m.end()].append( (item, i, t) )

                    if self.complete_lex:
                        s = m.group(0)
@@ -163,7 +250,7 @@ class Parser:
                            m = match(item.expect, s[:-j])
                            if m:
                                t = Token(item.expect.name, m.group(0), i, text_line, text_column)
                                delayed_matches[i+m.end()].append( (item, column, t) )
                                delayed_matches[i+m.end()].append( (item, i, t) )

                    # Remove any items that successfully matched in this pass from the to_scan buffer.
                    # This ensures we don't carry over tokens that already matched, if we're ignoring below.
@@ -177,13 +264,16 @@ class Parser:
                m = match(x, stream, i)
                if m:
                    # Carry over any items still in the scan buffer, to past the end of the ignored items.
                    delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ])
                    delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ])

                    # If we're ignoring up to the end of the file, carry over the start symbol if it already completed.
                    delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol])
                    delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])

            next_set = Column(i + 1, self.FIRST)    # Ei+1
            next_to_scan = set()
            next_set = set()
            columns.append(next_set)
            next_transitives = dict()
            transitives.append(next_transitives)

            ## 4) Process Tokens from delayed_matches.
            # This is the core of the Earley scanner. Create an SPPF node for each Token,
@@ -193,7 +283,8 @@ class Parser:
            for item, start, token in delayed_matches[i+1]:
                if token is not None:
                    new_item = item.advance()
                    new_item.node = make_symbol_node(new_item.s, new_item.start, column)
                    label = (new_item.s, new_item.start, i)
                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                    new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)
                else:
                    new_item = item
@@ -210,11 +301,11 @@ class Parser:
            if not next_set and not delayed_matches and not next_to_scan:
                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan))

            return next_set, next_to_scan
            return next_to_scan

        # Main loop starts
        column0 = Column(0, self.FIRST)
        column = column0
        columns.append(set())
        transitives.append(dict())

        ## The scan buffer. 'Q' in E.Scott's paper.
        to_scan = set()
@@ -223,38 +314,41 @@ class Parser:
        # Add predicted items to the first Earley set (for the predictor) if they
        # result in a non-terminal, or the scanner if they result in a terminal.
        for rule in self.predictions[start_symbol]:
            item = Item(rule, 0, column0)
            item = Item(rule, 0, 0)
            if item.expect in self.TERMINALS:
                to_scan.add(item)
            else:
                column.add(item)
                columns[0].add(item)

        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
        for i, token in enumerate(stream):
            predict_and_complete(column, to_scan)
        i = 0
        for token in stream:
            predict_and_complete(i, to_scan)

            # Clear the node_cache and token_cache, which are only relevant for each
            # step in the Earley pass.
            node_cache.clear()
            token_cache.clear()
            column, to_scan = scan(i, column, to_scan)
            node_cache.clear()
            to_scan = scan(i, to_scan)

            if token == '\n':
                text_line += 1
                text_column = 1
            else:
                text_column += 1
            i += 1

        predict_and_complete(column, to_scan)
        predict_and_complete(i, to_scan)

        ## Column is now the final column in the parse. If the parse was successful, the start
        # symbol should have been completed in the last step of the Earley cycle, and will be in
        # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
        solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]
        solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]

        if not solutions:
            expected_tokens = [t.expect for t in to_scan]
@@ -265,9 +359,8 @@ class Parser:
        ## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller.
        # This means the caller can work directly with the SPPF tree.
        if not self.resolve_ambiguity:
            return ForestToAmbiguousTreeVisitor(solutions[0], self.callbacks).go()
            return ForestToAmbiguousTreeVisitor(self.callbacks).go(solutions[0])

        # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities
        # according to the rules.
        return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go()

        return self.forest_tree_visitor.go(solutions[0])