We can replace Earley Columns with basic Python sets for improved performance and simplicity.
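For orientation, here is a minimal sketch of the data-structure change, not code from the diff itself (the `Column` name is real; `start_next_step` and the dummy item are hypothetical): the `Column` wrapper, which carried an index `i` plus an inner `items` set, gives way to a plain list of Python sets, so the list index itself identifies the Earley set and integer positions can flow through `Item` and the SPPF node labels.

```python
# Minimal sketch of the data-structure change (simplified, hypothetical names).

# Before: every Earley set was wrapped in a Column object carrying its own
# input position and an inner set of items.
class Column:
    def __init__(self, i):
        self.i = i            # position in the input
        self.items = set()    # Earley items ending at this position

    def add(self, item):
        self.items.add(item)

old = Column(0)
old.add(('dummy item', 0))    # before: membership and adds went through old.items

# After: the parser keeps a plain list of sets; the list index *is* the
# position, so items and SPPF labels can store the integer directly.
columns = [set()]             # columns[0] is the initial Earley set E0

def start_next_step():
    """Begin the Earley set for the next input position (done inside scan())."""
    next_set = set()
    columns.append(next_set)
    return next_set

start_next_step()
columns[1].add(('dummy item', 0))     # after: membership tests run on the set itself
print(len(columns), len(columns[1]))  # -> 2 1
```

Dropping the wrapper removes an attribute indirection (`column.items`, `start.i`) from every membership test and hash, which is presumably where the performance win comes from.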
| @@ -16,7 +16,7 @@ from ..visitors import Transformer_InPlace, v_args | |||
| from ..exceptions import ParseError, UnexpectedToken | |||
| from .grammar_analysis import GrammarAnalyzer | |||
| from ..grammar import NonTerminal | |||
| from .earley_common import Column, Item | |||
| from .earley_common import Item | |||
| from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode | |||
| from collections import deque, defaultdict | |||
| @@ -48,19 +48,24 @@ class Parser: | |||
| # Define parser functions | |||
| start_symbol = NonTerminal(start_symbol or self.parser_conf.start) | |||
| match = self.term_matcher | |||
| held_completions = defaultdict(list) | |||
| # Held Completions (H in E. Scott's paper). | |||
| held_completions = {} | |||
| # Cache for nodes & tokens created in a particular parse step. | |||
| node_cache = {} | |||
| token_cache = {} | |||
| columns = [] | |||
| def make_symbol_node(s, start, end): | |||
| label = (s, start.i, end.i) | |||
| label = (s, start, end) | |||
| if label in node_cache: | |||
| node = node_cache[label] | |||
| else: | |||
| node = node_cache[label] = SymbolNode(s, start, end) | |||
| return node | |||
| def predict_and_complete(column, to_scan): | |||
| def predict_and_complete(i, to_scan): | |||
| """The core Earley Predictor and Completer. | |||
| At each stage of the input, we handle any completed items (things | |||
| @@ -70,15 +75,16 @@ class Parser: | |||
| which can be added to the scan list for the next scanner cycle.""" | |||
| held_completions.clear() | |||
| column = columns[i] | |||
| # R (items) = Ei (column.items) | |||
| items = deque(column.items) | |||
| items = deque(column) | |||
| while items: | |||
| item = items.pop() # remove an element, A say, from R | |||
| ### The Earley completer | |||
| if item.is_complete: ### (item.s == string) | |||
| if item.node is None: | |||
| item.node = make_symbol_node(item.s, item.start, column) | |||
| item.node = make_symbol_node(item.s, item.start, i) | |||
| item.node.add_family(item.s, item.rule, item.start, None, None) | |||
| # Empty has 0 length. If we complete an empty symbol in a particular | |||
| @@ -86,19 +92,19 @@ class Parser: | |||
| # any predictions that result, that themselves require empty. Avoids | |||
| # infinite recursion on empty symbols. | |||
| # held_completions is 'H' in E.Scott's paper. | |||
| is_empty_item = item.start.i == column.i | |||
| is_empty_item = item.start == i | |||
| if is_empty_item: | |||
| held_completions[item.rule.origin] = item.node | |||
| originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] | |||
| originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] | |||
| for originator in originators: | |||
| new_item = originator.advance() | |||
| new_item.node = make_symbol_node(new_item.s, originator.start, column) | |||
| new_item.node = make_symbol_node(new_item.s, originator.start, i) | |||
| new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) | |||
| if new_item.expect in self.TERMINALS: | |||
| # Add (B :: aC.B, h, y) to Q | |||
| to_scan.add(new_item) | |||
| elif new_item not in column.items: | |||
| elif new_item not in column: | |||
| # Add (B :: aC.B, h, y) to Ei and R | |||
| column.add(new_item) | |||
| items.append(new_item) | |||
| @@ -107,24 +113,24 @@ class Parser: | |||
| elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) | |||
| new_items = [] | |||
| for rule in self.predictions[item.expect]: | |||
| new_item = Item(rule, 0, column) | |||
| new_item = Item(rule, 0, i) | |||
| new_items.append(new_item) | |||
| # Process any held completions (H). | |||
| if item.expect in held_completions: | |||
| new_item = item.advance() | |||
| new_item.node = make_symbol_node(new_item.s, item.start, column) | |||
| new_item.node = make_symbol_node(new_item.s, item.start, i) | |||
| new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) | |||
| new_items.append(new_item) | |||
| for new_item in new_items: | |||
| if new_item.expect in self.TERMINALS: | |||
| to_scan.add(new_item) | |||
| elif new_item not in column.items: | |||
| elif new_item not in column: | |||
| column.add(new_item) | |||
| items.append(new_item) | |||
| def scan(i, token, column, to_scan): | |||
| def scan(i, token, to_scan): | |||
| """The core Earley Scanner. | |||
| This is a custom implementation of the scanner that uses the | |||
| @@ -132,12 +138,14 @@ class Parser: | |||
| Earley predictor, based on the previously completed tokens. | |||
| This ensures that at each phase of the parse we have a custom | |||
| lexer context, allowing for more complex ambiguities.""" | |||
| next_set = Column(i+1, self.FIRST) | |||
| next_to_scan = set() | |||
| next_set = set() | |||
| columns.append(next_set) | |||
| for item in set(to_scan): | |||
| if match(item.expect, token): | |||
| new_item = item.advance() | |||
| new_item.node = make_symbol_node(new_item.s, new_item.start, column) | |||
| new_item.node = make_symbol_node(new_item.s, new_item.start, i) | |||
| new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) | |||
| if new_item.expect in self.TERMINALS: | |||
| @@ -151,11 +159,10 @@ class Parser: | |||
| expect = {i.expect.name for i in to_scan} | |||
| raise UnexpectedToken(token, expect, considered_rules = set(to_scan)) | |||
| return next_set, next_to_scan | |||
| return next_to_scan | |||
| # Main loop starts | |||
| column0 = Column(0, self.FIRST) | |||
| column = column0 | |||
| columns.append(set()) | |||
| ## The scan buffer. 'Q' in E.Scott's paper. | |||
| to_scan = set() | |||
| @@ -164,32 +171,34 @@ class Parser: | |||
| # Add predicted items to the first Earley set (for the predictor) if they | |||
| # result in a non-terminal, or the scanner if they result in a terminal. | |||
| for rule in self.predictions[start_symbol]: | |||
| item = Item(rule, 0, column0) | |||
| item = Item(rule, 0, 0) | |||
| if item.expect in self.TERMINALS: | |||
| to_scan.add(item) | |||
| else: | |||
| column.add(item) | |||
| columns[0].add(item) | |||
| ## The main Earley loop. | |||
| # Run the Prediction/Completion cycle for any Items in the current Earley set. | |||
| # Completions will be added to the SPPF tree, and predictions will be recursively | |||
| # processed down to terminals/empty nodes to be added to the scanner for the next | |||
| # step. | |||
| for i, token in enumerate(stream): | |||
| predict_and_complete(column, to_scan) | |||
| i = 0 | |||
| for token in stream: | |||
| predict_and_complete(i, to_scan) | |||
| # Clear the node_cache and token_cache, which are only relevant for each | |||
| # step in the Earley pass. | |||
| node_cache.clear() | |||
| token_cache.clear() | |||
| column, to_scan = scan(i, token, column, to_scan) | |||
| to_scan = scan(i, token, to_scan) | |||
| i += 1 | |||
| predict_and_complete(column, to_scan) | |||
| predict_and_complete(i, to_scan) | |||
| ## Column is now the final column in the parse. If the parse was successful, the start | |||
| # symbol should have been completed in the last step of the Earley cycle, and will be in | |||
| # this column. Find the item for the start_symbol, which is the root of the SPPF tree. | |||
| solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] | |||
| solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] | |||
| if not solutions: | |||
| raise ParseError('Incomplete parse: Could not find a solution to input') | |||
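One detail worth calling out from the hunk above: the final check for a completed start symbol used to compare identity with `column0` (`n.start is column0`) and now compares the integer start position against `0`. A small self-contained sketch of that filter, using a hypothetical `MiniItem` stand-in:

```python
# Sketch: filtering the final Earley set for the completed start symbol (simplified).
class MiniItem:
    def __init__(self, s, start, is_complete, node):
        self.s, self.start = s, start
        self.is_complete, self.node = is_complete, node

start_symbol = 'start'
final_column = {
    MiniItem('start', 0, True, 'sppf-root'),   # completed start rule spanning the whole input
    MiniItem('start', 2, True, 'late'),        # completed, but not from position 0
    MiniItem('expr', 0, True, 'other'),        # different symbol
}
solutions = [n.node for n in final_column
             if n.is_complete and n.node is not None
             and n.s == start_symbol and n.start == 0]
print(solutions)   # -> ['sppf-root']
```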
| @@ -35,6 +35,7 @@ class Item(object): | |||
| __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash') | |||
| def __init__(self, rule, ptr, start): | |||
| assert isinstance(start, int), "start is not an int" | |||
| self.is_complete = len(rule.expansion) == ptr | |||
| self.rule = rule # rule | |||
| self.ptr = ptr # ptr | |||
| @@ -46,35 +47,16 @@ class Item(object): | |||
| else: | |||
| self.s = (rule, ptr) | |||
| self.expect = rule.expansion[ptr] | |||
| self._hash = hash((self.s, self.start.i)) | |||
| self._hash = hash((self.s, self.start)) | |||
| def advance(self): | |||
| return self.__class__(self.rule, self.ptr + 1, self.start) | |||
| def __eq__(self, other): | |||
| return self is other or (self.s == other.s and self.start.i == other.start.i) | |||
| return self is other or (self.s == other.s and self.start == other.start) | |||
| def __hash__(self): | |||
| return self._hash | |||
| def __repr__(self): | |||
| return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i) | |||
| class Column: | |||
| "An entry in the table, aka Earley Chart. Contains lists of items." | |||
| def __init__(self, i, FIRST): | |||
| self.i = i | |||
| self.items = set() | |||
| self.FIRST = FIRST | |||
| def add(self, item): | |||
| """Sort items into scan/predict/reduce newslists | |||
| Makes sure only unique items are added. | |||
| """ | |||
| self.items.add(item) | |||
| def __bool__(self): | |||
| return bool(self.items) | |||
| __nonzero__ = __bool__ # Py2 backwards-compatibility | |||
| return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start) | |||
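Because `Item.start` is now an integer (the new assert enforces this), hashing and equality reduce to plain value comparisons, and plain sets deduplicate items exactly as `Column.items` did. A quick sketch with a stripped-down, hypothetical `MiniItem`:

```python
# Sketch: deduplication of items in a plain set, keyed on (s, start) with an int start.
class MiniItem:
    __slots__ = ('s', 'start', '_hash')

    def __init__(self, s, start):
        assert isinstance(start, int), "start is not an int"
        self.s = s
        self.start = start
        self._hash = hash((self.s, self.start))

    def __eq__(self, other):
        return self is other or (self.s == other.s and self.start == other.start)

    def __hash__(self):
        return self._hash

column = set()
column.add(MiniItem('expr', 0))
column.add(MiniItem('expr', 0))   # same symbol, same start position
column.add(MiniItem('expr', 1))   # same symbol, different start
print(len(column))                # -> 2: duplicates collapse just as with Column.items
```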
| @@ -13,7 +13,7 @@ from ..exceptions import ParseError | |||
| from ..lexer import Token | |||
| from ..utils import Str | |||
| from ..grammar import NonTerminal, Terminal | |||
| from .earley_common import Column, Derivation | |||
| from .earley_common import Derivation | |||
| from collections import deque | |||
| from importlib import import_module | |||
| @@ -60,7 +60,7 @@ class SymbolNode(ForestNode): | |||
| return self is other or (self.s == other.s and self.start == other.start and self.end is other.end) | |||
| def __hash__(self): | |||
| return hash((self.s, self.start.i, self.end.i)) | |||
| return hash((self.s, self.start, self.end)) | |||
| def __repr__(self): | |||
| if self.is_intermediate: | |||
| @@ -70,7 +70,7 @@ class SymbolNode(ForestNode): | |||
| symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names)) | |||
| else: | |||
| symbol = self.s.name | |||
| return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0) | |||
| return "(%s, %d, %d, %d)" % (symbol, self.start, self.end, self.priority if self.priority is not None else 0) | |||
| class PackedNode(ForestNode): | |||
| """ | |||
| @@ -85,7 +85,7 @@ class PackedNode(ForestNode): | |||
| self.left = left | |||
| self.right = right | |||
| self.priority = None | |||
| self._hash = hash((self.s, self.start.i, self.left, self.right)) | |||
| self._hash = hash((self.s, self.start, self.left, self.right)) | |||
| @property | |||
| def is_empty(self): | |||
| @@ -120,7 +120,7 @@ class PackedNode(ForestNode): | |||
| symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names)) | |||
| else: | |||
| symbol = self.s.name | |||
| return "{%s, %d, %d}" % (symbol, self.start.i, self.priority if self.priority is not None else 0) | |||
| return "{%s, %d, %d}" % (symbol, self.start, self.priority if self.priority is not None else 0) | |||
| class ForestVisitor(object): | |||
| """ | |||
| @@ -24,7 +24,7 @@ from ..tree import Tree | |||
| from .grammar_analysis import GrammarAnalyzer | |||
| from ..grammar import NonTerminal, Terminal | |||
| from .earley import ApplyCallbacks | |||
| from .earley_common import Column, Item | |||
| from .earley_common import Item | |||
| from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode | |||
| @@ -44,12 +44,13 @@ class Parser: | |||
| # the slow 'isupper' in is_terminal. | |||
| self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } | |||
| self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } | |||
| for rule in parser_conf.rules: | |||
| self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None) | |||
| self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] | |||
| self.term_matcher = term_matcher | |||
| self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks) | |||
| self.term_matcher = term_matcher | |||
| def parse(self, stream, start_symbol=None): | |||
| start_symbol = NonTerminal(start_symbol or self.parser_conf.start) | |||
| @@ -62,19 +63,20 @@ class Parser: | |||
| # Cache for nodes & tokens created in a particular parse step. | |||
| node_cache = {} | |||
| token_cache = {} | |||
| columns = [] | |||
| text_line = 1 | |||
| text_column = 1 | |||
| def make_symbol_node(s, start, end): | |||
| label = (s, start.i, end.i) | |||
| label = (s, start, end) | |||
| if label in node_cache: | |||
| node = node_cache[label] | |||
| else: | |||
| node = node_cache[label] = SymbolNode(s, start, end) | |||
| return node | |||
| def predict_and_complete(column, to_scan): | |||
| def predict_and_complete(i, to_scan): | |||
| """The core Earley Predictor and Completer. | |||
| At each stage of the input, we handle any completed items (things | |||
| @@ -84,15 +86,16 @@ class Parser: | |||
| which can be added to the scan list for the next scanner cycle.""" | |||
| held_completions.clear() | |||
| column = columns[i] | |||
| # R (items) = Ei (column.items) | |||
| items = deque(column.items) | |||
| items = deque(column) | |||
| while items: | |||
| item = items.pop() # remove an element, A say, from R | |||
| ### The Earley completer | |||
| if item.is_complete: ### (item.s == string) | |||
| if item.node is None: | |||
| item.node = make_symbol_node(item.s, item.start, column) | |||
| item.node = make_symbol_node(item.s, item.start, i) | |||
| item.node.add_family(item.s, item.rule, item.start, None, None) | |||
| # Empty has 0 length. If we complete an empty symbol in a particular | |||
| @@ -100,19 +103,19 @@ class Parser: | |||
| # any predictions that result, that themselves require empty. Avoids | |||
| # infinite recursion on empty symbols. | |||
| # held_completions is 'H' in E.Scott's paper. | |||
| is_empty_item = item.start.i == column.i | |||
| is_empty_item = item.start == i | |||
| if is_empty_item: | |||
| held_completions[item.rule.origin] = item.node | |||
| originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] | |||
| originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] | |||
| for originator in originators: | |||
| new_item = originator.advance() | |||
| new_item.node = make_symbol_node(new_item.s, originator.start, column) | |||
| new_item.node = make_symbol_node(new_item.s, originator.start, i) | |||
| new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) | |||
| if new_item.expect in self.TERMINALS: | |||
| # Add (B :: aC.B, h, y) to Q | |||
| to_scan.add(new_item) | |||
| elif new_item not in column.items: | |||
| elif new_item not in column: | |||
| # Add (B :: aC.B, h, y) to Ei and R | |||
| column.add(new_item) | |||
| items.append(new_item) | |||
| @@ -121,24 +124,24 @@ class Parser: | |||
| elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) | |||
| new_items = [] | |||
| for rule in self.predictions[item.expect]: | |||
| new_item = Item(rule, 0, column) | |||
| new_item = Item(rule, 0, i) | |||
| new_items.append(new_item) | |||
| # Process any held completions (H). | |||
| if item.expect in held_completions: | |||
| new_item = item.advance() | |||
| new_item.node = make_symbol_node(new_item.s, item.start, column) | |||
| new_item.node = make_symbol_node(new_item.s, item.start, i) | |||
| new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) | |||
| new_items.append(new_item) | |||
| for new_item in new_items: | |||
| if new_item.expect in self.TERMINALS: | |||
| to_scan.add(new_item) | |||
| elif new_item not in column.items: | |||
| elif new_item not in column: | |||
| column.add(new_item) | |||
| items.append(new_item) | |||
| def scan(i, column, to_scan): | |||
| def scan(i, to_scan): | |||
| """The core Earley Scanner. | |||
| This is a custom implementation of the scanner that uses the | |||
| @@ -157,7 +160,7 @@ class Parser: | |||
| m = match(item.expect, stream, i) | |||
| if m: | |||
| t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
| delayed_matches[m.end()].append( (item, column, t) ) | |||
| delayed_matches[m.end()].append( (item, i, t) ) | |||
| if self.complete_lex: | |||
| s = m.group(0) | |||
| @@ -165,7 +168,7 @@ class Parser: | |||
| m = match(item.expect, s[:-j]) | |||
| if m: | |||
| t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
| delayed_matches[i+m.end()].append( (item, column, t) ) | |||
| delayed_matches[i+m.end()].append( (item, i, t) ) | |||
| # Remove any items that successfully matched in this pass from the to_scan buffer. | |||
| # This ensures we don't carry over tokens that already matched, if we're ignoring below. | |||
| @@ -179,13 +182,14 @@ class Parser: | |||
| m = match(x, stream, i) | |||
| if m: | |||
| # Carry over any items still in the scan buffer, to past the end of the ignored items. | |||
| delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ]) | |||
| delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ]) | |||
| # If we're ignoring up to the end of the file, # carry over the start symbol if it already completed. | |||
| delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol]) | |||
| delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol]) | |||
| next_set = Column(i + 1, self.FIRST) # Ei+1 | |||
| next_to_scan = set() | |||
| next_set = set() | |||
| columns.append(next_set) | |||
| ## 4) Process Tokens from delayed_matches. | |||
| # This is the core of the Earley scanner. Create an SPPF node for each Token, | |||
| @@ -195,7 +199,8 @@ class Parser: | |||
| for item, start, token in delayed_matches[i+1]: | |||
| if token is not None: | |||
| new_item = item.advance() | |||
| new_item.node = make_symbol_node(new_item.s, new_item.start, column) | |||
| # new_item.start = start # Should we update this to account for gaps due to ignores? | |||
| new_item.node = make_symbol_node(new_item.s, new_item.start, i) | |||
| new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) | |||
| else: | |||
| new_item = item | |||
| @@ -212,11 +217,10 @@ class Parser: | |||
| if not next_set and not delayed_matches and not next_to_scan: | |||
| raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan)) | |||
| return next_set, next_to_scan | |||
| return next_to_scan | |||
| # Main loop starts | |||
| column0 = Column(0, self.FIRST) | |||
| column = column0 | |||
| columns.append(set()) | |||
| ## The scan buffer. 'Q' in E.Scott's paper. | |||
| to_scan = set() | |||
| @@ -225,38 +229,40 @@ class Parser: | |||
| # Add predicted items to the first Earley set (for the predictor) if they | |||
| # result in a non-terminal, or the scanner if they result in a terminal. | |||
| for rule in self.predictions[start_symbol]: | |||
| item = Item(rule, 0, column0) | |||
| item = Item(rule, 0, 0) | |||
| if item.expect in self.TERMINALS: | |||
| to_scan.add(item) | |||
| else: | |||
| column.add(item) | |||
| columns[0].add(item) | |||
| ## The main Earley loop. | |||
| # Run the Prediction/Completion cycle for any Items in the current Earley set. | |||
| # Completions will be added to the SPPF tree, and predictions will be recursively | |||
| # processed down to terminals/empty nodes to be added to the scanner for the next | |||
| # step. | |||
| for i, token in enumerate(stream): | |||
| predict_and_complete(column, to_scan) | |||
| i = 0 | |||
| for token in stream: | |||
| predict_and_complete(i, to_scan) | |||
| # Clear the node_cache and token_cache, which are only relevant for each | |||
| # step in the Earley pass. | |||
| node_cache.clear() | |||
| token_cache.clear() | |||
| column, to_scan = scan(i, column, to_scan) | |||
| to_scan = scan(i, to_scan) | |||
| if token == '\n': | |||
| text_line += 1 | |||
| text_column = 1 | |||
| else: | |||
| text_column += 1 | |||
| i += 1 | |||
| predict_and_complete(column, to_scan) | |||
| predict_and_complete(i, to_scan) | |||
| ## Column is now the final column in the parse. If the parse was successful, the start | |||
| # symbol should have been completed in the last step of the Earley cycle, and will be in | |||
| # this column. Find the item for the start_symbol, which is the root of the SPPF tree. | |||
| solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] | |||
| solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] | |||
| if not solutions: | |||
| expected_tokens = [t.expect for t in to_scan] | |||
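In the scanner-less (xearley) variant shown last, `delayed_matches` entries now carry the integer position in place of a Column object. A small sketch of that bookkeeping (hypothetical names, simplified):

```python
from collections import defaultdict

# Sketch: delayed matches keyed by the input position where the token ends;
# each entry records the integer position i rather than a Column.
delayed_matches = defaultdict(list)

def record_match(item, i, end, token):
    delayed_matches[end].append((item, i, token))

record_match('ITEM_A', 0, 3, 'TOKEN')   # a regular match ending at position 3
record_match('ITEM_B', 1, 3, None)      # an item carried past ignored input, no token
print(delayed_matches[3])               # both become available when the parse reaches 3
```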