diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 02bc5d4..7600915 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -16,7 +16,7 @@ from ..visitors import Transformer_InPlace, v_args from ..exceptions import ParseError, UnexpectedToken from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal -from .earley_common import Column, Item +from .earley_common import Item from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode from collections import deque, defaultdict @@ -48,19 +48,24 @@ class Parser: # Define parser functions start_symbol = NonTerminal(start_symbol or self.parser_conf.start) match = self.term_matcher - held_completions = defaultdict(list) + + # Held Completions (H in E.Scotts paper). + held_completions = {} + + # Cache for nodes & tokens created in a particular parse step. node_cache = {} token_cache = {} + columns = [] def make_symbol_node(s, start, end): - label = (s, start.i, end.i) + label = (s, start, end) if label in node_cache: node = node_cache[label] else: node = node_cache[label] = SymbolNode(s, start, end) return node - def predict_and_complete(column, to_scan): + def predict_and_complete(i, to_scan): """The core Earley Predictor and Completer. At each stage of the input, we handling any completed items (things @@ -70,15 +75,16 @@ class Parser: which can be added to the scan list for the next scanner cycle.""" held_completions.clear() + column = columns[i] # R (items) = Ei (column.items) - items = deque(column.items) + items = deque(column) while items: item = items.pop() # remove an element, A say, from R ### The Earley completer if item.is_complete: ### (item.s == string) if item.node is None: - item.node = make_symbol_node(item.s, item.start, column) + item.node = make_symbol_node(item.s, item.start, i) item.node.add_family(item.s, item.rule, item.start, None, None) # Empty has 0 length. 
If we complete an empty symbol in a particular @@ -86,19 +92,19 @@ class Parser: # any predictions that result, that themselves require empty. Avoids # infinite recursion on empty symbols. # held_completions is 'H' in E.Scott's paper. - is_empty_item = item.start.i == column.i + is_empty_item = item.start == i if is_empty_item: held_completions[item.rule.origin] = item.node - originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] + originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] for originator in originators: new_item = originator.advance() - new_item.node = make_symbol_node(new_item.s, originator.start, column) + new_item.node = make_symbol_node(new_item.s, originator.start, i) new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) if new_item.expect in self.TERMINALS: # Add (B :: aC.B, h, y) to Q to_scan.add(new_item) - elif new_item not in column.items: + elif new_item not in column: # Add (B :: aC.B, h, y) to Ei and R column.add(new_item) items.append(new_item) @@ -107,24 +113,24 @@ class Parser: elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) new_items = [] for rule in self.predictions[item.expect]: - new_item = Item(rule, 0, column) + new_item = Item(rule, 0, i) new_items.append(new_item) # Process any held completions (H). 
if item.expect in held_completions: new_item = item.advance() - new_item.node = make_symbol_node(new_item.s, item.start, column) + new_item.node = make_symbol_node(new_item.s, item.start, i) new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) new_items.append(new_item) for new_item in new_items: if new_item.expect in self.TERMINALS: to_scan.add(new_item) - elif new_item not in column.items: + elif new_item not in column: column.add(new_item) items.append(new_item) - def scan(i, token, column, to_scan): + def scan(i, token, to_scan): """The core Earley Scanner. This is a custom implementation of the scanner that uses the @@ -132,12 +138,14 @@ class Parser: Earley predictor, based on the previously completed tokens. This ensures that at each phase of the parse we have a custom lexer context, allowing for more complex ambiguities.""" - next_set = Column(i+1, self.FIRST) next_to_scan = set() + next_set = set() + columns.append(next_set) + for item in set(to_scan): if match(item.expect, token): new_item = item.advance() - new_item.node = make_symbol_node(new_item.s, new_item.start, column) + new_item.node = make_symbol_node(new_item.s, new_item.start, i) new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) if new_item.expect in self.TERMINALS: @@ -151,11 +159,10 @@ class Parser: expect = {i.expect.name for i in to_scan} raise UnexpectedToken(token, expect, considered_rules = set(to_scan)) - return next_set, next_to_scan + return next_to_scan # Main loop starts - column0 = Column(0, self.FIRST) - column = column0 + columns.append(set()) ## The scan buffer. 'Q' in E.Scott's paper. to_scan = set() @@ -164,32 +171,34 @@ class Parser: # Add predicted items to the first Earley set (for the predictor) if they # result in a non-terminal, or the scanner if they result in a terminal. 
for rule in self.predictions[start_symbol]: - item = Item(rule, 0, column0) + item = Item(rule, 0, 0) if item.expect in self.TERMINALS: to_scan.add(item) else: - column.add(item) + columns[0].add(item) ## The main Earley loop. # Run the Prediction/Completion cycle for any Items in the current Earley set. # Completions will be added to the SPPF tree, and predictions will be recursively # processed down to terminals/empty nodes to be added to the scanner for the next # step. - for i, token in enumerate(stream): - predict_and_complete(column, to_scan) + i = 0 + for token in stream: + predict_and_complete(i, to_scan) # Clear the node_cache and token_cache, which are only relevant for each # step in the Earley pass. node_cache.clear() token_cache.clear() - column, to_scan = scan(i, token, column, to_scan) + to_scan = scan(i, token, to_scan) + i += 1 - predict_and_complete(column, to_scan) + predict_and_complete(i, to_scan) ## Column is now the final column in the parse. If the parse was successful, the start # symbol should have been completed in the last step of the Earley cycle, and will be in # this column. Find the item for the start_symbol, which is the root of the SPPF tree. 
- solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] + solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if not solutions: raise ParseError('Incomplete parse: Could not find a solution to input') diff --git a/lark/parsers/earley_common.py b/lark/parsers/earley_common.py index d17abe4..74dd388 100644 --- a/lark/parsers/earley_common.py +++ b/lark/parsers/earley_common.py @@ -35,6 +35,7 @@ class Item(object): __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash') def __init__(self, rule, ptr, start): + assert isinstance(start, int), "start is not an int" self.is_complete = len(rule.expansion) == ptr self.rule = rule # rule self.ptr = ptr # ptr @@ -46,35 +47,16 @@ class Item(object): else: self.s = (rule, ptr) self.expect = rule.expansion[ptr] - self._hash = hash((self.s, self.start.i)) + self._hash = hash((self.s, self.start)) def advance(self): return self.__class__(self.rule, self.ptr + 1, self.start) def __eq__(self, other): - return self is other or (self.s == other.s and self.start.i == other.start.i) + return self is other or (self.s == other.s and self.start == other.start) def __hash__(self): return self._hash def __repr__(self): - return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i) - -class Column: - "An entry in the table, aka Earley Chart. Contains lists of items." - def __init__(self, i, FIRST): - self.i = i - self.items = set() - self.FIRST = FIRST - - def add(self, item): - """Sort items into scan/predict/reduce newslists - - Makes sure only unique items are added. 
- """ - self.items.add(item) - - def __bool__(self): - return bool(self.items) - - __nonzero__ = __bool__ # Py2 backwards-compatibility + return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start) diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index e5038d9..730ebe1 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -13,7 +13,7 @@ from ..exceptions import ParseError from ..lexer import Token from ..utils import Str from ..grammar import NonTerminal, Terminal -from .earley_common import Column, Derivation +from .earley_common import Derivation from collections import deque from importlib import import_module @@ -60,7 +60,7 @@ class SymbolNode(ForestNode): - return self is other or (self.s == other.s and self.start == other.start and self.end is other.end) + return self is other or (self.s == other.s and self.start == other.start and self.end == other.end) def __hash__(self): - return hash((self.s, self.start.i, self.end.i)) + return hash((self.s, self.start, self.end)) def __repr__(self): if self.is_intermediate: @@ -70,7 +70,7 @@ class SymbolNode(ForestNode): symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names)) else: symbol = self.s.name - return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0) + return "(%s, %d, %d, %d)" % (symbol, self.start, self.end, self.priority if self.priority is not None else 0) class PackedNode(ForestNode): """ @@ -85,7 +85,7 @@ class PackedNode(ForestNode): self.left = left self.right = right self.priority = None - self._hash = hash((self.s, self.start.i, self.left, self.right)) + self._hash = hash((self.s, self.start, self.left, self.right)) @property def is_empty(self): @@ -120,7 +120,7 @@ class PackedNode(ForestNode): symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names)) else: symbol = self.s.name - return "{%s, %d, %d}" % (symbol, self.start.i, self.priority if self.priority is not
None else 0) class ForestVisitor(object): """ diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 30729ef..57aab61 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -24,7 +24,7 @@ from ..tree import Tree from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal, Terminal from .earley import ApplyCallbacks -from .earley_common import Column, Item +from .earley_common import Item from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode @@ -44,12 +44,13 @@ class Parser: # the slow 'isupper' in is_terminal. self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } + for rule in parser_conf.rules: self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None) self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] - self.term_matcher = term_matcher self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks) + self.term_matcher = term_matcher def parse(self, stream, start_symbol=None): start_symbol = NonTerminal(start_symbol or self.parser_conf.start) @@ -62,19 +63,20 @@ class Parser: # Cache for nodes & tokens created in a particular parse step. node_cache = {} token_cache = {} + columns = [] text_line = 1 text_column = 1 def make_symbol_node(s, start, end): - label = (s, start.i, end.i) + label = (s, start, end) if label in node_cache: node = node_cache[label] else: node = node_cache[label] = SymbolNode(s, start, end) return node - def predict_and_complete(column, to_scan): + def predict_and_complete(i, to_scan): """The core Earley Predictor and Completer. 
At each stage of the input, we handling any completed items (things @@ -84,15 +86,16 @@ class Parser: which can be added to the scan list for the next scanner cycle.""" held_completions.clear() + column = columns[i] # R (items) = Ei (column.items) - items = deque(column.items) + items = deque(column) while items: item = items.pop() # remove an element, A say, from R ### The Earley completer if item.is_complete: ### (item.s == string) if item.node is None: - item.node = make_symbol_node(item.s, item.start, column) + item.node = make_symbol_node(item.s, item.start, i) item.node.add_family(item.s, item.rule, item.start, None, None) # Empty has 0 length. If we complete an empty symbol in a particular @@ -100,19 +103,19 @@ class Parser: # any predictions that result, that themselves require empty. Avoids # infinite recursion on empty symbols. # held_completions is 'H' in E.Scott's paper. - is_empty_item = item.start.i == column.i + is_empty_item = item.start == i if is_empty_item: held_completions[item.rule.origin] = item.node - originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] + originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] for originator in originators: new_item = originator.advance() - new_item.node = make_symbol_node(new_item.s, originator.start, column) + new_item.node = make_symbol_node(new_item.s, originator.start, i) new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) if new_item.expect in self.TERMINALS: # Add (B :: aC.B, h, y) to Q to_scan.add(new_item) - elif new_item not in column.items: + elif new_item not in column: # Add (B :: aC.B, h, y) to Ei and R column.add(new_item) items.append(new_item) @@ -121,24 +124,24 @@ class Parser: elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) new_items = [] for rule in self.predictions[item.expect]: - 
new_item = Item(rule, 0, column) + new_item = Item(rule, 0, i) new_items.append(new_item) # Process any held completions (H). if item.expect in held_completions: new_item = item.advance() - new_item.node = make_symbol_node(new_item.s, item.start, column) + new_item.node = make_symbol_node(new_item.s, item.start, i) new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) new_items.append(new_item) for new_item in new_items: if new_item.expect in self.TERMINALS: to_scan.add(new_item) - elif new_item not in column.items: + elif new_item not in column: column.add(new_item) items.append(new_item) - def scan(i, column, to_scan): + def scan(i, to_scan): """The core Earley Scanner. This is a custom implementation of the scanner that uses the @@ -157,7 +160,7 @@ class Parser: m = match(item.expect, stream, i) if m: t = Token(item.expect.name, m.group(0), i, text_line, text_column) - delayed_matches[m.end()].append( (item, column, t) ) + delayed_matches[m.end()].append( (item, i, t) ) if self.complete_lex: s = m.group(0) @@ -165,7 +168,7 @@ class Parser: m = match(item.expect, s[:-j]) if m: t = Token(item.expect.name, m.group(0), i, text_line, text_column) - delayed_matches[i+m.end()].append( (item, column, t) ) + delayed_matches[i+m.end()].append( (item, i, t) ) # Remove any items that successfully matched in this pass from the to_scan buffer. # This ensures we don't carry over tokens that already matched, if we're ignoring below. @@ -179,13 +182,14 @@ class Parser: m = match(x, stream, i) if m: # Carry over any items still in the scan buffer, to past the end of the ignored items. - delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ]) + delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ]) # If we're ignoring up to the end of the file, # carry over the start symbol if it already completed. 
- delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol]) + delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol]) - next_set = Column(i + 1, self.FIRST) # Ei+1 next_to_scan = set() + next_set = set() + columns.append(next_set) ## 4) Process Tokens from delayed_matches. # This is the core of the Earley scanner. Create an SPPF node for each Token, @@ -195,7 +199,8 @@ class Parser: for item, start, token in delayed_matches[i+1]: if token is not None: new_item = item.advance() - new_item.node = make_symbol_node(new_item.s, new_item.start, column) +# new_item.start = start # Should we update this to account for gaps due to ignores? + new_item.node = make_symbol_node(new_item.s, new_item.start, i) new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) else: new_item = item @@ -212,11 +217,10 @@ class Parser: if not next_set and not delayed_matches and not next_to_scan: raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan)) - return next_set, next_to_scan + return next_to_scan # Main loop starts - column0 = Column(0, self.FIRST) - column = column0 + columns.append(set()) ## The scan buffer. 'Q' in E.Scott's paper. to_scan = set() @@ -225,38 +229,40 @@ class Parser: # Add predicted items to the first Earley set (for the predictor) if they # result in a non-terminal, or the scanner if they result in a terminal. for rule in self.predictions[start_symbol]: - item = Item(rule, 0, column0) + item = Item(rule, 0, 0) if item.expect in self.TERMINALS: to_scan.add(item) else: - column.add(item) + columns[0].add(item) ## The main Earley loop. # Run the Prediction/Completion cycle for any Items in the current Earley set. 
# Completions will be added to the SPPF tree, and predictions will be recursively # processed down to terminals/empty nodes to be added to the scanner for the next # step. - for i, token in enumerate(stream): - predict_and_complete(column, to_scan) + i = 0 + for token in stream: + predict_and_complete(i, to_scan) # Clear the node_cache and token_cache, which are only relevant for each # step in the Earley pass. node_cache.clear() token_cache.clear() - column, to_scan = scan(i, column, to_scan) + to_scan = scan(i, to_scan) if token == '\n': text_line += 1 text_column = 1 else: text_column += 1 + i += 1 - predict_and_complete(column, to_scan) + predict_and_complete(i, to_scan) ## Column is now the final column in the parse. If the parse was successful, the start # symbol should have been completed in the last step of the Earley cycle, and will be in # this column. Find the item for the start_symbol, which is the root of the SPPF tree. - solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] + solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if not solutions: expected_tokens = [t.expect for t in to_scan]