We can replace Earley Columns with basic Python sets for improved performance and simplicity.
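
The gist of the change: a column was a Column object carrying its own input position (column.i) and an item set (column.items); after this patch a column is a plain set, and its position is simply its index in a columns list. A minimal sketch of the before/after shape (illustrative only, not code from the patch):

    # Before: each Earley set is an object that knows its own position.
    class Column:
        def __init__(self, i):
            self.i = i            # position in the input
            self.items = set()    # Earley items ending at this position

    # After: an Earley set is just a set; its position is its list index.
    columns = [set()]             # columns[0] is the initial Earley set
    columns.append(set())         # each scan step appends columns[i+1]

    # Items store an integer start, so a completed item's originators are
    # found by list indexing instead of walking back to a Column object:
    #     originators = [it for it in columns[item.start] if it.expect == item.s]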
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -16,7 +16,7 @@ from ..visitors import Transformer_InPlace, v_args
 from ..exceptions import ParseError, UnexpectedToken
 from .grammar_analysis import GrammarAnalyzer
 from ..grammar import NonTerminal
-from .earley_common import Column, Item
+from .earley_common import Item
 from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode

 from collections import deque, defaultdict
@@ -48,19 +48,24 @@ class Parser:
         # Define parser functions
         start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
         match = self.term_matcher
-        held_completions = defaultdict(list)
+
+        # Held Completions (H in E.Scott's paper).
+        held_completions = {}
+
+        # Cache for nodes & tokens created in a particular parse step.
         node_cache = {}
         token_cache = {}
+        columns = []

         def make_symbol_node(s, start, end):
-            label = (s, start.i, end.i)
+            label = (s, start, end)
             if label in node_cache:
                 node = node_cache[label]
             else:
                 node = node_cache[label] = SymbolNode(s, start, end)
             return node

-        def predict_and_complete(column, to_scan):
+        def predict_and_complete(i, to_scan):
             """The core Earley Predictor and Completer.

             At each stage of the input, we handle any completed items (things
@@ -70,15 +75,16 @@ class Parser:
             which can be added to the scan list for the next scanner cycle."""
             held_completions.clear()
+            column = columns[i]

             # R (items) = Ei (column.items)
-            items = deque(column.items)
+            items = deque(column)
             while items:
                 item = items.pop()   # remove an element, A say, from R

                 ### The Earley completer
                 if item.is_complete:   ### (item.s == string)
                     if item.node is None:
-                        item.node = make_symbol_node(item.s, item.start, column)
+                        item.node = make_symbol_node(item.s, item.start, i)
                         item.node.add_family(item.s, item.rule, item.start, None, None)

                     # Empty has 0 length. If we complete an empty symbol in a particular
@@ -86,19 +92,19 @@ class Parser:
                     # any predictions that result, that themselves require empty. Avoids
                     # infinite recursion on empty symbols.
                     # held_completions is 'H' in E.Scott's paper.
-                    is_empty_item = item.start.i == column.i
+                    is_empty_item = item.start == i
                     if is_empty_item:
                         held_completions[item.rule.origin] = item.node

-                    originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
+                    originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
                     for originator in originators:
                         new_item = originator.advance()
-                        new_item.node = make_symbol_node(new_item.s, originator.start, column)
+                        new_item.node = make_symbol_node(new_item.s, originator.start, i)
                         new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
                         if new_item.expect in self.TERMINALS:
                             # Add (B :: aC.B, h, y) to Q
                             to_scan.add(new_item)
-                        elif new_item not in column.items:
+                        elif new_item not in column:
                             # Add (B :: aC.B, h, y) to Ei and R
                             column.add(new_item)
                             items.append(new_item)
@@ -107,24 +113,24 @@ class Parser:
                 elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                     new_items = []
                     for rule in self.predictions[item.expect]:
-                        new_item = Item(rule, 0, column)
+                        new_item = Item(rule, 0, i)
                         new_items.append(new_item)

                     # Process any held completions (H).
                     if item.expect in held_completions:
                         new_item = item.advance()
-                        new_item.node = make_symbol_node(new_item.s, item.start, column)
+                        new_item.node = make_symbol_node(new_item.s, item.start, i)
                         new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                         new_items.append(new_item)

                     for new_item in new_items:
                         if new_item.expect in self.TERMINALS:
                             to_scan.add(new_item)
-                        elif new_item not in column.items:
+                        elif new_item not in column:
                             column.add(new_item)
                             items.append(new_item)

-        def scan(i, token, column, to_scan):
+        def scan(i, token, to_scan):
             """The core Earley Scanner.

             This is a custom implementation of the scanner that uses the
@@ -132,12 +138,14 @@ class Parser:
             Earley predictor, based on the previously completed tokens.
             This ensures that at each phase of the parse we have a custom
             lexer context, allowing for more complex ambiguities."""
-            next_set = Column(i+1, self.FIRST)
             next_to_scan = set()
+            next_set = set()
+            columns.append(next_set)
+
             for item in set(to_scan):
                 if match(item.expect, token):
                     new_item = item.advance()
-                    new_item.node = make_symbol_node(new_item.s, new_item.start, column)
+                    new_item.node = make_symbol_node(new_item.s, new_item.start, i)
                     new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)

                     if new_item.expect in self.TERMINALS:
@@ -151,11 +159,10 @@ class Parser:
                 expect = {i.expect.name for i in to_scan}
                 raise UnexpectedToken(token, expect, considered_rules = set(to_scan))

-            return next_set, next_to_scan
+            return next_to_scan

         # Main loop starts
-        column0 = Column(0, self.FIRST)
-        column = column0
+        columns.append(set())

         ## The scan buffer. 'Q' in E.Scott's paper.
         to_scan = set()
@@ -164,32 +171,34 @@ class Parser:
         # Add predicted items to the first Earley set (for the predictor) if they
         # result in a non-terminal, or the scanner if they result in a terminal.
         for rule in self.predictions[start_symbol]:
-            item = Item(rule, 0, column0)
+            item = Item(rule, 0, 0)
             if item.expect in self.TERMINALS:
                 to_scan.add(item)
             else:
-                column.add(item)
+                columns[0].add(item)

         ## The main Earley loop.
         # Run the Prediction/Completion cycle for any Items in the current Earley set.
         # Completions will be added to the SPPF tree, and predictions will be recursively
         # processed down to terminals/empty nodes to be added to the scanner for the next
         # step.
-        for i, token in enumerate(stream):
-            predict_and_complete(column, to_scan)
+        i = 0
+        for token in stream:
+            predict_and_complete(i, to_scan)

             # Clear the node_cache and token_cache, which are only relevant for each
             # step in the Earley pass.
             node_cache.clear()
             token_cache.clear()
-            column, to_scan = scan(i, token, column, to_scan)
+            to_scan = scan(i, token, to_scan)
+            i += 1

-        predict_and_complete(column, to_scan)
+        predict_and_complete(i, to_scan)

         ## Column is now the final column in the parse. If the parse was successful, the start
         # symbol should have been completed in the last step of the Earley cycle, and will be in
         # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
-        solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]
+        solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]

         if not solutions:
             raise ParseError('Incomplete parse: Could not find a solution to input')
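
With columns as a plain list, the main loop threads an integer position through prediction and scanning. A condensed, runnable sketch of the loop shape above (the two stand-in functions elide the real logic):

    columns = [set()]                      # Earley sets, indexed by input position

    def predict_and_complete(i, to_scan):  # stand-in: operates on columns[i]
        pass

    def scan(i, token, to_scan):           # stand-in: creates Ei+1, returns the next Q
        columns.append(set())
        return set()

    to_scan = set()                        # the scan buffer, 'Q' in E.Scott's paper
    i = 0
    for token in "ab":                     # hypothetical two-token input
        predict_and_complete(i, to_scan)
        to_scan = scan(i, token, to_scan)
        i += 1
    predict_and_complete(i, to_scan)

    assert len(columns) == i + 1           # one Earley set per position, 0..i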
--- a/lark/parsers/earley_common.py
+++ b/lark/parsers/earley_common.py
@@ -35,6 +35,7 @@ class Item(object):
     __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash')
     def __init__(self, rule, ptr, start):
+        assert isinstance(start, int), "start is not an int"
         self.is_complete = len(rule.expansion) == ptr
         self.rule = rule    # rule
         self.ptr = ptr      # ptr
         self.start = start  # j
@@ -46,35 +47,16 @@ class Item(object):
         else:
             self.s = (rule, ptr)
             self.expect = rule.expansion[ptr]
-        self._hash = hash((self.s, self.start.i))
+        self._hash = hash((self.s, self.start))

     def advance(self):
         return self.__class__(self.rule, self.ptr + 1, self.start)

     def __eq__(self, other):
-        return self is other or (self.s == other.s and self.start.i == other.start.i)
+        return self is other or (self.s == other.s and self.start == other.start)

     def __hash__(self):
         return self._hash

     def __repr__(self):
-        return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i)
-
-class Column:
-    "An entry in the table, aka Earley Chart. Contains lists of items."
-    def __init__(self, i, FIRST):
-        self.i = i
-        self.items = set()
-        self.FIRST = FIRST
-
-    def add(self, item):
-        """Sort items into scan/predict/reduce newslists
-
-        Makes sure only unique items are added.
-        """
-        self.items.add(item)
-
-    def __bool__(self):
-        return bool(self.items)
-
-    __nonzero__ = __bool__ # Py2 backwards-compatibility
+        return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start)
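
Since start is now an int, an Item's identity is a hash over plain value types, which is what lets ordinary sets stand in for Column: items with equal (s, start) land in the same set slot. A sketch of the invariant, assuming the patched Item is importable and using a minimal stand-in rule (hypothetical):

    from lark.parsers.earley_common import Item

    class Rule:                             # stand-in with the attributes Item reads
        origin = 'start'
        expansion = ('A', 'B')

    rule = Rule()
    a = Item(rule, 0, 3)
    b = Item(rule, 0, 3)
    assert a == b and hash(a) == hash(b)    # identity is (s, start), start an int

    column = set()
    column.add(a)
    column.add(b)                           # deduplicated by __eq__/__hash__
    assert len(column) == 1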
--- a/lark/parsers/earley_forest.py
+++ b/lark/parsers/earley_forest.py
@@ -13,7 +13,7 @@ from ..exceptions import ParseError
 from ..lexer import Token
 from ..utils import Str
 from ..grammar import NonTerminal, Terminal
-from .earley_common import Column, Derivation
+from .earley_common import Derivation

 from collections import deque
 from importlib import import_module
@@ -60,7 +60,7 @@ class SymbolNode(ForestNode):
         return self is other or (self.s == other.s and self.start == other.start and self.end is other.end)

     def __hash__(self):
-        return hash((self.s, self.start.i, self.end.i))
+        return hash((self.s, self.start, self.end))

     def __repr__(self):
         if self.is_intermediate:
@@ -70,7 +70,7 @@ class SymbolNode(ForestNode):
             symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names))
         else:
             symbol = self.s.name
-        return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0)
+        return "(%s, %d, %d, %d)" % (symbol, self.start, self.end, self.priority if self.priority is not None else 0)

 class PackedNode(ForestNode):
     """
@@ -85,7 +85,7 @@ class PackedNode(ForestNode):
         self.left = left
         self.right = right
         self.priority = None
-        self._hash = hash((self.s, self.start.i, self.left, self.right))
+        self._hash = hash((self.s, self.start, self.left, self.right))

     @property
     def is_empty(self):
@@ -120,7 +120,7 @@ class PackedNode(ForestNode):
             symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names))
         else:
             symbol = self.s.name
-        return "{%s, %d, %d}" % (symbol, self.start.i, self.priority if self.priority is not None else 0)
+        return "{%s, %d, %d}" % (symbol, self.start, self.priority if self.priority is not None else 0)

 class ForestVisitor(object):
     """
--- a/lark/parsers/xearley.py
+++ b/lark/parsers/xearley.py
@@ -24,7 +24,7 @@ from ..tree import Tree
 from .grammar_analysis import GrammarAnalyzer
 from ..grammar import NonTerminal, Terminal
 from .earley import ApplyCallbacks
-from .earley_common import Column, Item
+from .earley_common import Item
 from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode
@@ -44,12 +44,13 @@ class Parser:
         # the slow 'isupper' in is_terminal.
         self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
         self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }

         for rule in parser_conf.rules:
             self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None)
             self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

-        self.term_matcher = term_matcher
         self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks)
+        self.term_matcher = term_matcher
+
     def parse(self, stream, start_symbol=None):
         start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
@@ -62,19 +63,20 @@ class Parser:
         # Cache for nodes & tokens created in a particular parse step.
         node_cache = {}
         token_cache = {}
+        columns = []

         text_line = 1
         text_column = 1

         def make_symbol_node(s, start, end):
-            label = (s, start.i, end.i)
+            label = (s, start, end)
             if label in node_cache:
                 node = node_cache[label]
             else:
                 node = node_cache[label] = SymbolNode(s, start, end)
             return node

-        def predict_and_complete(column, to_scan):
+        def predict_and_complete(i, to_scan):
             """The core Earley Predictor and Completer.

             At each stage of the input, we handle any completed items (things
@@ -84,15 +86,16 @@ class Parser:
             which can be added to the scan list for the next scanner cycle."""
             held_completions.clear()
+            column = columns[i]

             # R (items) = Ei (column.items)
-            items = deque(column.items)
+            items = deque(column)
             while items:
                 item = items.pop()   # remove an element, A say, from R

                 ### The Earley completer
                 if item.is_complete:   ### (item.s == string)
                     if item.node is None:
-                        item.node = make_symbol_node(item.s, item.start, column)
+                        item.node = make_symbol_node(item.s, item.start, i)
                         item.node.add_family(item.s, item.rule, item.start, None, None)

                     # Empty has 0 length. If we complete an empty symbol in a particular
@@ -100,19 +103,19 @@ class Parser:
                     # any predictions that result, that themselves require empty. Avoids
                     # infinite recursion on empty symbols.
                     # held_completions is 'H' in E.Scott's paper.
-                    is_empty_item = item.start.i == column.i
+                    is_empty_item = item.start == i
                     if is_empty_item:
                         held_completions[item.rule.origin] = item.node

-                    originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
+                    originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
                     for originator in originators:
                         new_item = originator.advance()
-                        new_item.node = make_symbol_node(new_item.s, originator.start, column)
+                        new_item.node = make_symbol_node(new_item.s, originator.start, i)
                         new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
                         if new_item.expect in self.TERMINALS:
                             # Add (B :: aC.B, h, y) to Q
                             to_scan.add(new_item)
-                        elif new_item not in column.items:
+                        elif new_item not in column:
                             # Add (B :: aC.B, h, y) to Ei and R
                             column.add(new_item)
                             items.append(new_item)
@@ -121,24 +124,24 @@ class Parser:
                 elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                     new_items = []
                     for rule in self.predictions[item.expect]:
-                        new_item = Item(rule, 0, column)
+                        new_item = Item(rule, 0, i)
                         new_items.append(new_item)

                     # Process any held completions (H).
                     if item.expect in held_completions:
                         new_item = item.advance()
-                        new_item.node = make_symbol_node(new_item.s, item.start, column)
+                        new_item.node = make_symbol_node(new_item.s, item.start, i)
                         new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                         new_items.append(new_item)

                     for new_item in new_items:
                         if new_item.expect in self.TERMINALS:
                             to_scan.add(new_item)
-                        elif new_item not in column.items:
+                        elif new_item not in column:
                             column.add(new_item)
                             items.append(new_item)

-        def scan(i, column, to_scan):
+        def scan(i, to_scan):
             """The core Earley Scanner.

             This is a custom implementation of the scanner that uses the
@@ -157,7 +160,7 @@ class Parser:
                 m = match(item.expect, stream, i)
                 if m:
                     t = Token(item.expect.name, m.group(0), i, text_line, text_column)
-                    delayed_matches[m.end()].append( (item, column, t) )
+                    delayed_matches[m.end()].append( (item, i, t) )

                     if self.complete_lex:
                         s = m.group(0)
@@ -165,7 +168,7 @@ class Parser:
                             m = match(item.expect, s[:-j])
                             if m:
                                 t = Token(item.expect.name, m.group(0), i, text_line, text_column)
-                                delayed_matches[i+m.end()].append( (item, column, t) )
+                                delayed_matches[i+m.end()].append( (item, i, t) )

             # Remove any items that successfully matched in this pass from the to_scan buffer.
             # This ensures we don't carry over tokens that already matched, if we're ignoring below.
@@ -179,13 +182,14 @@ class Parser:
                 m = match(x, stream, i)
                 if m:
                     # Carry over any items still in the scan buffer, to past the end of the ignored items.
-                    delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ])
+                    delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ])

                     # If we're ignoring up to the end of the file, carry over the start symbol if it already completed.
-                    delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol])
+                    delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])

-            next_set = Column(i + 1, self.FIRST)   # Ei+1
             next_to_scan = set()
+            next_set = set()
+            columns.append(next_set)

             ## 4) Process Tokens from delayed_matches.
             # This is the core of the Earley scanner. Create an SPPF node for each Token,
@@ -195,7 +199,8 @@ class Parser:
             for item, start, token in delayed_matches[i+1]:
                 if token is not None:
                     new_item = item.advance()
-                    new_item.node = make_symbol_node(new_item.s, new_item.start, column)
+                    # new_item.start = start   # Should we update this to account for gaps due to ignores?
+                    new_item.node = make_symbol_node(new_item.s, new_item.start, i)
                     new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)
                 else:
                     new_item = item
@@ -212,11 +217,10 @@ class Parser:
             if not next_set and not delayed_matches and not next_to_scan:
                 raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan))

-            return next_set, next_to_scan
+            return next_to_scan

         # Main loop starts
-        column0 = Column(0, self.FIRST)
-        column = column0
+        columns.append(set())

         ## The scan buffer. 'Q' in E.Scott's paper.
         to_scan = set()
@@ -225,38 +229,40 @@ class Parser:
         # Add predicted items to the first Earley set (for the predictor) if they
         # result in a non-terminal, or the scanner if they result in a terminal.
         for rule in self.predictions[start_symbol]:
-            item = Item(rule, 0, column0)
+            item = Item(rule, 0, 0)
             if item.expect in self.TERMINALS:
                 to_scan.add(item)
             else:
-                column.add(item)
+                columns[0].add(item)

         ## The main Earley loop.
         # Run the Prediction/Completion cycle for any Items in the current Earley set.
         # Completions will be added to the SPPF tree, and predictions will be recursively
         # processed down to terminals/empty nodes to be added to the scanner for the next
         # step.
-        for i, token in enumerate(stream):
-            predict_and_complete(column, to_scan)
+        i = 0
+        for token in stream:
+            predict_and_complete(i, to_scan)

             # Clear the node_cache and token_cache, which are only relevant for each
             # step in the Earley pass.
             node_cache.clear()
             token_cache.clear()
-            column, to_scan = scan(i, column, to_scan)
+            to_scan = scan(i, to_scan)

             if token == '\n':
                 text_line += 1
                 text_column = 1
             else:
                 text_column += 1
+            i += 1

-        predict_and_complete(column, to_scan)
+        predict_and_complete(i, to_scan)

         ## Column is now the final column in the parse. If the parse was successful, the start
         # symbol should have been completed in the last step of the Earley cycle, and will be in
         # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
-        solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]
+        solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]

         if not solutions:
             expected_tokens = [t.expect for t in to_scan]
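
In this scannerless variant, delayed_matches buffers prospective matches keyed by the offset at which they end, and after the patch each entry records the integer column it was scanned from rather than a Column object. A small illustration of the bookkeeping (offsets hypothetical):

    from collections import defaultdict

    delayed_matches = defaultdict(list)
    item, token = object(), object()               # stand-ins for an Item and a Token

    # A terminal scanned in column 3 whose text ends at input offset 7:
    delayed_matches[7].append((item, 3, token))    # was (item, column, token)

    # When the main loop reaches i == 6 it drains delayed_matches[i+1]; each
    # (item, start, token) entry is advanced and lands in columns[7] or the
    # next scan buffer.
    assert delayed_matches[7] == [(item, 3, token)]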