types of ambiguity.
- Rewritten along the lines of Elizabeth Scott's parser: https://www.sciencedirect.com/science/article/pii/S1571066108001497
- Implement SPPF trees per Elizabeth Scott and Bram van der Sanden's work: http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
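As a usage sketch of what this rewrite means for callers (hedged: the grammar is illustrative, and the internal .parser.parser.callbacks path is the one used by the updated tests below), ambiguity='explicit' now makes parse() return the root SymbolNode of the SPPF, which can be turned into the old '_ambig' tree with ForestToAmbiguousTreeVisitor:

from lark import Lark
from lark.parsers.earley_forest import ForestToAmbiguousTreeVisitor

grammar = """
    start: x y | xy
    x: "a"
    y: "b"
    xy: "a" "b"
"""
parser = Lark(grammar, parser='earley', ambiguity='explicit')

root_symbol = parser.parse('ab')   # the SPPF root (a SymbolNode), not a Tree
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go()
assert ambig_tree.data == '_ambig' and len(ambig_tree.children) == 2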
@@ -62,14 +62,13 @@ class LarkOptions(object): | |||
self.profile = o.pop('profile', False) | |||
self.ambiguity = o.pop('ambiguity', 'auto') | |||
self.propagate_positions = o.pop('propagate_positions', False) | |||
self.earley__predict_all = o.pop('earley__predict_all', False) | |||
self.lexer_callbacks = o.pop('lexer_callbacks', {}) | |||
assert self.parser in ('earley', 'lalr', 'cyk', None) | |||
if self.parser == 'earley' and self.transformer: | |||
raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm.' | |||
'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. lalr)') | |||
if self.ambiguity == 'explicit' and self.transformer: | |||
raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm for explicit ambiguity.' | |||
'Please use your transformer on the resulting Forest, or use a different algorithm (i.e. LALR)') | |||
if o: | |||
raise ValueError("Unknown options: %s" % o.keys()) | |||
@@ -176,7 +175,7 @@ class Lark: | |||
def _build_parser(self): | |||
self.parser_class = get_frontend(self.options.parser, self.options.lexer) | |||
self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr') | |||
self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit') | |||
callback = self._parse_tree_builder.create_callback(self.options.transformer) | |||
if self.profiler: | |||
for f in dir(callback): | |||
@@ -7,6 +7,7 @@ from .visitors import InlineTransformer # XXX Deprecated | |||
###{standalone | |||
from functools import partial, wraps | |||
from itertools import repeat, product | |||
class ExpandSingleChild: | |||
@@ -62,23 +63,11 @@ class PropagatePositions: | |||
class ChildFilter: | |||
"Optimized childfilter (assumes no duplication in parse tree, so it's safe to change it)" | |||
def __init__(self, to_include, node_builder): | |||
self.node_builder = node_builder | |||
self.to_include = to_include | |||
def __call__(self, children): | |||
filtered = [] | |||
for i, to_expand in self.to_include: | |||
if to_expand: | |||
filtered += children[i].children | |||
else: | |||
filtered.append(children[i]) | |||
return self.node_builder(filtered) | |||
class ChildFilterLALR(ChildFilter): | |||
"Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" | |||
def __call__(self, children): | |||
filtered = [] | |||
for i, to_expand in self.to_include: | |||
@@ -89,19 +78,43 @@ class ChildFilterLALR(ChildFilter): | |||
filtered = children[i].children | |||
else: | |||
filtered.append(children[i]) | |||
return self.node_builder(filtered) | |||
def _should_expand(sym): | |||
return not sym.is_term and sym.name.startswith('_') | |||
def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous): | |||
def maybe_create_child_filter(expansion, keep_all_tokens): | |||
to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) | |||
if keep_all_tokens or not (sym.is_term and sym.filter_out)] | |||
if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): | |||
return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include) | |||
return partial(ChildFilter, to_include) | |||
class AmbiguousExpander: | |||
"""Deal with the case where we're expanding children ('_rule') into a parent but the children | |||
are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself | |||
ambiguous with as many copies as their are ambiguous children, and then copy the ambiguous children | |||
into the right parents in the right places, essentially shifting the ambiguiuty up the tree.""" | |||
def __init__(self, to_expand, tree_class, node_builder): | |||
self.node_builder = node_builder | |||
self.tree_class = tree_class | |||
self.to_expand = to_expand | |||
def __call__(self, children): | |||
def _is_ambig_tree(child): | |||
return hasattr(child, 'data') and child.data == '_ambig' | |||
ambiguous = [i for i in self.to_expand if _is_ambig_tree(children[i])] | |||
if ambiguous: | |||
expand = [iter(child.children) if i in ambiguous else repeat(child) for i, child in enumerate(children)] | |||
return self.tree_class('_ambig', [self.node_builder(list(f[0])) for f in product(zip(*expand))]) | |||
return self.node_builder(children) | |||
def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens): | |||
to_expand = [i for i, sym in enumerate(expansion) | |||
if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))] | |||
if to_expand: | |||
return partial(AmbiguousExpander, to_expand, tree_class) | |||
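A hedged, self-contained sketch of the transformation AmbiguousExpander performs (the rule and node names here are illustrative, not from this diff): when the child at an expansion slot turns out to be an '_ambig' node, the ambiguity is lifted into the parent, producing one parent copy per ambiguous option.

from lark import Tree
from lark.parse_tree_builder import AmbiguousExpander

def make_parent(children):                              # stand-in node_builder
    return Tree('parent', children)

expander = AmbiguousExpander([0], Tree, make_parent)    # child 0 is the one being expanded

ambig_child = Tree('_ambig', [Tree('_opt', ['x']), Tree('_opt', ['y'])])
result = expander([ambig_child, 'z'])
# result: _ambig -> [parent(_opt(x), z), parent(_opt(y), z)]
assert result.data == '_ambig' and len(result.children) == 2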
class Callback(object): | |||
pass | |||
@@ -113,8 +126,6 @@ def ptb_inline_args(func): | |||
return func(*children) | |||
return f | |||
class ParseTreeBuilder: | |||
def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False): | |||
self.tree_class = tree_class | |||
@@ -135,7 +146,8 @@ class ParseTreeBuilder: | |||
wrapper_chain = filter(None, [ | |||
self.propagate_positions and PropagatePositions, | |||
(expand_single_child and not rule.alias) and ExpandSingleChild, | |||
maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous), | |||
maybe_create_child_filter(rule.expansion, keep_all_tokens), | |||
self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), | |||
]) | |||
yield rule, wrapper_chain | |||
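A hedged sketch of how the wrapper chain above is consumed (compose is illustrative; the composition order is an assumption about create_callback, not shown in this diff): each non-None entry wraps the callback built so far.

def compose(user_callback, wrapper_chain):
    f = user_callback
    for wrapper in wrapper_chain:   # e.g. ChildFilter(to_include, f) or AmbiguousExpander(to_expand, Tree, f)
        f = wrapper(f)
    return f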
@@ -4,8 +4,7 @@ from functools import partial | |||
from .utils import get_regexp_width | |||
from .parsers.grammar_analysis import GrammarAnalyzer | |||
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token | |||
from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk | |||
from .parsers import lalr_parser, earley, earley_forest, xearley, cyk | |||
from .tree import Tree | |||
class WithLexer: | |||
@@ -54,13 +53,13 @@ class LALR_CustomLexer(WithLexer): | |||
self.lexer = lexer_cls(lexer_conf) | |||
def get_ambiguity_resolver(options): | |||
def get_ambiguity_options(options): | |||
if not options or options.ambiguity == 'resolve': | |||
return resolve_ambig.standard_resolve_ambig | |||
return {} | |||
elif options.ambiguity == 'resolve__antiscore_sum': | |||
return resolve_ambig.antiscore_sum_resolve_ambig | |||
return {'forest_sum_visitor': earley_forest.ForestAntiscoreSumVisitor} | |||
elif options.ambiguity == 'explicit': | |||
return None | |||
return {'resolve_ambiguity': False} | |||
raise ValueError(options) | |||
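A hedged mini-check of the mapping implemented above (Options is a stand-in for LarkOptions; only the ambiguity attribute is consulted):

from collections import namedtuple
from lark.parser_frontends import get_ambiguity_options

Options = namedtuple('Options', ['ambiguity'])

assert get_ambiguity_options(None) == {}                # defaults to 'resolve'
assert get_ambiguity_options(Options('resolve')) == {}
assert get_ambiguity_options(Options('explicit')) == {'resolve_ambiguity': False}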
def tokenize_text(text): | |||
@@ -76,8 +75,7 @@ class Earley(WithLexer): | |||
def __init__(self, lexer_conf, parser_conf, options=None): | |||
self.init_traditional_lexer(lexer_conf) | |||
self.parser = earley.Parser(parser_conf, self.match, | |||
resolve_ambiguity=get_ambiguity_resolver(options)) | |||
self.parser = earley.Parser(parser_conf, self.match, **get_ambiguity_options(options)) | |||
def match(self, term, token): | |||
return term.name == token.type | |||
@@ -89,11 +87,10 @@ class XEarley: | |||
self._prepare_match(lexer_conf) | |||
kw.update(get_ambiguity_options(options)) | |||
self.parser = xearley.Parser(parser_conf, | |||
self.match, | |||
resolve_ambiguity=get_ambiguity_resolver(options), | |||
ignore=lexer_conf.ignore, | |||
predict_all=options.earley__predict_all, | |||
**kw | |||
) | |||
@@ -1,160 +1,44 @@ | |||
"This module implements an Earley Parser" | |||
# The parser uses a parse-forest to keep track of derivations and ambiguations. | |||
# When the parse ends successfully, a disambiguation stage resolves all ambiguity | |||
# (right now ambiguity resolution is not developed beyond the needs of lark) | |||
# Afterwards the parse tree is reduced (transformed) according to user callbacks. | |||
# I use the no-recursion version of Transformer, because the tree might be | |||
# deeper than Python's recursion limit (a bit absurd, but that's life) | |||
# | |||
# The algorithm keeps track of each state set, using a corresponding Column instance. | |||
# Column keeps track of new items using NewsList instances. | |||
# | |||
"""This module implements an scanerless Earley parser. | |||
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: | |||
https://www.sciencedirect.com/science/article/pii/S1571066108001497 | |||
That is probably the best reference for understanding the algorithm here. | |||
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format | |||
is better documented here: | |||
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ | |||
""" | |||
# Author: Erez Shinan (2017) | |||
# Email : erezshin@gmail.com | |||
from ..tree import Tree | |||
from ..visitors import Transformer_InPlace, v_args | |||
from ..exceptions import ParseError, UnexpectedToken | |||
from .grammar_analysis import GrammarAnalyzer | |||
from ..grammar import NonTerminal | |||
from .earley_common import Column, Item | |||
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode | |||
class Derivation(Tree): | |||
def __init__(self, rule, items=None): | |||
Tree.__init__(self, 'drv', items or []) | |||
self.meta.rule = rule | |||
self._hash = None | |||
def _pretty_label(self): # Nicer pretty for debugging the parser | |||
return self.meta.rule.origin.name if self.meta.rule else self.data | |||
def __hash__(self): | |||
if self._hash is None: | |||
self._hash = Tree.__hash__(self) | |||
return self._hash | |||
class Item(object): | |||
"An Earley Item, the atom of the algorithm." | |||
def __init__(self, rule, ptr, start, tree): | |||
self.rule = rule | |||
self.ptr = ptr | |||
self.start = start | |||
self.tree = tree if tree is not None else Derivation(self.rule) | |||
@property | |||
def expect(self): | |||
return self.rule.expansion[self.ptr] | |||
@property | |||
def is_complete(self): | |||
return self.ptr == len(self.rule.expansion) | |||
def advance(self, tree): | |||
assert self.tree.data == 'drv' | |||
new_tree = Derivation(self.rule, self.tree.children + [tree]) | |||
return self.__class__(self.rule, self.ptr+1, self.start, new_tree) | |||
def __eq__(self, other): | |||
return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule | |||
def __hash__(self): | |||
return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ | |||
def __repr__(self): | |||
before = list(map(str, self.rule.expansion[:self.ptr])) | |||
after = list(map(str, self.rule.expansion[self.ptr:])) | |||
return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after)) | |||
class NewsList(list): | |||
"Keeps track of newly added items (append-only)" | |||
def __init__(self, initial=None): | |||
list.__init__(self, initial or []) | |||
self.last_iter = 0 | |||
def get_news(self): | |||
i = self.last_iter | |||
self.last_iter = len(self) | |||
return self[i:] | |||
class Column: | |||
"An entry in the table, aka Earley Chart. Contains lists of items." | |||
def __init__(self, i, FIRST, predict_all=False): | |||
self.i = i | |||
self.to_reduce = NewsList() | |||
self.to_predict = NewsList() | |||
self.to_scan = [] | |||
self.item_count = 0 | |||
self.FIRST = FIRST | |||
self.predicted = set() | |||
self.completed = {} | |||
self.predict_all = predict_all | |||
def add(self, items): | |||
"""Sort items into scan/predict/reduce newslists | |||
Makes sure only unique items are added. | |||
""" | |||
for item in items: | |||
item_key = item, item.tree # Elsewhere, tree is not part of the comparison | |||
if item.is_complete: | |||
# XXX Potential bug: What happens if there's ambiguity in an empty rule? | |||
if item.rule.expansion and item_key in self.completed: | |||
old_tree = self.completed[item_key].tree | |||
if old_tree == item.tree: | |||
is_empty = not self.FIRST[item.rule.origin] | |||
if not is_empty: | |||
continue | |||
if old_tree.data != '_ambig': | |||
new_tree = old_tree.copy() | |||
new_tree.meta.rule = old_tree.meta.rule | |||
old_tree.set('_ambig', [new_tree]) | |||
old_tree.meta.rule = None # No longer a 'drv' node | |||
if item.tree.children[0] is old_tree: # XXX a little hacky! | |||
raise ParseError("Infinite recursion in grammar! (Rule %s)" % item.rule) | |||
if item.tree not in old_tree.children: | |||
old_tree.children.append(item.tree) | |||
# old_tree.children.append(item.tree) | |||
else: | |||
self.completed[item_key] = item | |||
self.to_reduce.append(item) | |||
else: | |||
if item.expect.is_term: | |||
self.to_scan.append(item) | |||
else: | |||
k = item_key if self.predict_all else item | |||
if k in self.predicted: | |||
continue | |||
self.predicted.add(k) | |||
self.to_predict.append(item) | |||
self.item_count += 1 # Only count if actually added | |||
def __bool__(self): | |||
return bool(self.item_count) | |||
__nonzero__ = __bool__ # Py2 backwards-compatibility | |||
from collections import deque, defaultdict | |||
class Parser: | |||
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None): | |||
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor): | |||
analysis = GrammarAnalyzer(parser_conf) | |||
self.parser_conf = parser_conf | |||
self.resolve_ambiguity = resolve_ambiguity | |||
self.forest_sum_visitor = forest_sum_visitor | |||
self.FIRST = analysis.FIRST | |||
self.postprocess = {} | |||
self.callbacks = {} | |||
self.predictions = {} | |||
## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than | |||
# the slow 'isupper' in is_terminal. | |||
self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } | |||
self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } | |||
for rule in parser_conf.rules: | |||
self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias) | |||
self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None) | |||
self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] | |||
self.term_matcher = term_matcher | |||
@@ -163,72 +47,163 @@ class Parser: | |||
def parse(self, stream, start_symbol=None): | |||
# Define parser functions | |||
start_symbol = NonTerminal(start_symbol or self.parser_conf.start) | |||
_Item = Item | |||
match = self.term_matcher | |||
def predict(nonterm, column): | |||
assert not nonterm.is_term, nonterm | |||
return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | |||
def complete(item): | |||
name = item.rule.origin | |||
return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name] | |||
def predict_and_complete(column): | |||
while True: | |||
to_predict = {x.expect for x in column.to_predict.get_news() | |||
if x.ptr} # if not part of an already predicted batch | |||
to_reduce = set(column.to_reduce.get_news()) | |||
if not (to_predict or to_reduce): | |||
break | |||
for nonterm in to_predict: | |||
column.add( predict(nonterm, column) ) | |||
for item in to_reduce: | |||
new_items = list(complete(item)) | |||
if item in new_items: | |||
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) | |||
column.add(new_items) | |||
def scan(i, token, column): | |||
next_set = Column(i, self.FIRST) | |||
next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token)) | |||
if not next_set: | |||
expect = {i.expect.name for i in column.to_scan} | |||
raise UnexpectedToken(token, expect, considered_rules=set(column.to_scan)) | |||
return next_set | |||
held_completions = defaultdict(list) | |||
node_cache = {} | |||
token_cache = {} | |||
def make_symbol_node(s, start, end): | |||
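# Only one SymbolNode may exist per (symbol, start, end) triple within a parse step,
# so look it up in node_cache and create it only on a miss; every derivation of the
# same span then shares that single node in the SPPF.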
label = (s, start.i, end.i) | |||
if label in node_cache: | |||
node = node_cache[label] | |||
else: | |||
node = node_cache[label] = SymbolNode(s, start, end) | |||
return node | |||
def predict_and_complete(column, to_scan): | |||
"""The core Earley Predictor and Completer. | |||
At each stage of the input, we handle any completed items (things | |||
that matched on the last cycle) and use those to predict what should | |||
come next in the input stream. The completions and any predicted | |||
non-terminals are recursively processed until we reach a set of items that expect terminals, | |||
which can be added to the scan list for the next scanner cycle.""" | |||
held_completions.clear() | |||
# R (items) = Ei (column.items) | |||
items = deque(column.items) | |||
while items: | |||
item = items.pop() # remove an element, A say, from R | |||
### The Earley completer | |||
if item.is_complete: ### (item.s == string) | |||
if item.node is None: | |||
item.node = make_symbol_node(item.s, item.start, column) | |||
item.node.add_family(item.s, item.rule, item.start, None, None) | |||
# Empty has 0 length. If we complete an empty symbol in a particular | |||
# parse step, we need to be able to use that same empty symbol to complete | |||
# any predictions that result, that themselves require empty. Avoids | |||
# infinite recursion on empty symbols. | |||
# held_completions is 'H' in E.Scott's paper. | |||
is_empty_item = item.start.i == column.i | |||
if is_empty_item: | |||
held_completions[item.rule.origin] = item.node | |||
originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] | |||
for originator in originators: | |||
new_item = originator.advance() | |||
new_item.node = make_symbol_node(new_item.s, originator.start, column) | |||
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) | |||
if new_item.expect in self.TERMINALS: | |||
# Add (B :: aC.B, h, y) to Q | |||
to_scan.add(new_item) | |||
elif new_item not in column.items: | |||
# Add (B :: aC.B, h, y) to Ei and R | |||
column.add(new_item) | |||
items.append(new_item) | |||
### The Earley predictor | |||
elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) | |||
new_items = [] | |||
for rule in self.predictions[item.expect]: | |||
new_item = Item(rule, 0, column) | |||
new_items.append(new_item) | |||
# Process any held completions (H). | |||
if item.expect in held_completions: | |||
new_item = item.advance() | |||
new_item.node = make_symbol_node(new_item.s, item.start, column) | |||
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) | |||
new_items.append(new_item) | |||
for new_item in new_items: | |||
if new_item.expect in self.TERMINALS: | |||
to_scan.add(new_item) | |||
elif new_item not in column.items: | |||
column.add(new_item) | |||
items.append(new_item) | |||
def scan(i, token, column, to_scan): | |||
"""The core Earley Scanner. | |||
This is a custom implementation of the scanner that uses the | |||
Lark lexer to match tokens. The scan list is built by the | |||
Earley predictor, based on the previously completed tokens. | |||
This ensures that at each phase of the parse we have a custom | |||
lexer context, allowing for more complex ambiguities.""" | |||
next_set = Column(i+1, self.FIRST) | |||
next_to_scan = set() | |||
for item in set(to_scan): | |||
if match(item.expect, token): | |||
new_item = item.advance() | |||
new_item.node = make_symbol_node(new_item.s, new_item.start, column) | |||
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) | |||
if new_item.expect in self.TERMINALS: | |||
# add (B ::= Aai+1.B, h, y) to Q' | |||
next_to_scan.add(new_item) | |||
else: | |||
# add (B ::= Aa+1.B, h, y) to Ei+1 | |||
next_set.add(new_item) | |||
if not next_set and not next_to_scan: | |||
expect = {i.expect.name for i in to_scan} | |||
raise UnexpectedToken(token, expect, considered_rules = set(to_scan)) | |||
return next_set, next_to_scan | |||
# Main loop starts | |||
column0 = Column(0, self.FIRST) | |||
column0.add(predict(start_symbol, column0)) | |||
column = column0 | |||
## The scan buffer. 'Q' in E.Scott's paper. | |||
to_scan = set() | |||
## Predict for the start_symbol. | |||
# Add predicted items to the first Earley set (for the predictor) if they | |||
# result in a non-terminal, or the scanner if they result in a terminal. | |||
for rule in self.predictions[start_symbol]: | |||
item = Item(rule, 0, column0) | |||
if item.expect in self.TERMINALS: | |||
to_scan.add(item) | |||
else: | |||
column.add(item) | |||
## The main Earley loop. | |||
# Run the Prediction/Completion cycle for any Items in the current Earley set. | |||
# Completions will be added to the SPPF tree, and predictions will be recursively | |||
# processed down to terminals/empty nodes to be added to the scanner for the next | |||
# step. | |||
for i, token in enumerate(stream): | |||
predict_and_complete(column) | |||
column = scan(i, token, column) | |||
predict_and_complete(column, to_scan) | |||
# Clear the node_cache and token_cache, which are only relevant for each | |||
# step in the Earley pass. | |||
node_cache.clear() | |||
token_cache.clear() | |||
column, to_scan = scan(i, token, column, to_scan) | |||
predict_and_complete(column) | |||
predict_and_complete(column, to_scan) | |||
# Parse ended. Now build a parse tree | |||
solutions = [n.tree for n in column.to_reduce | |||
if n.rule.origin==start_symbol and n.start is column0] | |||
## Column is now the final column in the parse. If the parse was successful, the start | |||
# symbol should have been completed in the last step of the Earley cycle, and will be in | |||
# this column. Find the item for the start_symbol, which is the root of the SPPF tree. | |||
solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] | |||
if not solutions: | |||
raise ParseError('Incomplete parse: Could not find a solution to input') | |||
elif len(solutions) == 1: | |||
tree = solutions[0] | |||
else: | |||
tree = Tree('_ambig', solutions) | |||
if self.resolve_ambiguity: | |||
tree = self.resolve_ambiguity(tree) | |||
elif len(solutions) > 1: | |||
raise ParseError('Earley should not generate multiple start symbol items!') | |||
return ApplyCallbacks(self.postprocess).transform(tree) | |||
## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller. | |||
# This means the caller can work directly with the SPPF tree. | |||
if not self.resolve_ambiguity: | |||
return solutions[0] | |||
# ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities | |||
# according to the rules. | |||
return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go() | |||
class ApplyCallbacks(Transformer_InPlace): | |||
def __init__(self, postprocess): | |||
@@ -0,0 +1,80 @@ | |||
"This module implements an Earley Parser" | |||
# The parser uses a parse-forest to keep track of derivations and ambiguations. | |||
# When the parse ends successfully, a disambiguation stage resolves all ambiguity | |||
# (right now ambiguity resolution is not developed beyond the needs of lark) | |||
# Afterwards the parse tree is reduced (transformed) according to user callbacks. | |||
# I use the no-recursion version of Transformer, because the tree might be | |||
# deeper than Python's recursion limit (a bit absurd, but that's life) | |||
# | |||
# The algorithm keeps track of each state set, using a corresponding Column instance. | |||
# Column keeps track of new items using NewsList instances. | |||
# | |||
# Author: Erez Shinan (2017) | |||
# Email : erezshin@gmail.com | |||
## for recursive repr | |||
from ..tree import Tree | |||
class Derivation(Tree): | |||
def __init__(self, rule, children = None): | |||
Tree.__init__(self, 'drv', children if children is not None else []) | |||
self.meta.rule = rule | |||
self._hash = None | |||
def __repr__(self, indent = 0): | |||
return 'Derivation(%s, %s, %s)' % (self.data, self.meta.rule.origin, '...') | |||
def __hash__(self): | |||
if self._hash is None: | |||
self._hash = Tree.__hash__(self) | |||
return self._hash | |||
class Item(object): | |||
"An Earley Item, the atom of the algorithm." | |||
__slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash') | |||
def __init__(self, rule, ptr, start): | |||
self.is_complete = len(rule.expansion) == ptr | |||
self.rule = rule # rule | |||
self.ptr = ptr # ptr | |||
self.start = start # j | |||
self.node = None # w | |||
if self.is_complete: | |||
self.s = rule.origin | |||
self.expect = None | |||
else: | |||
self.s = (rule, ptr) | |||
self.expect = rule.expansion[ptr] | |||
self._hash = hash((self.s, self.start.i)) | |||
def advance(self): | |||
return self.__class__(self.rule, self.ptr + 1, self.start) | |||
def __eq__(self, other): | |||
return self is other or (self.s == other.s and self.start.i == other.start.i) | |||
def __hash__(self): | |||
return self._hash | |||
def __repr__(self): | |||
return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i) | |||
class Column: | |||
"An entry in the table, aka Earley Chart. Contains lists of items." | |||
def __init__(self, i, FIRST): | |||
self.i = i | |||
self.items = set() | |||
self.FIRST = FIRST | |||
def add(self, item): | |||
"""Sort items into scan/predict/reduce newslists | |||
Makes sure only unique items are added. | |||
""" | |||
self.items.add(item) | |||
def __bool__(self): | |||
return bool(self.items) | |||
__nonzero__ = __bool__ # Py2 backwards-compatibility |
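A hedged sketch (assuming the Rule/Terminal/NonTerminal constructors in lark.grammar) of the identity rules above: Items compare and hash on (s, start.i) only, so duplicates produced by the predictor/completer collapse inside a Column's item set, and the SPPF node attached to one copy is shared.

from lark.grammar import Rule, NonTerminal, Terminal
from lark.parsers.earley_common import Column, Item

rule = Rule(NonTerminal('a'), [Terminal('A')])
col0 = Column(0, FIRST={})

x, y = Item(rule, 0, col0), Item(rule, 0, col0)
assert x == y and hash(x) == hash(y)   # the node (w) is deliberately not part of the identity

col0.add(x)
col0.add(y)
assert len(col0.items) == 1            # the Earley set keeps unique items only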
@@ -0,0 +1,347 @@ | |||
""""This module implements an SPPF implementation | |||
This is used as the primary output mechanism for the Earley parser | |||
in order to store complex ambiguities. | |||
Full reference and more details is here: | |||
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ | |||
""" | |||
from ..tree import Tree | |||
from ..exceptions import ParseError | |||
from ..lexer import Token | |||
from ..utils import Str | |||
from ..grammar import NonTerminal, Terminal | |||
from .earley_common import Column, Derivation | |||
from collections import deque | |||
class SymbolNode(object): | |||
""" | |||
A Symbol Node represents a symbol (or Intermediate LR0). | |||
Symbol nodes are keyed by the symbol (s). For intermediate nodes | |||
s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol | |||
nodes, s will be a string representing the non-terminal origin (i.e. | |||
the left hand side of the rule). | |||
The children of a Symbol or Intermediate Node will always be Packed Nodes; | |||
with each Packed Node child representing a single derivation of a production. | |||
Hence a Symbol Node with a single child is unambiguous. | |||
""" | |||
__slots__ = ('s', 'start', 'end', 'children', 'priority', 'is_intermediate') | |||
def __init__(self, s, start, end): | |||
self.s = s | |||
self.start = start | |||
self.end = end | |||
self.children = set() | |||
self.priority = None | |||
self.is_intermediate = isinstance(s, tuple) | |||
def add_family(self, lr0, rule, start, left, right): | |||
self.children.add(PackedNode(self, lr0, rule, start, left, right)) | |||
@property | |||
def is_ambiguous(self): | |||
return len(self.children) > 1 | |||
def __iter__(self): | |||
return iter(self.children) | |||
def __eq__(self, other): | |||
if not isinstance(other, SymbolNode): | |||
return False | |||
return self is other or (self.s == other.s and self.start == other.start and self.end is other.end) | |||
def __hash__(self): | |||
return hash((self.s, self.start.i, self.end.i)) | |||
def __repr__(self): | |||
symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name | |||
return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0) | |||
class PackedNode(object): | |||
""" | |||
A Packed Node represents a single derivation in a symbol node. | |||
""" | |||
__slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash') | |||
def __init__(self, parent, s, rule, start, left, right): | |||
self.parent = parent | |||
self.s = s | |||
self.start = start | |||
self.rule = rule | |||
self.left = left | |||
self.right = right | |||
self.priority = None | |||
self._hash = hash((self.s, self.start.i, self.left, self.right)) | |||
@property | |||
def is_empty(self): | |||
return self.left is None and self.right is None | |||
def __iter__(self): | |||
return iter([self.left, self.right]) | |||
def __lt__(self, other): | |||
if self.is_empty and not other.is_empty: return True | |||
if self.priority < other.priority: return True | |||
return False | |||
def __gt__(self, other): | |||
if self.is_empty and not other.is_empty: return True | |||
if self.priority > other.priority: return True | |||
return False | |||
def __eq__(self, other): | |||
if not isinstance(other, PackedNode): | |||
return False | |||
return self is other or (self.s == other.s and self.start == other.start and self.left == other.left and self.right == other.right) | |||
def __hash__(self): | |||
return self._hash | |||
def __repr__(self): | |||
symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name | |||
return "{%s, %d, %s, %s, %s}" % (symbol, self.start.i, self.left, self.right, self.priority if self.priority is not None else 0) | |||
class ForestVisitor(object): | |||
""" | |||
An abstract base class for building forest visitors. | |||
Use this as a base when you need to walk the forest. | |||
""" | |||
def __init__(self, root): | |||
self.root = root | |||
self.result = None | |||
def visit_token_node(self, node): pass | |||
def visit_symbol_node_in(self, node): pass | |||
def visit_symbol_node_out(self, node): pass | |||
def visit_packed_node_in(self, node): pass | |||
def visit_packed_node_out(self, node): pass | |||
def go(self): | |||
# 'visiting' holds the IDs of all symbol/intermediate nodes currently on | |||
# the stack. It serves two purposes: it detects when we 'recurse' in and out | |||
# of a symbol/intermediate node, so that we can process it both on the way down and | |||
# on the way up; and, since the SPPF can have cycles, it lets us detect an | |||
# attempt to recurse into a node that's already on the stack (infinite recursion). | |||
visiting = set() | |||
# We do not use recursion here to walk the Forest due to the limited | |||
# stack size in python. Therefore input_stack is essentially our stack. | |||
input_stack = deque([self.root]) | |||
# It is much faster to cache these as locals since they are called | |||
# many times in large parses. | |||
vpno = getattr(self, 'visit_packed_node_out') | |||
vpni = getattr(self, 'visit_packed_node_in') | |||
vsno = getattr(self, 'visit_symbol_node_out') | |||
vsni = getattr(self, 'visit_symbol_node_in') | |||
vtn = getattr(self, 'visit_token_node') | |||
while input_stack: | |||
current = next(reversed(input_stack)) | |||
try: | |||
next_node = next(current) | |||
except StopIteration: | |||
input_stack.pop() | |||
continue | |||
except TypeError: | |||
### If the current object is not an iterator, pass through to Token/SymbolNode | |||
pass | |||
else: | |||
if next_node is None: | |||
continue | |||
if id(next_node) in visiting: | |||
raise ParseError("Infinite recursion in grammar!") | |||
input_stack.append(next_node) | |||
continue | |||
if isinstance(current, Str): | |||
vtn(current) | |||
input_stack.pop() | |||
continue | |||
current_id = id(current) | |||
if current_id in visiting: | |||
if isinstance(current, PackedNode): vpno(current) | |||
else: vsno(current) | |||
input_stack.pop() | |||
visiting.remove(current_id) | |||
continue | |||
else: | |||
visiting.add(current_id) | |||
if isinstance(current, PackedNode): next_node = vpni(current) | |||
else: next_node = vsni(current) | |||
if next_node is None: | |||
continue | |||
if id(next_node) in visiting: | |||
raise ParseError("Infinite recursion in grammar!") | |||
input_stack.append(next_node) | |||
continue | |||
return self.result | |||
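A hedged example of a custom walker built on ForestVisitor (AmbiguityCounter is illustrative, not part of this diff): it counts ambiguous symbol nodes. The visit_*_in hooks return iterators of children, which go() pushes onto its explicit stack instead of recursing.

from lark.parsers.earley_forest import ForestVisitor

class AmbiguityCounter(ForestVisitor):
    def __init__(self, root):
        super(AmbiguityCounter, self).__init__(root)
        self.result = 0

    def visit_symbol_node_in(self, node):
        if node.is_ambiguous:
            self.result += 1
        return iter(node.children)               # descend into every packed family

    def visit_packed_node_in(self, node):
        return iter([node.left, node.right])     # descend into both halves of the derivation

# usage: AmbiguityCounter(root_symbol_node).go() returns the count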
class ForestSumVisitor(ForestVisitor): | |||
""" | |||
A visitor for prioritizing ambiguous parts of the Forest. | |||
This visitor is the default when resolving ambiguity. It pushes the priorities | |||
from the rules into the SPPF nodes; and then sorts the packed node children | |||
of ambiguous symbol or intermediate node according to the priorities. | |||
This relies on the custom sort function provided in PackedNode.__lt__; which | |||
uses these properties (and other factors) to sort the ambiguous packed nodes. | |||
""" | |||
def visit_packed_node_in(self, node): | |||
return iter([node.left, node.right]) | |||
def visit_symbol_node_in(self, node): | |||
return iter(node.children) | |||
def visit_packed_node_out(self, node): | |||
node.priority = 0 | |||
if node.rule.options and node.rule.options.priority: node.priority += node.rule.options.priority | |||
if node.right is not None and hasattr(node.right, 'priority'): node.priority += node.right.priority | |||
if node.left is not None and hasattr(node.left, 'priority'): node.priority += node.left.priority | |||
def visit_symbol_node_out(self, node): | |||
node.priority = max(child.priority for child in node.children) | |||
node.children = sorted(node.children, reverse = True) | |||
class ForestAntiscoreSumVisitor(ForestSumVisitor): | |||
""" | |||
A visitor for prioritizing ambiguous parts of the Forest. | |||
This visitor is used when resolve_ambiguity == 'resolve__antiscore_sum'. | |||
It pushes the priorities from the rules into the SPPF nodes, and implements | |||
a 'least cost' mechanism for resolving ambiguity (reverse of the default | |||
priority mechanism). It uses a custom __lt__ comparator key for sorting | |||
the packed node children. | |||
""" | |||
def visit_symbol_node_out(self, node): | |||
node.priority = min(child.priority for child in node.children) | |||
node.children = sorted(node.children, key=AntiscoreSumComparator, reverse = True) | |||
class AntiscoreSumComparator(object): | |||
""" | |||
An antiscore-sum comparator for PackedNode objects. | |||
This allows 'sorting' an iterable of PackedNode objects so that they | |||
are arranged lowest priority first. | |||
""" | |||
__slots__ = ['obj'] | |||
def __init__(self, obj, *args): | |||
self.obj = obj | |||
def __lt__(self, other): | |||
if self.obj.is_empty and not other.obj.is_empty: return True | |||
if self.obj.priority > other.obj.priority: return True | |||
return False | |||
def __gt__(self, other): | |||
if self.obj.is_empty and not other.obj.is_empty: return True | |||
if self.obj.priority < other.obj.priority: return True | |||
return False | |||
class ForestToTreeVisitor(ForestVisitor): | |||
""" | |||
A Forest visitor which converts an SPPF forest to an unambiguous AST. | |||
The implementation in this visitor walks only the first ambiguous child | |||
of each symbol node. When it finds an ambiguous symbol node it first | |||
calls the forest_sum_visitor implementation to sort the children | |||
into preference order using the algorithms defined there; so the first | |||
child should always be the highest preference. The forest_sum_visitor | |||
implementation should be another ForestVisitor which sorts the children | |||
according to some priority mechanism. | |||
""" | |||
def __init__(self, root, forest_sum_visitor = ForestSumVisitor, callbacks = None): | |||
super(ForestToTreeVisitor, self).__init__(root) | |||
self.forest_sum_visitor = forest_sum_visitor | |||
self.output_stack = deque() | |||
self.callbacks = callbacks | |||
self.result = None | |||
def visit_token_node(self, node): | |||
self.output_stack[-1].append(node) | |||
def visit_symbol_node_in(self, node): | |||
if node.is_ambiguous and node.priority is None: | |||
self.forest_sum_visitor(node).go() | |||
return next(iter(node.children)) | |||
def visit_packed_node_in(self, node): | |||
if not node.parent.is_intermediate: | |||
self.output_stack.append([]) | |||
return iter([node.left, node.right]) | |||
def visit_packed_node_out(self, node): | |||
if not node.parent.is_intermediate: | |||
result = self.callbacks[node.rule](self.output_stack.pop()) | |||
if self.output_stack: | |||
self.output_stack[-1].append(result) | |||
else: | |||
self.result = result | |||
class ForestToAmbiguousTreeVisitor(ForestVisitor): | |||
""" | |||
A Forest visitor which converts an SPPF forest to an ambiguous AST. | |||
Because of the fundamental disparity between what can be stored in | |||
an SPPF and what can be stored in a Tree, this implementation is not | |||
complete. It correctly deals with ambiguities that occur on symbol nodes only, | |||
and cannot deal with ambiguities that occur on intermediate nodes. | |||
Usually, most parsers can be rewritten to avoid intermediate node | |||
ambiguities. Also, this implementation could be fixed; however, | |||
the code to handle intermediate node ambiguities is messy and | |||
would not be performant. It is much better not to use this and | |||
instead to correctly disambiguate the forest and only store unambiguous | |||
parses in Trees. It is here just to provide some parity with the | |||
old ambiguity='explicit'. | |||
This is mainly used by the test framework, to make it simpler to write | |||
tests ensuring the SPPF contains the right results. | |||
""" | |||
def __init__(self, root, callbacks): | |||
super(ForestToAmbiguousTreeVisitor, self).__init__(root) | |||
self.output_stack = deque() | |||
self.callbacks = callbacks | |||
self.result = None | |||
def visit_token_node(self, node): | |||
self.output_stack[-1].children.append(node) | |||
def visit_symbol_node_in(self, node): | |||
if not node.is_intermediate and node.is_ambiguous: | |||
self.output_stack.append(Tree('_ambig', [])) | |||
return iter(node.children) | |||
def visit_symbol_node_out(self, node): | |||
if node.is_ambiguous: | |||
result = self.output_stack.pop() | |||
if self.output_stack: | |||
self.output_stack[-1].children.append(result) | |||
else: | |||
self.result = result | |||
def visit_packed_node_in(self, node): | |||
#### NOTE: | |||
## When an intermediate node (node.parent.s == tuple) has ambiguous children this | |||
## forest visitor will break. | |||
if not node.parent.is_intermediate: | |||
self.output_stack.append(Tree('drv', [])) | |||
return iter([node.left, node.right]) | |||
def visit_packed_node_out(self, node): | |||
if not node.parent.is_intermediate: | |||
result = self.callbacks[node.rule](self.output_stack.pop().children) | |||
if self.output_stack: | |||
self.output_stack[-1].children.append(result) | |||
else: | |||
self.result = result |
@@ -1,109 +0,0 @@ | |||
from ..utils import compare | |||
from functools import cmp_to_key | |||
from ..tree import Tree | |||
# Standard ambiguity resolver (uses comparison) | |||
# | |||
# Author: Erez Sh | |||
def _compare_rules(rule1, rule2): | |||
return -compare( len(rule1.expansion), len(rule2.expansion)) | |||
def _sum_priority(tree): | |||
p = 0 | |||
for n in tree.iter_subtrees(): | |||
try: | |||
p += n.meta.rule.options.priority or 0 | |||
except AttributeError: | |||
pass | |||
return p | |||
def _compare_priority(tree1, tree2): | |||
tree1.iter_subtrees() | |||
def _compare_drv(tree1, tree2): | |||
try: | |||
rule1 = tree1.meta.rule | |||
except AttributeError: | |||
rule1 = None | |||
try: | |||
rule2 = tree2.meta.rule | |||
except AttributeError: | |||
rule2 = None | |||
if None == rule1 == rule2: | |||
return compare(tree1, tree2) | |||
elif rule1 is None: | |||
return -1 | |||
elif rule2 is None: | |||
return 1 | |||
assert tree1.data != '_ambig' | |||
assert tree2.data != '_ambig' | |||
p1 = _sum_priority(tree1) | |||
p2 = _sum_priority(tree2) | |||
c = (p1 or p2) and compare(p1, p2) | |||
if c: | |||
return c | |||
c = _compare_rules(tree1.meta.rule, tree2.meta.rule) | |||
if c: | |||
return c | |||
# rules are "equal", so compare trees | |||
if len(tree1.children) == len(tree2.children): | |||
for t1, t2 in zip(tree1.children, tree2.children): | |||
c = _compare_drv(t1, t2) | |||
if c: | |||
return c | |||
return compare(len(tree1.children), len(tree2.children)) | |||
def _standard_resolve_ambig(tree): | |||
assert tree.data == '_ambig' | |||
key_f = cmp_to_key(_compare_drv) | |||
best = max(tree.children, key=key_f) | |||
assert best.data == 'drv' | |||
tree.set('drv', best.children) | |||
tree.meta.rule = best.meta.rule # needed for applying callbacks | |||
def standard_resolve_ambig(tree): | |||
for ambig in tree.find_data('_ambig'): | |||
_standard_resolve_ambig(ambig) | |||
return tree | |||
# Anti-score Sum | |||
# | |||
# Author: Uriva (https://github.com/uriva) | |||
def _antiscore_sum_drv(tree): | |||
if not isinstance(tree, Tree): | |||
return 0 | |||
assert tree.data != '_ambig' | |||
return _sum_priority(tree) | |||
def _antiscore_sum_resolve_ambig(tree): | |||
assert tree.data == '_ambig' | |||
best = min(tree.children, key=_antiscore_sum_drv) | |||
assert best.data == 'drv' | |||
tree.set('drv', best.children) | |||
tree.meta.rule = best.meta.rule # needed for applying callbacks | |||
def antiscore_sum_resolve_ambig(tree): | |||
for ambig in tree.find_data('_ambig'): | |||
_antiscore_sum_resolve_ambig(ambig) | |||
return tree |
@@ -1,107 +1,163 @@ | |||
"This module implements an experimental Earley Parser with a dynamic lexer" | |||
# The parser uses a parse-forest to keep track of derivations and ambiguations. | |||
# When the parse ends successfully, a disambiguation stage resolves all ambiguity | |||
# (right now ambiguity resolution is not developed beyond the needs of lark) | |||
# Afterwards the parse tree is reduced (transformed) according to user callbacks. | |||
# I use the no-recursion version of Transformer and Visitor, because the tree might be | |||
# deeper than Python's recursion limit (a bit absurd, but that's life) | |||
# | |||
# The algorithm keeps track of each state set, using a corresponding Column instance. | |||
# Column keeps track of new items using NewsList instances. | |||
# | |||
# Instead of running a lexer beforehand, or using a costy char-by-char method, this parser | |||
# uses regular expressions by necessity, achieving high-performance while maintaining all of | |||
# Earley's power in parsing any CFG. | |||
# | |||
# | |||
"""This module implements an experimental Earley parser with a dynamic lexer | |||
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: | |||
https://www.sciencedirect.com/science/article/pii/S1571066108001497 | |||
That is probably the best reference for understanding the algorithm here. | |||
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format | |||
is better documented here: | |||
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ | |||
Instead of running a lexer beforehand, or using a costly char-by-char method, this parser | |||
uses regular expressions by necessity, achieving high-performance while maintaining all of | |||
Earley's power in parsing any CFG. | |||
""" | |||
# Author: Erez Shinan (2017) | |||
# Email : erezshin@gmail.com | |||
from collections import defaultdict | |||
from collections import defaultdict, deque | |||
from ..exceptions import ParseError, UnexpectedCharacters | |||
from ..lexer import Token | |||
from ..tree import Tree | |||
from .grammar_analysis import GrammarAnalyzer | |||
from ..grammar import NonTerminal, Terminal | |||
from .earley import ApplyCallbacks, Item, Column | |||
from .earley import ApplyCallbacks | |||
from .earley_common import Column, Item | |||
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode | |||
class Parser: | |||
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False, complete_lex=False): | |||
self.analysis = GrammarAnalyzer(parser_conf) | |||
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor, ignore = (), complete_lex = False): | |||
analysis = GrammarAnalyzer(parser_conf) | |||
self.parser_conf = parser_conf | |||
self.resolve_ambiguity = resolve_ambiguity | |||
self.forest_sum_visitor = forest_sum_visitor | |||
self.ignore = [Terminal(t) for t in ignore] | |||
self.predict_all = predict_all | |||
self.complete_lex = complete_lex | |||
self.FIRST = self.analysis.FIRST | |||
self.postprocess = {} | |||
self.FIRST = analysis.FIRST | |||
self.callbacks = {} | |||
self.predictions = {} | |||
## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than | |||
# the slow 'isupper' in is_terminal. | |||
self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } | |||
self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } | |||
for rule in parser_conf.rules: | |||
self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) | |||
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||
self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None) | |||
self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] | |||
self.term_matcher = term_matcher | |||
def parse(self, stream, start_symbol=None): | |||
# Define parser functions | |||
start_symbol = NonTerminal(start_symbol or self.parser_conf.start) | |||
delayed_matches = defaultdict(list) | |||
match = self.term_matcher | |||
text_line = 1 | |||
text_column = 1 | |||
def predict(nonterm, column): | |||
assert not nonterm.is_term, nonterm | |||
return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | |||
def complete(item): | |||
name = item.rule.origin | |||
return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name] | |||
def predict_and_complete(column): | |||
while True: | |||
to_predict = {x.expect for x in column.to_predict.get_news() | |||
if x.ptr} # if not part of an already predicted batch | |||
to_reduce = column.to_reduce.get_news() | |||
if not (to_predict or to_reduce): | |||
break | |||
for nonterm in to_predict: | |||
column.add( predict(nonterm, column) ) | |||
for item in to_reduce: | |||
new_items = list(complete(item)) | |||
if item in new_items: | |||
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) | |||
column.add(new_items) | |||
def scan(i, column): | |||
to_scan = column.to_scan | |||
# Held Completions (H in E.Scotts paper). | |||
held_completions = {} | |||
for x in self.ignore: | |||
m = match(x, stream, i) | |||
if m: | |||
delayed_matches[m.end()] += set(to_scan) | |||
delayed_matches[m.end()] += set(column.to_reduce) | |||
# Cache for nodes & tokens created in a particular parse step. | |||
node_cache = {} | |||
token_cache = {} | |||
# TODO add partial matches for ignore too? | |||
# s = m.group(0) | |||
# for j in range(1, len(s)): | |||
# m = x.match(s[:-j]) | |||
# if m: | |||
# delayed_matches[m.end()] += to_scan | |||
text_line = 1 | |||
text_column = 0 | |||
for item in to_scan: | |||
def make_symbol_node(s, start, end): | |||
label = (s, start.i, end.i) | |||
if label in node_cache: | |||
node = node_cache[label] | |||
else: | |||
node = node_cache[label] = SymbolNode(s, start, end) | |||
return node | |||
def predict_and_complete(column, to_scan): | |||
"""The core Earley Predictor and Completer. | |||
At each stage of the input, we handle any completed items (things | |||
that matched on the last cycle) and use those to predict what should | |||
come next in the input stream. The completions and any predicted | |||
non-terminals are recursively processed until we reach a set of items that expect terminals, | |||
which can be added to the scan list for the next scanner cycle.""" | |||
held_completions.clear() | |||
# R (items) = Ei (column.items) | |||
items = deque(column.items) | |||
while items: | |||
item = items.pop() # remove an element, A say, from R | |||
### The Earley completer | |||
if item.is_complete: ### (item.s == string) | |||
if item.node is None: | |||
item.node = make_symbol_node(item.s, item.start, column) | |||
item.node.add_family(item.s, item.rule, item.start, None, None) | |||
# Empty has 0 length. If we complete an empty symbol in a particular | |||
# parse step, we need to be able to use that same empty symbol to complete | |||
# any predictions that result, that themselves require empty. Avoids | |||
# infinite recursion on empty symbols. | |||
# held_completions is 'H' in E.Scott's paper. | |||
is_empty_item = item.start.i == column.i | |||
if is_empty_item: | |||
held_completions[item.rule.origin] = item.node | |||
originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] | |||
for originator in originators: | |||
new_item = originator.advance() | |||
new_item.node = make_symbol_node(new_item.s, originator.start, column) | |||
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) | |||
if new_item.expect in self.TERMINALS: | |||
# Add (B :: aC.B, h, y) to Q | |||
to_scan.add(new_item) | |||
elif new_item not in column.items: | |||
# Add (B :: aC.B, h, y) to Ei and R | |||
column.add(new_item) | |||
items.append(new_item) | |||
### The Earley predictor | |||
elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) | |||
new_items = [] | |||
for rule in self.predictions[item.expect]: | |||
new_item = Item(rule, 0, column) | |||
new_items.append(new_item) | |||
# Process any held completions (H). | |||
if item.expect in held_completions: | |||
new_item = item.advance() | |||
new_item.node = make_symbol_node(new_item.s, item.start, column) | |||
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) | |||
new_items.append(new_item) | |||
for new_item in new_items: | |||
if new_item.expect in self.TERMINALS: | |||
to_scan.add(new_item) | |||
elif new_item not in column.items: | |||
column.add(new_item) | |||
items.append(new_item) | |||
def scan(i, column, to_scan): | |||
"""The core Earley Scanner. | |||
This is a custom implementation of the scanner that uses the | |||
Lark lexer to match tokens. The scan list is built by the | |||
Earley predictor, based on the previously completed tokens. | |||
This ensures that at each phase of the parse we have a custom | |||
lexer context, allowing for more complex ambiguities.""" | |||
# 1) Loop over the expectations and ask the lexer to match. | |||
# Since the regexp is forward-looking on the input stream, and we only | |||
# want to process a token at the point in the stream where it completes, | |||
# we push all matches into a buffer (delayed_matches), to be held until | |||
# the later parse step in which they complete. | |||
for item in set(to_scan): | |||
m = match(item.expect, stream, i) | |||
if m: | |||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
delayed_matches[m.end()].append(item.advance(t)) | |||
delayed_matches[m.end()].append( (item, column, t) ) | |||
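# m.end() is an absolute offset into the input stream, so this tuple is only consumed
# once the main loop reaches that offset (see the delayed_matches processing below).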
if self.complete_lex: | |||
s = m.group(0) | |||
@@ -109,25 +165,85 @@ class Parser: | |||
m = match(item.expect, s[:-j]) | |||
if m: | |||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
delayed_matches[i+m.end()].append(item.advance(t)) | |||
delayed_matches[i+m.end()].append( (item, column, t) ) | |||
# Remove any items that successfully matched in this pass from the to_scan buffer. | |||
# This ensures we don't carry over tokens that already matched, if we're ignoring below. | |||
to_scan.remove(item) | |||
# 3) Process any ignores. This is typically used for e.g. whitespace. | |||
# We carry over any unmatched items from the to_scan buffer to be matched again after | |||
# the ignore. This should allow us to use ignored symbols in non-terminals to implement | |||
# e.g. mandatory spacing. | |||
for x in self.ignore: | |||
m = match(x, stream, i) | |||
if m: | |||
# Carry over any items still in the scan buffer, to past the end of the ignored items. | |||
delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ]) | |||
# If we're ignoring up to the end of the file, carry over the start symbol if it already completed. | |||
delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol]) | |||
next_set = Column(i + 1, self.FIRST) # Ei+1 | |||
next_to_scan = set() | |||
## 4) Process Tokens from delayed_matches. | |||
# This is the core of the Earley scanner. Create an SPPF node for each Token, | |||
# and create the symbol node in the SPPF tree. Advance the item that completed, | |||
# and add the resulting new item to either the Earley set (for processing by the | |||
# completer/predictor) or the to_scan buffer for the next parse step. | |||
for item, start, token in delayed_matches[i+1]: | |||
if token is not None: | |||
new_item = item.advance() | |||
new_item.node = make_symbol_node(new_item.s, new_item.start, column) | |||
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) | |||
else: | |||
new_item = item | |||
if new_item.expect in self.TERMINALS: | |||
# add (B ::= Aai+1.B, h, y) to Q' | |||
next_to_scan.add(new_item) | |||
else: | |||
# add (B ::= Aa+1.B, h, y) to Ei+1 | |||
next_set.add(new_item) | |||
next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) | |||
next_set.add(delayed_matches[i+1]) | |||
del delayed_matches[i+1] # No longer needed, so unburden memory | |||
if not next_set and not delayed_matches: | |||
if not next_set and not delayed_matches and not next_to_scan: | |||
raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan)) | |||
return next_set | |||
return next_set, next_to_scan | |||
# Main loop starts | |||
column0 = Column(0, self.FIRST, predict_all=self.predict_all) | |||
column0.add(predict(start_symbol, column0)) | |||
column0 = Column(0, self.FIRST) | |||
column = column0 | |||
## The scan buffer. 'Q' in E.Scott's paper. | |||
to_scan = set() | |||
## Predict for the start_symbol. | |||
# Add predicted items to the first Earley set (for the predictor) if they | |||
# result in a non-terminal, or the scanner if they result in a terminal. | |||
for rule in self.predictions[start_symbol]: | |||
item = Item(rule, 0, column0) | |||
if item.expect in self.TERMINALS: | |||
to_scan.add(item) | |||
else: | |||
column.add(item) | |||
## The main Earley loop. | |||
# Run the Prediction/Completion cycle for any Items in the current Earley set. | |||
# Completions will be added to the SPPF tree, and predictions will be recursively | |||
# processed down to terminals/empty nodes to be added to the scanner for the next | |||
# step. | |||
for i, token in enumerate(stream): | |||
predict_and_complete(column) | |||
column = scan(i, column) | |||
predict_and_complete(column, to_scan) | |||
# Clear the node_cache and token_cache, which are only relevant for each | |||
# step in the Earley pass. | |||
node_cache.clear() | |||
token_cache.clear() | |||
column, to_scan = scan(i, column, to_scan) | |||
if token == '\n': | |||
text_line += 1 | |||
@@ -135,24 +251,24 @@ class Parser: | |||
else: | |||
text_column += 1 | |||
predict_and_complete(column) | |||
predict_and_complete(column, to_scan) | |||
# Parse ended. Now build a parse tree | |||
solutions = [n.tree for n in column.to_reduce | |||
if n.rule.origin==start_symbol and n.start is column0] | |||
## Column is now the final column in the parse. If the parse was successful, the start | |||
# symbol should have been completed in the last step of the Earley cycle, and will be in | |||
# this column. Find the item for the start_symbol, which is the root of the SPPF tree. | |||
solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] | |||
if not solutions: | |||
expected_tokens = [t.expect for t in column.to_scan] | |||
expected_tokens = [t.expect for t in to_scan] | |||
raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens) | |||
elif len(solutions) > 1: | |||
raise Exception('Earley should not generate more than one start symbol - bug') | |||
elif len(solutions) == 1: | |||
tree = solutions[0] | |||
else: | |||
tree = Tree('_ambig', solutions) | |||
if self.resolve_ambiguity: | |||
tree = self.resolve_ambiguity(tree) | |||
return ApplyCallbacks(self.postprocess).transform(tree) | |||
## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller. | |||
# This means the caller can work directly with the SPPF tree. | |||
if not self.resolve_ambiguity: | |||
return solutions[0] | |||
# ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities | |||
# according to the rules. | |||
return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go() |
@@ -114,7 +114,7 @@ class Reconstructor: | |||
def _reconstruct(self, tree): | |||
# TODO: ambiguity? | |||
parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=resolve_ambig.standard_resolve_ambig) | |||
parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=True) | |||
unreduced_tree = parser.parse(tree.children) # find a full derivation | |||
assert unreduced_tree.data == tree.data | |||
res = self.write_tokens.transform(unreduced_tree) | |||
@@ -21,6 +21,8 @@ from lark.lark import Lark | |||
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput | |||
from lark.tree import Tree | |||
from lark.visitors import Transformer | |||
from lark.parsers.earley_forest import ForestToAmbiguousTreeVisitor | |||
from lark.parsers.earley import ApplyCallbacks | |||
__path__ = os.path.dirname(__file__) | |||
def _read(n, *args): | |||
@@ -236,10 +238,11 @@ def _make_full_earley_test(LEXER): | |||
""" | |||
parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit') | |||
res = parser.parse('ab') | |||
self.assertEqual( res.data, '_ambig') | |||
self.assertEqual( len(res.children), 2) | |||
root_symbol = parser.parse('ab') | |||
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go() | |||
print(ambig_tree.pretty()) | |||
self.assertEqual( ambig_tree.data, '_ambig') | |||
self.assertEqual( len(ambig_tree.children), 2) | |||
def test_ambiguity1(self): | |||
grammar = """ | |||
@@ -251,9 +254,35 @@ def _make_full_earley_test(LEXER): | |||
""" | |||
l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) | |||
x = l.parse('cde') | |||
assert x.data == '_ambig', x | |||
assert len(x.children) == 2 | |||
root_symbol = l.parse('cde') | |||
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, l.parser.parser.callbacks).go() | |||
print(ambig_tree.pretty()) | |||
# tree = ApplyCallbacks(l.parser.parser.postprocess).transform(ambig_tree) | |||
assert ambig_tree.data == '_ambig', ambig_tree | |||
assert len(ambig_tree.children) == 2 | |||
@unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") | |||
def test_ambiguity2(self): | |||
grammar = """ | |||
ANY: /[a-zA-Z0-9 ]+/ | |||
a.2: "A" b+ | |||
b.2: "B" | |||
c: ANY | |||
start: (a|c)* | |||
""" | |||
l = Lark(grammar, parser='earley', lexer=LEXER) | |||
res = l.parse('ABX') | |||
expected = Tree('start', [ | |||
Tree('a', [ | |||
Tree('b', []) | |||
]), | |||
Tree('c', [ | |||
'X' | |||
]) | |||
]) | |||
self.assertEqual(res, expected) | |||
def test_fruitflies_ambig(self): | |||
grammar = """ | |||
@@ -272,7 +301,9 @@ def _make_full_earley_test(LEXER): | |||
%ignore WS | |||
""" | |||
parser = Lark(grammar, ambiguity='explicit', lexer=LEXER) | |||
res = parser.parse('fruit flies like bananas') | |||
root_symbol = parser.parse('fruit flies like bananas') | |||
tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go() | |||
# tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree) | |||
expected = Tree('_ambig', [ | |||
Tree('comparative', [ | |||
@@ -290,7 +321,7 @@ def _make_full_earley_test(LEXER): | |||
# print res.pretty() | |||
# print expected.pretty() | |||
self.assertEqual(res, expected) | |||
self.assertEqual(tree, expected) | |||
@unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser") | |||
@@ -303,7 +334,9 @@ def _make_full_earley_test(LEXER): | |||
text = """cat""" | |||
parser = Lark(grammar, start='start', ambiguity='explicit') | |||
tree = parser.parse(text) | |||
root_symbol = parser.parse(text) | |||
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol).go() | |||
tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree) | |||
self.assertEqual(tree.data, '_ambig') | |||
combinations = {tuple(str(s) for s in t.children) for t in tree.children} | |||