@@ -66,7 +66,7 @@ class UnexpectedCharacters(LexError, UnexpectedInput):
        if allowed:
            message += '\nExpecting: %s\n' % allowed
        super(UnexpectedCharacters, self).__init__(message)
        super(UnexpectedCharacters, self).__init__(message.encode('utf-8'))
@@ -84,6 +84,6 @@ class UnexpectedToken(ParseError, UnexpectedInput):
                   "Expected one of: \n\t* %s\n"
                   % (token, self.line, self.column, '\n\t* '.join(self.expected)))
        super(UnexpectedToken, self).__init__(message)
        super(UnexpectedToken, self).__init__(message.encode('utf-8'))
###}
@@ -62,14 +62,13 @@ class LarkOptions(object):
        self.profile = o.pop('profile', False)
        self.ambiguity = o.pop('ambiguity', 'auto')
        self.propagate_positions = o.pop('propagate_positions', False)
        self.earley__predict_all = o.pop('earley__predict_all', False)
        self.lexer_callbacks = o.pop('lexer_callbacks', {})
        assert self.parser in ('earley', 'lalr', 'cyk', None)
        if self.parser == 'earley' and self.transformer:
            raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm.'
                             'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. lalr)')
        if self.ambiguity == 'explicit' and self.transformer:
            raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm for explicit ambiguity.'
                             'Please use your transformer on the resulting Forest, or use a different algorithm (i.e. LALR)')
        if o:
            raise ValueError("Unknown options: %s" % o.keys())
@@ -176,7 +175,7 @@ class Lark:
    def _build_parser(self):
        self.parser_class = get_frontend(self.options.parser, self.options.lexer)
        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr')
        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit')
        callback = self._parse_tree_builder.create_callback(self.options.transformer)
        if self.profiler:
            for f in dir(callback):
@@ -7,6 +7,7 @@ from .visitors import InlineTransformer # XXX Deprecated
###{standalone
from functools import partial, wraps
from itertools import repeat, product
class ExpandSingleChild:
@@ -62,23 +63,11 @@ class PropagatePositions:
class ChildFilter:
    "Optimized childfilter (assumes no duplication in parse tree, so it's safe to change it)"
    def __init__(self, to_include, node_builder):
        self.node_builder = node_builder
        self.to_include = to_include
    def __call__(self, children):
        filtered = []
        for i, to_expand in self.to_include:
            if to_expand:
                filtered += children[i].children
            else:
                filtered.append(children[i])
        return self.node_builder(filtered)
class ChildFilterLALR(ChildFilter):
    "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"
    def __call__(self, children):
        filtered = []
        for i, to_expand in self.to_include:
@@ -89,19 +78,43 @@ class ChildFilterLALR(ChildFilter):
                filtered = children[i].children
            else:
                filtered.append(children[i])
        return self.node_builder(filtered)
def _should_expand(sym):
    return not sym.is_term and sym.name.startswith('_')
def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous):
def maybe_create_child_filter(expansion, keep_all_tokens):
    to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion)
                  if keep_all_tokens or not (sym.is_term and sym.filter_out)]
    if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
        return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include)
        return partial(ChildFilter, to_include)
class AmbiguousExpander:
"""Deal with the case where we're expanding children ('_rule') into a parent but the children | |||||
are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself | |||||
ambiguous with as many copies as their are ambiguous children, and then copy the ambiguous children | |||||
into the right parents in the right places, essentially shifting the ambiguiuty up the tree.""" | |||||
    def __init__(self, to_expand, tree_class, node_builder):
        self.node_builder = node_builder
        self.tree_class = tree_class
        self.to_expand = to_expand
    def __call__(self, children):
        def _is_ambig_tree(child):
            return hasattr(child, 'data') and child.data == '_ambig'
        ambiguous = [i for i in self.to_expand if _is_ambig_tree(children[i])]
        if ambiguous:
            expand = [iter(child.children) if i in ambiguous else repeat(child) for i, child in enumerate(children)]
            return self.tree_class('_ambig', [self.node_builder(list(f[0])) for f in product(zip(*expand))])
        return self.node_builder(children)
def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens):
    to_expand = [i for i, sym in enumerate(expansion)
                 if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))]
    if to_expand:
        return partial(AmbiguousExpander, to_expand, tree_class)
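A hedged illustration (not part of the diff) of the transformation AmbiguousExpander performs. The import path and build_parent are assumptions; build_parent stands in for the wrapped node_builder chain.

from lark import Tree
from lark.parse_tree_builder import AmbiguousExpander   # internal module; path may vary by version

def build_parent(children):
    # stand-in for the wrapped node_builder
    return Tree('parent', children)

# The child at position 0 would normally be inlined into 'parent', but it is ambiguous here:
child = Tree('_ambig', [Tree('_inlined', ['a']), Tree('_inlined', ['b'])])

expander = AmbiguousExpander(to_expand=[0], tree_class=Tree, node_builder=build_parent)
result = expander([child])
# result is an ambiguous parent, rather than a parent holding an '_ambig' child:
#   Tree('_ambig', [Tree('parent', [Tree('_inlined', ['a'])]),
#                   Tree('parent', [Tree('_inlined', ['b'])])])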
class Callback(object):
    pass
@@ -113,8 +126,6 @@ def ptb_inline_args(func):
        return func(*children)
    return f
class ParseTreeBuilder:
    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False):
        self.tree_class = tree_class
@@ -133,9 +144,10 @@ class ParseTreeBuilder:
            expand_single_child = options.expand1 if options else False
            wrapper_chain = filter(None, [
                (expand_single_child and not rule.alias) and ExpandSingleChild,
                maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous),
                self.propagate_positions and PropagatePositions,
                (expand_single_child and not rule.alias) and ExpandSingleChild,
                maybe_create_child_filter(rule.expansion, keep_all_tokens),
                self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens),
            ])
            yield rule, wrapper_chain
@@ -4,8 +4,7 @@ from functools import partial
from .utils import get_regexp_width
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token
from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
from .parsers import lalr_parser, earley, earley_forest, xearley, cyk
from .tree import Tree
class WithLexer:
@@ -56,13 +55,13 @@ class LALR_CustomLexer(WithLexer):
        self.lexer = lexer_cls(lexer_conf)
def get_ambiguity_resolver(options):
def get_ambiguity_options(options):
    if not options or options.ambiguity == 'resolve':
        return resolve_ambig.standard_resolve_ambig
        return {}
    elif options.ambiguity == 'resolve__antiscore_sum':
        return resolve_ambig.antiscore_sum_resolve_ambig
        return {'forest_sum_visitor': earley_forest.ForestAntiscoreSumVisitor}
    elif options.ambiguity == 'explicit':
        return None
        return {'resolve_ambiguity': False}
    raise ValueError(options)
def tokenize_text(text):
@@ -78,8 +77,7 @@ class Earley(WithLexer):
    def __init__(self, lexer_conf, parser_conf, options=None):
        self.init_traditional_lexer(lexer_conf)
        self.parser = earley.Parser(parser_conf, self.match,
                                    resolve_ambiguity=get_ambiguity_resolver(options))
        self.parser = earley.Parser(parser_conf, self.match, **get_ambiguity_options(options))
    def match(self, term, token):
        return term.name == token.type
@@ -91,11 +89,10 @@ class XEarley:
        self._prepare_match(lexer_conf)
        kw.update(get_ambiguity_options(options))
        self.parser = xearley.Parser(parser_conf,
                                     self.match,
                                     resolve_ambiguity=get_ambiguity_resolver(options),
                                     ignore=lexer_conf.ignore,
                                     predict_all=options.earley__predict_all,
                                     **kw
                                     )
@@ -1,160 +1,44 @@
"This module implements an Earley Parser"
# The parser uses a parse-forest to keep track of derivations and ambiguations.
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
# (right now ambiguity resolution is not developed beyond the needs of lark)
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
# I use the no-recursion version of Transformer, because the tree might be
# deeper than Python's recursion limit (a bit absurd, but that's life)
#
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
"""This module implements an scanerless Earley parser. | |||||
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: | |||||
https://www.sciencedirect.com/science/article/pii/S1571066108001497 | |||||
That is probably the best reference for understanding the algorithm here. | |||||
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format | |||||
is better documented here: | |||||
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ | |||||
""" | |||||
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
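A hedged example (not taken from the diff) of the kind of ambiguity the SPPF is built to share: "1+2+3" has two derivations, (1+2)+3 and 1+(2+3). With the default ambiguity='resolve' the forest is collapsed into a single tree; ambiguity='explicit' keeps the alternatives. The grammar is illustrative only.

from lark import Lark

parser = Lark("""
    start: expr
    expr: expr "+" expr
        | NUMBER
    %import common.NUMBER
""", parser='earley')

print(parser.parse("1+2+3").pretty())   # one grouping, chosen by the ambiguity resolver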
from ..tree import Tree
from ..visitors import Transformer_InPlace, v_args
from ..exceptions import ParseError, UnexpectedToken
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Column, Item
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode
class Derivation(Tree):
    def __init__(self, rule, items=None):
        Tree.__init__(self, 'drv', items or [])
        self.meta.rule = rule
        self._hash = None
    def _pretty_label(self):    # Nicer pretty for debugging the parser
        return self.meta.rule.origin.name if self.meta.rule else self.data
    def __hash__(self):
        if self._hash is None:
            self._hash = Tree.__hash__(self)
        return self._hash
class Item(object):
    "An Earley Item, the atom of the algorithm."
    def __init__(self, rule, ptr, start, tree):
        self.rule = rule
        self.ptr = ptr
        self.start = start
        self.tree = tree if tree is not None else Derivation(self.rule)
    @property
    def expect(self):
        return self.rule.expansion[self.ptr]
    @property
    def is_complete(self):
        return self.ptr == len(self.rule.expansion)
    def advance(self, tree):
        assert self.tree.data == 'drv'
        new_tree = Derivation(self.rule, self.tree.children + [tree])
        return self.__class__(self.rule, self.ptr+1, self.start, new_tree)
    def __eq__(self, other):
        return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
    def __hash__(self):
        return hash((self.rule, self.ptr, id(self.start)))   # Always runs Derivation.__hash__
    def __repr__(self):
        before = list(map(str, self.rule.expansion[:self.ptr]))
        after = list(map(str, self.rule.expansion[self.ptr:]))
        return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
class NewsList(list):
    "Keeps track of newly added items (append-only)"
    def __init__(self, initial=None):
        list.__init__(self, initial or [])
        self.last_iter = 0
    def get_news(self):
        i = self.last_iter
        self.last_iter = len(self)
        return self[i:]
class Column:
    "An entry in the table, aka Earley Chart. Contains lists of items."
    def __init__(self, i, FIRST, predict_all=False):
        self.i = i
        self.to_reduce = NewsList()
        self.to_predict = NewsList()
        self.to_scan = []
        self.item_count = 0
        self.FIRST = FIRST
        self.predicted = set()
        self.completed = {}
        self.predict_all = predict_all
    def add(self, items):
        """Sort items into scan/predict/reduce newslists
        Makes sure only unique items are added.
        """
        for item in items:
            item_key = item, item.tree   # Elsewhere, tree is not part of the comparison
            if item.is_complete:
                # XXX Potential bug: What happens if there's ambiguity in an empty rule?
                if item.rule.expansion and item_key in self.completed:
                    old_tree = self.completed[item_key].tree
                    if old_tree == item.tree:
                        is_empty = not self.FIRST[item.rule.origin]
                        if not is_empty:
                            continue
                    if old_tree.data != '_ambig':
                        new_tree = old_tree.copy()
                        new_tree.meta.rule = old_tree.meta.rule
                        old_tree.set('_ambig', [new_tree])
                        old_tree.meta.rule = None    # No longer a 'drv' node
                    if item.tree.children[0] is old_tree:   # XXX a little hacky!
                        raise ParseError("Infinite recursion in grammar! (Rule %s)" % item.rule)
                    if item.tree not in old_tree.children:
                        old_tree.children.append(item.tree)
                    # old_tree.children.append(item.tree)
                else:
                    self.completed[item_key] = item
                self.to_reduce.append(item)
            else:
                if item.expect.is_term:
                    self.to_scan.append(item)
                else:
                    k = item_key if self.predict_all else item
                    if k in self.predicted:
                        continue
                    self.predicted.add(k)
                    self.to_predict.append(item)
            self.item_count += 1    # Only count if actually added
    def __bool__(self):
        return bool(self.item_count)
    __nonzero__ = __bool__  # Py2 backwards-compatibility
from collections import deque, defaultdict
class Parser:
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None):
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor):
        analysis = GrammarAnalyzer(parser_conf)
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.forest_sum_visitor = forest_sum_visitor
        self.FIRST = analysis.FIRST
        self.postprocess = {}
        self.callbacks = {}
        self.predictions = {}
        ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
        # the slow 'isupper' in is_terminal.
        self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
        self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }
        for rule in parser_conf.rules:
            self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
            self.callbacks[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
            self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]
        self.term_matcher = term_matcher
@@ -163,72 +47,163 @@ class Parser:
    def parse(self, stream, start_symbol=None):
        # Define parser functions
        start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
        _Item = Item
        match = self.term_matcher
        def predict(nonterm, column):
            assert not nonterm.is_term, nonterm
            return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
        def complete(item):
            name = item.rule.origin
            return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
        def predict_and_complete(column):
            while True:
                to_predict = {x.expect for x in column.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = set(column.to_reduce.get_news())
                if not (to_predict or to_reduce):
                    break
                for nonterm in to_predict:
                    column.add( predict(nonterm, column) )
                for item in to_reduce:
                    new_items = list(complete(item))
                    if item in new_items:
                        raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
                    column.add(new_items)
        def scan(i, token, column):
            next_set = Column(i, self.FIRST)
            next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token))
            if not next_set:
                expect = {i.expect.name for i in column.to_scan}
                raise UnexpectedToken(token, expect, considered_rules=set(column.to_scan))
            return next_set
        held_completions = defaultdict(list)
        node_cache = {}
        token_cache = {}
        def make_symbol_node(s, start, end):
            label = (s, start.i, end.i)
            if label in node_cache:
                node = node_cache[label]
            else:
                node = node_cache[label] = SymbolNode(s, start, end)
            return node
        def predict_and_complete(column, to_scan):
            """The core Earley Predictor and Completer.
            At each stage of the input, we handle any completed items (things
            that matched on the last cycle) and use those to predict what should
            come next in the input stream. The completions and any predicted
            non-terminals are recursively processed until we reach a set of terminals,
            which can be added to the scan list for the next scanner cycle."""
            held_completions.clear()
            # R (items) = Ei (column.items)
            items = deque(column.items)
            while items:
                item = items.pop()    # remove an element, A say, from R
                ### The Earley completer
                if item.is_complete:   ### (item.s == string)
                    if item.node is None:
                        item.node = make_symbol_node(item.s, item.start, column)
                        item.node.add_family(item.s, item.rule, item.start, None, None)
                    # Empty has 0 length. If we complete an empty symbol in a particular
                    # parse step, we need to be able to use that same empty symbol to complete
                    # any predictions that result, that themselves require empty. Avoids
                    # infinite recursion on empty symbols.
                    # held_completions is 'H' in E.Scott's paper.
                    is_empty_item = item.start.i == column.i
                    if is_empty_item:
                        held_completions[item.rule.origin] = item.node
                    originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
                    for originator in originators:
                        new_item = originator.advance()
                        new_item.node = make_symbol_node(new_item.s, originator.start, column)
                        new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
                        if new_item.expect in self.TERMINALS:
                            # Add (B :: aC.B, h, y) to Q
                            to_scan.add(new_item)
                        elif new_item not in column.items:
                            # Add (B :: aC.B, h, y) to Ei and R
                            column.add(new_item)
                            items.append(new_item)
                ### The Earley predictor
                elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                    new_items = []
                    for rule in self.predictions[item.expect]:
                        new_item = Item(rule, 0, column)
                        new_items.append(new_item)
                    # Process any held completions (H).
                    if item.expect in held_completions:
                        new_item = item.advance()
                        new_item.node = make_symbol_node(new_item.s, item.start, column)
                        new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                        new_items.append(new_item)
                    for new_item in new_items:
                        if new_item.expect in self.TERMINALS:
                            to_scan.add(new_item)
                        elif new_item not in column.items:
                            column.add(new_item)
                            items.append(new_item)
        def scan(i, token, column, to_scan):
            """The core Earley Scanner.
            This is a custom implementation of the scanner that uses the
            Lark lexer to match tokens. The scan list is built by the
            Earley predictor, based on the previously completed tokens.
            This ensures that at each phase of the parse we have a custom
            lexer context, allowing for more complex ambiguities."""
            next_set = Column(i+1, self.FIRST)
            next_to_scan = set()
            for item in set(to_scan):
                if match(item.expect, token):
                    new_item = item.advance()
                    new_item.node = make_symbol_node(new_item.s, new_item.start, column)
                    new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)
                    if new_item.expect in self.TERMINALS:
                        # add (B ::= Aai+1.B, h, y) to Q'
                        next_to_scan.add(new_item)
                    else:
                        # add (B ::= Aa+1.B, h, y) to Ei+1
                        next_set.add(new_item)
            if not next_set and not next_to_scan:
                expect = {i.expect.name for i in to_scan}
                raise UnexpectedToken(token, expect, considered_rules = set(to_scan))
            return next_set, next_to_scan
        # Main loop starts
        column0 = Column(0, self.FIRST)
        column0.add(predict(start_symbol, column0))
        column = column0
        ## The scan buffer. 'Q' in E.Scott's paper.
        to_scan = set()
        ## Predict for the start_symbol.
        # Add predicted items to the first Earley set (for the predictor) if they
        # result in a non-terminal, or the scanner if they result in a terminal.
        for rule in self.predictions[start_symbol]:
            item = Item(rule, 0, column0)
            if item.expect in self.TERMINALS:
                to_scan.add(item)
            else:
                column.add(item)
        ## The main Earley loop.
        # Run the Prediction/Completion cycle for any Items in the current Earley set.
        # Completions will be added to the SPPF tree, and predictions will be recursively
        # processed down to terminals/empty nodes to be added to the scanner for the next
        # step.
        for i, token in enumerate(stream):
            predict_and_complete(column)
            column = scan(i, token, column)
            predict_and_complete(column, to_scan)
            # Clear the node_cache and token_cache, which are only relevant for each
            # step in the Earley pass.
            node_cache.clear()
            token_cache.clear()
            column, to_scan = scan(i, token, column, to_scan)
        predict_and_complete(column)
        predict_and_complete(column, to_scan)
        # Parse ended. Now build a parse tree
        solutions = [n.tree for n in column.to_reduce
                     if n.rule.origin==start_symbol and n.start is column0]
        ## Column is now the final column in the parse. If the parse was successful, the start
        # symbol should have been completed in the last step of the Earley cycle, and will be in
        # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
        solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]
        if not solutions:
            raise ParseError('Incomplete parse: Could not find a solution to input')
        elif len(solutions) == 1:
            tree = solutions[0]
        else:
            tree = Tree('_ambig', solutions)
        if self.resolve_ambiguity:
            tree = self.resolve_ambiguity(tree)
        elif len(solutions) > 1:
            raise ParseError('Earley should not generate multiple start symbol items!')
        return ApplyCallbacks(self.postprocess).transform(tree)
        ## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller.
        # This means the caller can work directly with the SPPF tree.
        if not self.resolve_ambiguity:
            return solutions[0]
        # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities
        # according to the rules.
        return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go()
class ApplyCallbacks(Transformer_InPlace):
    def __init__(self, postprocess):
@@ -0,0 +1,80 @@
"This module implements an Earley Parser"
# The parser uses a parse-forest to keep track of derivations and ambiguations.
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
# (right now ambiguity resolution is not developed beyond the needs of lark)
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
# I use the no-recursion version of Transformer, because the tree might be
# deeper than Python's recursion limit (a bit absurd, but that's life)
#
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
## for recursive repr
from ..tree import Tree
class Derivation(Tree):
    def __init__(self, rule, children = None):
        Tree.__init__(self, 'drv', children if children is not None else [])
        self.meta.rule = rule
        self._hash = None
    def __repr__(self, indent = 0):
        return 'Derivation(%s, %s, %s)' % (self.data, self.rule.origin, '...')
    def __hash__(self):
        if self._hash is None:
            self._hash = Tree.__hash__(self)
        return self._hash
class Item(object):
    "An Earley Item, the atom of the algorithm."
    __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash')
    def __init__(self, rule, ptr, start):
        self.is_complete = len(rule.expansion) == ptr
        self.rule = rule    # rule
        self.ptr = ptr      # ptr
        self.start = start  # j
        self.node = None    # w
        if self.is_complete:
            self.s = rule.origin
            self.expect = None
        else:
            self.s = (rule, ptr)
            self.expect = rule.expansion[ptr]
        self._hash = hash((self.s, self.start.i))
    def advance(self):
        return self.__class__(self.rule, self.ptr + 1, self.start)
    def __eq__(self, other):
        return self is other or (self.s == other.s and self.start.i == other.start.i)
    def __hash__(self):
        return self._hash
    def __repr__(self):
        return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i)
class Column:
    "An entry in the table, aka Earley Chart. Contains lists of items."
    def __init__(self, i, FIRST):
        self.i = i
        self.items = set()
        self.FIRST = FIRST
    def add(self, item):
        """Sort items into scan/predict/reduce newslists
        Makes sure only unique items are added.
        """
        self.items.add(item)
    def __bool__(self):
        return bool(self.items)
    __nonzero__ = __bool__  # Py2 backwards-compatibility
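A hedged sketch (not part of the diff) of how Item identity works in the new earley_common module: items compare and hash by their LR0 position (rule, ptr) and the index of their start column. FakeRule and the import path are assumptions made for illustration.

from collections import namedtuple
from lark.parsers.earley_common import Item, Column   # module added in this change; path may vary

FakeRule = namedtuple('FakeRule', 'origin expansion')  # exposes only what Item touches
rule = FakeRule('expr', ('expr', 'PLUS', 'expr'))

col0 = Column(0, FIRST={})
assert Item(rule, 1, col0) == Item(rule, 1, col0)   # same dot position, same start column
assert Item(rule, 1, col0) != Item(rule, 2, col0)   # different dot position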
@@ -0,0 +1,347 @@
"""This module implements an SPPF implementation
This is used as the primary output mechanism for the Earley parser
in order to store complex ambiguities.
A full reference and more details are here:
    http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
"""
from ..tree import Tree
from ..exceptions import ParseError
from ..lexer import Token
from ..utils import Str
from ..grammar import NonTerminal, Terminal
from .earley_common import Column, Derivation
from collections import deque
class SymbolNode(object):
    """
    A Symbol Node represents a symbol (or Intermediate LR0).
    Symbol nodes are keyed by the symbol (s). For intermediate nodes
    s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol
    nodes, s will be a string representing the non-terminal origin (i.e.
    the left hand side of the rule).
    The children of a Symbol or Intermediate Node will always be Packed Nodes;
    with each Packed Node child representing a single derivation of a production.
    Hence a Symbol Node with a single child is unambiguous.
    """
    __slots__ = ('s', 'start', 'end', 'children', 'priority', 'is_intermediate')
    def __init__(self, s, start, end):
        self.s = s
        self.start = start
        self.end = end
        self.children = set()
        self.priority = None
        self.is_intermediate = isinstance(s, tuple)
    def add_family(self, lr0, rule, start, left, right):
        self.children.add(PackedNode(self, lr0, rule, start, left, right))
    @property
    def is_ambiguous(self):
        return len(self.children) > 1
    def __iter__(self):
        return iter(self.children)
    def __eq__(self, other):
        if not isinstance(other, SymbolNode):
            return False
        return self is other or (self.s == other.s and self.start == other.start and self.end is other.end)
    def __hash__(self):
        return hash((self.s, self.start.i, self.end.i))
    def __repr__(self):
        symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name
        return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0)
class PackedNode(object):
    """
    A Packed Node represents a single derivation in a symbol node.
    """
    __slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash')
    def __init__(self, parent, s, rule, start, left, right):
        self.parent = parent
        self.s = s
        self.start = start
        self.rule = rule
        self.left = left
        self.right = right
        self.priority = None
        self._hash = hash((self.s, self.start.i, self.left, self.right))
    @property
    def is_empty(self):
        return self.left is None and self.right is None
    def __iter__(self):
        return iter([self.left, self.right])
    def __lt__(self, other):
        if self.is_empty and not other.is_empty: return True
        if self.priority < other.priority: return True
        return False
    def __gt__(self, other):
        if self.is_empty and not other.is_empty: return True
        if self.priority > other.priority: return True
        return False
    def __eq__(self, other):
        if not isinstance(other, PackedNode):
            return False
        return self is other or (self.s == other.s and self.start == other.start and self.left == other.left and self.right == other.right)
    def __hash__(self):
        return self._hash
    def __repr__(self):
        symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name
        return "{%s, %d, %s, %s, %s}" % (symbol, self.start.i, self.left, self.right, self.priority if self.priority is not None else 0)
class ForestVisitor(object):
    """
    An abstract base class for building forest visitors.
    Use this as a base when you need to walk the forest.
    """
    def __init__(self, root):
        self.root = root
        self.result = None
    def visit_token_node(self, node): pass
    def visit_symbol_node_in(self, node): pass
    def visit_symbol_node_out(self, node): pass
    def visit_packed_node_in(self, node): pass
    def visit_packed_node_out(self, node): pass
    def go(self):
        # Visiting is a list of IDs of all symbol/intermediate nodes currently in
        # the stack. It serves two purposes: to detect when we 'recurse' in and out
        # of a symbol/intermediate so that we can process both up and down. Also,
        # since the SPPF can have cycles it allows us to detect if we're trying
        # to recurse into a node that's already on the stack (infinite recursion).
        visiting = set()
        # We do not use recursion here to walk the Forest due to the limited
        # stack size in python. Therefore input_stack is essentially our stack.
        input_stack = deque([self.root])
        # It is much faster to cache these as locals since they are called
        # many times in large parses.
        vpno = getattr(self, 'visit_packed_node_out')
        vpni = getattr(self, 'visit_packed_node_in')
        vsno = getattr(self, 'visit_symbol_node_out')
        vsni = getattr(self, 'visit_symbol_node_in')
        vtn = getattr(self, 'visit_token_node')
        while input_stack:
            current = next(reversed(input_stack))
            try:
                next_node = next(current)
            except StopIteration:
                input_stack.pop()
                continue
            except TypeError:
                ### If the current object is not an iterator, pass through to Token/SymbolNode
                pass
            else:
                if next_node is None:
                    continue
                if id(next_node) in visiting:
                    raise ParseError("Infinite recursion in grammar!")
                input_stack.append(next_node)
                continue
            if isinstance(current, Str):
                vtn(current)
                input_stack.pop()
                continue
            current_id = id(current)
            if current_id in visiting:
                if isinstance(current, PackedNode): vpno(current)
                else: vsno(current)
                input_stack.pop()
                visiting.remove(current_id)
                continue
            else:
                visiting.add(current_id)
                if isinstance(current, PackedNode): next_node = vpni(current)
                else: next_node = vsni(current)
                if next_node is None:
                    continue
                if id(next_node) in visiting:
                    raise ParseError("Infinite recursion in grammar!")
                input_stack.append(next_node)
                continue
        return self.result
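A hedged sketch (not in the diff) of a custom visitor built on the ForestVisitor base above; it counts ambiguous symbol/intermediate nodes in an SPPF. The class name and the sppf_root variable in the usage comment are illustrative.

class AmbiguityCounter(ForestVisitor):
    def __init__(self, root):
        super(AmbiguityCounter, self).__init__(root)
        self.result = 0
    def visit_symbol_node_in(self, node):
        if node.is_ambiguous:
            self.result += 1
        return iter(node.children)
    def visit_packed_node_in(self, node):
        return iter([node.left, node.right])

# count = AmbiguityCounter(sppf_root).go()   # sppf_root: the SymbolNode returned when ambiguity is not resolved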
class ForestSumVisitor(ForestVisitor):
    """
    A visitor for prioritizing ambiguous parts of the Forest.
    This visitor is the default when resolving ambiguity. It pushes the priorities
    from the rules into the SPPF nodes; and then sorts the packed node children
    of ambiguous symbol or intermediate node according to the priorities.
    This relies on the custom sort function provided in PackedNode.__lt__; which
    uses these properties (and other factors) to sort the ambiguous packed nodes.
    """
    def visit_packed_node_in(self, node):
        return iter([node.left, node.right])
    def visit_symbol_node_in(self, node):
        return iter(node.children)
    def visit_packed_node_out(self, node):
        node.priority = 0
        if node.rule.options and node.rule.options.priority: node.priority += node.rule.options.priority
        if node.right is not None and hasattr(node.right, 'priority'): node.priority += node.right.priority
        if node.left is not None and hasattr(node.left, 'priority'): node.priority += node.left.priority
    def visit_symbol_node_out(self, node):
        node.priority = max(child.priority for child in node.children)
        node.children = sorted(node.children, reverse = True)
class ForestAntiscoreSumVisitor(ForestSumVisitor):
    """
    A visitor for prioritizing ambiguous parts of the Forest.
    This visitor is used when resolve_ambiguity == 'resolve__antiscore_sum'.
    It pushes the priorities from the rules into the SPPF nodes, and implements
    a 'least cost' mechanism for resolving ambiguity (reverse of the default
    priority mechanism). It uses a custom __lt__ comparator key for sorting
    the packed node children.
    """
    def visit_symbol_node_out(self, node):
        node.priority = min(child.priority for child in node.children)
        node.children = sorted(node.children, key=AntiscoreSumComparator, reverse = True)
class AntiscoreSumComparator(object):
    """
    An antiscore-sum comparator for PackedNode objects.
    This allows 'sorting' an iterable of PackedNode objects so that they
    are arranged lowest priority first.
    """
    __slots__ = ['obj']
    def __init__(self, obj, *args):
        self.obj = obj
    def __lt__(self, other):
        if self.obj.is_empty and not other.obj.is_empty: return True
        if self.obj.priority > other.obj.priority: return True
        return False
    def __gt__(self, other):
        if self.obj.is_empty and not other.obj.is_empty: return True
        if self.obj.priority < other.obj.priority: return True
        return False
class ForestToTreeVisitor(ForestVisitor):
    """
    A Forest visitor which converts an SPPF forest to an unambiguous AST.
    The implementation in this visitor walks only the first ambiguous child
    of each symbol node. When it finds an ambiguous symbol node it first
    calls the forest_sum_visitor implementation to sort the children
    into preference order using the algorithms defined there; so the first
    child should always be the highest preference. The forest_sum_visitor
    implementation should be another ForestVisitor which sorts the children
    according to some priority mechanism.
    """
    def __init__(self, root, forest_sum_visitor = ForestSumVisitor, callbacks = None):
        super(ForestToTreeVisitor, self).__init__(root)
        self.forest_sum_visitor = forest_sum_visitor
        self.output_stack = deque()
        self.callbacks = callbacks
        self.result = None
    def visit_token_node(self, node):
        self.output_stack[-1].append(node)
    def visit_symbol_node_in(self, node):
        if node.is_ambiguous and node.priority is None:
            self.forest_sum_visitor(node).go()
        return next(iter(node.children))
    def visit_packed_node_in(self, node):
        if not node.parent.is_intermediate:
            self.output_stack.append([])
        return iter([node.left, node.right])
    def visit_packed_node_out(self, node):
        if not node.parent.is_intermediate:
            result = self.callbacks[node.rule](self.output_stack.pop())
            if self.output_stack:
                self.output_stack[-1].append(result)
            else:
                self.result = result
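For reference, the Earley parser in this change drives this visitor as shown below; this is a condensed restatement of the return path in the rewritten Earley parse() above, with sppf_root standing in for the completed start symbol's SymbolNode.

# callbacks maps each Rule to its tree-building callback, as built by ParseTreeBuilder.
tree = ForestToTreeVisitor(sppf_root, ForestSumVisitor, callbacks).go()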
class ForestToAmbiguousTreeVisitor(ForestVisitor):
    """
    A Forest visitor which converts an SPPF forest to an ambiguous AST.
    Because of the fundamental disparity between what can be stored in
    an SPPF and what can be stored in a Tree; this implementation is not
    complete. It correctly deals with ambiguities that occur on symbol nodes only,
    and cannot deal with ambiguities that occur on intermediate nodes.
    Usually, most parsers can be rewritten to avoid intermediate node
    ambiguities. Also, this implementation could be fixed, however
    the code to handle intermediate node ambiguities is messy and
    would not be performant. It is much better not to use this and
    instead to correctly disambiguate the forest and only store unambiguous
    parses in Trees. It is here just to provide some parity with the
    old ambiguity='explicit'.
    This is mainly used by the test framework, to make it simpler to write
    tests ensuring the SPPF contains the right results.
    """
    def __init__(self, root, callbacks):
        super(ForestToAmbiguousTreeVisitor, self).__init__(root)
        self.output_stack = deque()
        self.callbacks = callbacks
        self.result = None
    def visit_token_node(self, node):
        self.output_stack[-1].children.append(node)
    def visit_symbol_node_in(self, node):
        if not node.is_intermediate and node.is_ambiguous:
            self.output_stack.append(Tree('_ambig', []))
        return iter(node.children)
    def visit_symbol_node_out(self, node):
        if node.is_ambiguous:
            result = self.output_stack.pop()
            if self.output_stack:
                self.output_stack[-1].children.append(result)
            else:
                self.result = result
    def visit_packed_node_in(self, node):
        #### NOTE:
        ## When an intermediate node (node.parent.s == tuple) has ambiguous children this
        ## forest visitor will break.
        if not node.parent.is_intermediate:
            self.output_stack.append(Tree('drv', []))
        return iter([node.left, node.right])
    def visit_packed_node_out(self, node):
        if not node.parent.is_intermediate:
            result = self.callbacks[node.rule](self.output_stack.pop().children)
            if self.output_stack:
                self.output_stack[-1].children.append(result)
            else:
                self.result = result
@@ -1,109 +0,0 @@
from ..utils import compare
from functools import cmp_to_key
from ..tree import Tree
# Standard ambiguity resolver (uses comparison)
#
# Author: Erez Sh
def _compare_rules(rule1, rule2):
    return -compare( len(rule1.expansion), len(rule2.expansion))
def _sum_priority(tree):
    p = 0
    for n in tree.iter_subtrees():
        try:
            p += n.meta.rule.options.priority or 0
        except AttributeError:
            pass
    return p
def _compare_priority(tree1, tree2):
    tree1.iter_subtrees()
def _compare_drv(tree1, tree2):
    try:
        rule1 = tree1.meta.rule
    except AttributeError:
        rule1 = None
    try:
        rule2 = tree2.meta.rule
    except AttributeError:
        rule2 = None
    if None == rule1 == rule2:
        return compare(tree1, tree2)
    elif rule1 is None:
        return -1
    elif rule2 is None:
        return 1
    assert tree1.data != '_ambig'
    assert tree2.data != '_ambig'
    p1 = _sum_priority(tree1)
    p2 = _sum_priority(tree2)
    c = (p1 or p2) and compare(p1, p2)
    if c:
        return c
    c = _compare_rules(tree1.meta.rule, tree2.meta.rule)
    if c:
        return c
    # rules are "equal", so compare trees
    if len(tree1.children) == len(tree2.children):
        for t1, t2 in zip(tree1.children, tree2.children):
            c = _compare_drv(t1, t2)
            if c:
                return c
    return compare(len(tree1.children), len(tree2.children))
def _standard_resolve_ambig(tree):
    assert tree.data == '_ambig'
    key_f = cmp_to_key(_compare_drv)
    best = max(tree.children, key=key_f)
    assert best.data == 'drv'
    tree.set('drv', best.children)
    tree.meta.rule = best.meta.rule   # needed for applying callbacks
def standard_resolve_ambig(tree):
    for ambig in tree.find_data('_ambig'):
        _standard_resolve_ambig(ambig)
    return tree
# Anti-score Sum
#
# Author: Uriva (https://github.com/uriva)
def _antiscore_sum_drv(tree):
    if not isinstance(tree, Tree):
        return 0
    assert tree.data != '_ambig'
    return _sum_priority(tree)
def _antiscore_sum_resolve_ambig(tree):
    assert tree.data == '_ambig'
    best = min(tree.children, key=_antiscore_sum_drv)
    assert best.data == 'drv'
    tree.set('drv', best.children)
    tree.meta.rule = best.meta.rule   # needed for applying callbacks
def antiscore_sum_resolve_ambig(tree):
    for ambig in tree.find_data('_ambig'):
        _antiscore_sum_resolve_ambig(ambig)
    return tree
@@ -1,107 +1,163 @@
"This module implements an experimental Earley Parser with a dynamic lexer"
# The parser uses a parse-forest to keep track of derivations and ambiguations.
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
# (right now ambiguity resolution is not developed beyond the needs of lark)
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
# I use the no-recursion version of Transformer and Visitor, because the tree might be
# deeper than Python's recursion limit (a bit absurd, but that's life)
#
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
# Instead of running a lexer beforehand, or using a costy char-by-char method, this parser
# uses regular expressions by necessity, achieving high-performance while maintaining all of
# Earley's power in parsing any CFG.
#
#
"""This module implements an experimental Earley parser with a dynamic lexer | |||||
The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: | |||||
https://www.sciencedirect.com/science/article/pii/S1571066108001497 | |||||
That is probably the best reference for understanding the algorithm here. | |||||
The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format | |||||
is better documented here: | |||||
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ | |||||
Instead of running a lexer beforehand, or using a costy char-by-char method, this parser | |||||
uses regular expressions by necessity, achieving high-performance while maintaining all of | |||||
Earley's power in parsing any CFG. | |||||
""" | |||||
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com
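A brief, hedged usage note (not part of the diff): this module backs Lark's lexer='dynamic' mode for the Earley parser, in which terminals are matched against the raw input at each parse position instead of by a single upfront tokenization. The grammar below is illustrative only.

from lark import Lark

grammar_text = """
    start: WORD+
    WORD: /[a-z]+/
    %import common.WS
    %ignore WS
"""

parser = Lark(grammar_text, parser='earley', lexer='dynamic')
print(parser.parse("scannerless earley parsing"))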
from collections import defaultdict
from collections import defaultdict, deque
from ..exceptions import ParseError, UnexpectedCharacters
from ..lexer import Token
from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal, Terminal
from .earley import ApplyCallbacks, Item, Column
from .earley import ApplyCallbacks
from .earley_common import Column, Item
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode
class Parser:
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False, complete_lex=False):
        self.analysis = GrammarAnalyzer(parser_conf)
    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor, ignore = (), complete_lex = False):
        analysis = GrammarAnalyzer(parser_conf)
        self.parser_conf = parser_conf
        self.resolve_ambiguity = resolve_ambiguity
        self.forest_sum_visitor = forest_sum_visitor
        self.ignore = [Terminal(t) for t in ignore]
        self.predict_all = predict_all
        self.complete_lex = complete_lex
        self.FIRST = self.analysis.FIRST
        self.postprocess = {}
        self.FIRST = analysis.FIRST
        self.callbacks = {}
        self.predictions = {}
        ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
        # the slow 'isupper' in is_terminal.
        self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
        self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }
        for rule in parser_conf.rules:
            self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
            self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None)
            self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]
        self.term_matcher = term_matcher
    def parse(self, stream, start_symbol=None):
        # Define parser functions
        start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
        delayed_matches = defaultdict(list)
        match = self.term_matcher
        text_line = 1
        text_column = 1
        # Held Completions (H in E.Scotts paper).
        held_completions = {}
        def predict(nonterm, column):
            assert not nonterm.is_term, nonterm
            return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]
        def complete(item):
            name = item.rule.origin
            return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]
        def predict_and_complete(column):
            while True:
                to_predict = {x.expect for x in column.to_predict.get_news()
                              if x.ptr}  # if not part of an already predicted batch
                to_reduce = column.to_reduce.get_news()
                if not (to_predict or to_reduce):
                    break
                for nonterm in to_predict:
                    column.add( predict(nonterm, column) )
                for item in to_reduce:
                    new_items = list(complete(item))
                    if item in new_items:
                        raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
                    column.add(new_items)
        def scan(i, column):
            to_scan = column.to_scan
        # Cache for nodes & tokens created in a particular parse step.
        node_cache = {}
        token_cache = {}
            for x in self.ignore:
                m = match(x, stream, i)
                if m:
                    delayed_matches[m.end()] += set(to_scan)
                    delayed_matches[m.end()] += set(column.to_reduce)
                    # TODO add partial matches for ignore too?
                    # s = m.group(0)
                    # for j in range(1, len(s)):
                    #     m = x.match(s[:-j])
                    #     if m:
                    #         delayed_matches[m.end()] += to_scan
            text_line = 1
            text_column = 1
            for item in to_scan:
        def make_symbol_node(s, start, end):
            label = (s, start.i, end.i)
            if label in node_cache:
                node = node_cache[label]
            else:
                node = node_cache[label] = SymbolNode(s, start, end)
            return node
        def predict_and_complete(column, to_scan):
            """The core Earley Predictor and Completer.
            At each stage of the input, we handle any completed items (things
            that matched on the last cycle) and use those to predict what should
            come next in the input stream. The completions and any predicted
            non-terminals are recursively processed until we reach a set of terminals,
            which can be added to the scan list for the next scanner cycle."""
held_completions.clear() | |||||
# R (items) = Ei (column.items) | |||||
items = deque(column.items) | |||||
while items: | |||||
item = items.pop() # remove an element, A say, from R | |||||
### The Earley completer | |||||
if item.is_complete: ### (item.s == string) | |||||
if item.node is None: | |||||
item.node = make_symbol_node(item.s, item.start, column) | |||||
item.node.add_family(item.s, item.rule, item.start, None, None) | |||||
# Empty has 0 length. If we complete an empty symbol in a particular | |||||
# parse step, we need to be able to use that same empty symbol to complete | |||||
# any predictions that result, that themselves require empty. Avoids | |||||
# infinite recursion on empty symbols. | |||||
# held_completions is 'H' in E.Scott's paper. | |||||
is_empty_item = item.start.i == column.i | |||||
if is_empty_item: | |||||
held_completions[item.rule.origin] = item.node | |||||
originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] | |||||
for originator in originators: | |||||
new_item = originator.advance() | |||||
new_item.node = make_symbol_node(new_item.s, originator.start, column) | |||||
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) | |||||
if new_item.expect in self.TERMINALS: | |||||
# Add (B :: aC.B, h, y) to Q | |||||
to_scan.add(new_item) | |||||
elif new_item not in column.items: | |||||
# Add (B :: aC.B, h, y) to Ei and R | |||||
column.add(new_item) | |||||
items.append(new_item) | |||||
### The Earley predictor | |||||
elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) | |||||
new_items = [] | |||||
for rule in self.predictions[item.expect]: | |||||
new_item = Item(rule, 0, column) | |||||
new_items.append(new_item) | |||||
# Process any held completions (H). | |||||
if item.expect in held_completions: | |||||
new_item = item.advance() | |||||
new_item.node = make_symbol_node(new_item.s, item.start, column) | |||||
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) | |||||
new_items.append(new_item) | |||||
for new_item in new_items: | |||||
if new_item.expect in self.TERMINALS: | |||||
to_scan.add(new_item) | |||||
elif new_item not in column.items: | |||||
column.add(new_item) | |||||
items.append(new_item) | |||||
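The held_completions dict (Scott's 'H') only matters for nullable rules: when a rule derives the empty string inside the current column, its zero-length completion must also be replayed against predictions made later in the same column, or those predictions would never see it. A small Lark grammar that exercises this path (the grammar is illustrative; any nullable rule triggers it):

from lark import Lark

# `a` can derive the empty string, so parsing "x" forces two zero-length
# completions of `a` inside the same Earley set.
parser = Lark(r"""
    start: a a "x"
    a: "y"?
""", parser='earley')

print(parser.parse("x").pretty())     # both `a`s matched empty
print(parser.parse("yyx").pretty())   # both `a`s matched "y"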
def scan(i, column, to_scan): | |||||
"""The core Earley Scanner. | |||||
This is a custom implementation of the scanner that uses the | |||||
Lark lexer to match tokens. The scan list is built by the | |||||
Earley predictor, based on the previously completed tokens. | |||||
This ensures that at each phase of the parse we have a custom | |||||
lexer context, allowing for more complex ambiguities.""" | |||||
# 1) Loop over the expectations and ask the lexer to match. | |||||
# Since regexps are forward-looking on the input stream, and we only | |||||
# want to process tokens when we hit the point in the stream at which | |||||
# they complete, we push each match into a buffer (delayed_matches), | |||||
# keyed by the position at which it ends, to be consumed in the parse | |||||
# step where the input reaches that position. | |||||
for item in set(to_scan): | |||||
m = match(item.expect, stream, i) | m = match(item.expect, stream, i) | ||||
if m: | if m: | ||||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | t = Token(item.expect.name, m.group(0), i, text_line, text_column) | ||||
delayed_matches[m.end()].append(item.advance(t)) | |||||
delayed_matches[m.end()].append( (item, column, t) ) | |||||
if self.complete_lex: | if self.complete_lex: | ||||
s = m.group(0) | s = m.group(0) | ||||
@@ -109,25 +165,85 @@ class Parser: | |||||
m = match(item.expect, s[:-j]) | m = match(item.expect, s[:-j]) | ||||
if m: | if m: | ||||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | t = Token(item.expect.name, m.group(0), i, text_line, text_column) | ||||
delayed_matches[i+m.end()].append(item.advance(t)) | |||||
delayed_matches[i+m.end()].append( (item, column, t) ) | |||||
# 2) Remove any items that successfully matched in this pass from the to_scan buffer. | |||||
# This ensures we don't carry over tokens that already matched, if we're ignoring below. | |||||
to_scan.remove(item) | |||||
# 3) Process any ignores. This is typically used for e.g. whitespace. | |||||
# We carry over any unmatched items from the to_scan buffer to be matched again after | |||||
# the ignore. This should allow us to use ignored symbols in non-terminals to implement | |||||
# e.g. mandatory spacing. | |||||
for x in self.ignore: | |||||
m = match(x, stream, i) | |||||
if m: | |||||
# Carry over any items still in the scan buffer, to past the end of the ignored items. | |||||
delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ]) | |||||
# If we're ignoring up to the end of the file, carry over the start symbol if it already completed. | |||||
delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol]) | |||||
next_set = Column(i + 1, self.FIRST) # Ei+1 | |||||
next_to_scan = set() | |||||
## 4) Process Tokens from delayed_matches. | |||||
# This is the core of the Earley scanner. For each buffered Token, advance the | |||||
# item whose terminal it matched, attach the Token to the new item's SPPF | |||||
# symbol node, and add the new item either to the Earley set (for the | |||||
# completer/predictor) or to the to_scan buffer for the next parse step. | |||||
for item, start, token in delayed_matches[i+1]: | |||||
if token is not None: | |||||
new_item = item.advance() | |||||
new_item.node = make_symbol_node(new_item.s, new_item.start, column) | |||||
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) | |||||
else: | |||||
new_item = item | |||||
if new_item.expect in self.TERMINALS: | |||||
# add (B ::= Aai+1.B, h, y) to Q' | |||||
next_to_scan.add(new_item) | |||||
else: | |||||
# add (B ::= Aai+1.B, h, y) to Ei+1 | |||||
next_set.add(new_item) | |||||
next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) | |||||
next_set.add(delayed_matches[i+1]) | |||||
del delayed_matches[i+1] # No longer needed, so unburden memory | del delayed_matches[i+1] # No longer needed, so unburden memory | ||||
if not next_set and not delayed_matches: | |||||
if not next_set and not delayed_matches and not next_to_scan: | |||||
raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan)) | raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan)) | ||||
return next_set | |||||
return next_set, next_to_scan | |||||
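Because the dynamic lexer matches regexps forward from position i, a token of length n only becomes usable once the main loop reaches position i+n; until then it sits in delayed_matches keyed by its end position. A stripped-down sketch of that buffering (names here are illustrative and independent of the parser's classes):

from collections import defaultdict
import re

stream = "abc"
pattern = re.compile("ab")
delayed_matches = defaultdict(list)

for i in range(len(stream)):
    # Forward-looking match at position i: buffer it under the index it ends on.
    m = pattern.match(stream, i)
    if m:
        delayed_matches[m.end()].append(m.group(0))
    # Only consume matches once the loop reaches the position where they end.
    for text in delayed_matches.pop(i + 1, []):
        print("token %r becomes available at position %d" % (text, i + 1))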
# Main loop starts | # Main loop starts | ||||
column0 = Column(0, self.FIRST, predict_all=self.predict_all) | |||||
column0.add(predict(start_symbol, column0)) | |||||
column0 = Column(0, self.FIRST) | |||||
column = column0 | column = column0 | ||||
## The scan buffer. 'Q' in E.Scott's paper. | |||||
to_scan = set() | |||||
## Predict for the start_symbol. | |||||
# Add predicted items to the first Earley set (for the predictor) if they | |||||
# expect a non-terminal, or to the scan buffer if they expect a terminal. | |||||
for rule in self.predictions[start_symbol]: | |||||
item = Item(rule, 0, column0) | |||||
if item.expect in self.TERMINALS: | |||||
to_scan.add(item) | |||||
else: | |||||
column.add(item) | |||||
## The main Earley loop. | |||||
# Run the Prediction/Completion cycle for any Items in the current Earley set. | |||||
# Completions will be added to the SPPF tree, and predictions will be recursively | |||||
# processed down to terminals/empty nodes to be added to the scanner for the next | |||||
# step. | |||||
for i, token in enumerate(stream): | for i, token in enumerate(stream): | ||||
predict_and_complete(column) | |||||
column = scan(i, column) | |||||
predict_and_complete(column, to_scan) | |||||
# Clear the node_cache and token_cache, which are only relevant for each | |||||
# step in the Earley pass. | |||||
node_cache.clear() | |||||
token_cache.clear() | |||||
column, to_scan = scan(i, column, to_scan) | |||||
if token == '\n': | if token == '\n': | ||||
text_line += 1 | text_line += 1 | ||||
@@ -135,24 +251,24 @@ class Parser: | |||||
else: | else: | ||||
text_column += 1 | text_column += 1 | ||||
predict_and_complete(column) | |||||
predict_and_complete(column, to_scan) | |||||
# Parse ended. Now build a parse tree | |||||
solutions = [n.tree for n in column.to_reduce | |||||
if n.rule.origin==start_symbol and n.start is column0] | |||||
## Column is now the final column in the parse. If the parse was successful, the start | |||||
# symbol should have been completed in the last step of the Earley cycle, and will be in | |||||
# this column. Find the item for the start_symbol, which is the root of the SPPF tree. | |||||
solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] | |||||
if not solutions: | if not solutions: | ||||
expected_tokens = [t.expect for t in column.to_scan] | |||||
expected_tokens = [t.expect for t in to_scan] | |||||
raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens) | raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens) | ||||
elif len(solutions) > 1: | |||||
raise Exception('Earley should not generate more than one start symbol - bug') | |||||
elif len(solutions) == 1: | |||||
tree = solutions[0] | |||||
else: | |||||
tree = Tree('_ambig', solutions) | |||||
if self.resolve_ambiguity: | |||||
tree = self.resolve_ambiguity(tree) | |||||
return ApplyCallbacks(self.postprocess).transform(tree) | |||||
## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller. | |||||
# This means the caller can work directly with the SPPF tree. | |||||
if not self.resolve_ambiguity: | |||||
return solutions[0] | |||||
# ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities | |||||
# according to the rules. | |||||
return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go() |
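With ambiguity='explicit' the parser now hands the caller the root SymbolNode of the SPPF instead of a Tree, and the caller converts it with a forest visitor, as the updated tests below do. A hedged usage sketch (the grammar is illustrative; the visitor call mirrors the test code, and dynamic lexing is what keeps both tokenisations alive):

from lark import Lark
from lark.parsers.earley_forest import ForestToAmbiguousTreeVisitor

# Illustrative ambiguous grammar: "ab" can be one token or "a" followed by "b".
parser = Lark(r"""
    start: ab | a "b"
    ab: "ab"
    a: "a"
""", parser='earley', lexer='dynamic', ambiguity='explicit')

root_symbol = parser.parse("ab")        # root of the SPPF, not a Tree
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go()
print(ambig_tree.data)                  # '_ambig' when both derivations survive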
@@ -4,7 +4,7 @@ from .tree import Tree | |||||
from .visitors import Transformer_InPlace | from .visitors import Transformer_InPlace | ||||
from .common import ParserConf | from .common import ParserConf | ||||
from .lexer import Token, PatternStr | from .lexer import Token, PatternStr | ||||
from .parsers import earley, resolve_ambig | |||||
from .parsers import earley | |||||
from .grammar import Rule, Terminal, NonTerminal | from .grammar import Rule, Terminal, NonTerminal | ||||
@@ -114,7 +114,7 @@ class Reconstructor: | |||||
def _reconstruct(self, tree): | def _reconstruct(self, tree): | ||||
# TODO: ambiguity? | # TODO: ambiguity? | ||||
parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=resolve_ambig.standard_resolve_ambig) | |||||
parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=True) | |||||
unreduced_tree = parser.parse(tree.children) # find a full derivation | unreduced_tree = parser.parse(tree.children) # find a full derivation | ||||
assert unreduced_tree.data == tree.data | assert unreduced_tree.data == tree.data | ||||
res = self.write_tokens.transform(unreduced_tree) | res = self.write_tokens.transform(unreduced_tree) | ||||
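The Reconstructor now simply asks its internal Earley parser to resolve ambiguity itself (resolve_ambiguity=True) rather than passing in a resolver from the removed resolve_ambig module; its public behaviour should be unchanged. A minimal round-trip sketch, assuming a grammar with no ignored whitespace so the output text is unambiguous:

from lark import Lark
from lark.reconstruct import Reconstructor

parser = Lark(r"""
    start: "(" NAME ")"
    NAME: /\w+/
""")
tree = parser.parse("(world)")
# Produces text that parses back to the same tree, e.g. "(world)".
print(Reconstructor(parser).reconstruct(tree))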
@@ -21,6 +21,8 @@ from lark.lark import Lark | |||||
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput | from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput | ||||
from lark.tree import Tree | from lark.tree import Tree | ||||
from lark.visitors import Transformer | from lark.visitors import Transformer | ||||
from lark.parsers.earley_forest import ForestToAmbiguousTreeVisitor | |||||
from lark.parsers.earley import ApplyCallbacks | |||||
__path__ = os.path.dirname(__file__) | __path__ = os.path.dirname(__file__) | ||||
def _read(n, *args): | def _read(n, *args): | ||||
@@ -236,10 +238,11 @@ def _make_full_earley_test(LEXER): | |||||
""" | """ | ||||
parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit') | parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit') | ||||
res = parser.parse('ab') | |||||
self.assertEqual( res.data, '_ambig') | |||||
self.assertEqual( len(res.children), 2) | |||||
root_symbol = parser.parse('ab') | |||||
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go() | |||||
# print(ambig_tree.pretty()) | |||||
self.assertEqual( ambig_tree.data, '_ambig') | |||||
self.assertEqual( len(ambig_tree.children), 2) | |||||
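The ambiguous tree produced by the visitor still holds raw rule applications; where a test needs the user-level tree it runs ApplyCallbacks over it afterwards, as the dynamic_complete test further down does. Reusing the `parser` built in the test above (and the imports at the top of this file), the full two-step conversion looks like:

root_symbol = parser.parse('ab')
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go()
# ApplyCallbacks runs the per-rule tree callbacks to build the user-facing tree.
tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree)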
def test_ambiguity1(self): | def test_ambiguity1(self): | ||||
grammar = """ | grammar = """ | ||||
@@ -251,9 +254,35 @@ def _make_full_earley_test(LEXER): | |||||
""" | """ | ||||
l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) | l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) | ||||
x = l.parse('cde') | |||||
assert x.data == '_ambig', x | |||||
assert len(x.children) == 2 | |||||
root_symbol = l.parse('cde') | |||||
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, l.parser.parser.callbacks).go() | |||||
# print(ambig_tree.pretty()) | |||||
# tree = ApplyCallbacks(l.parser.parser.postprocess).transform(ambig_tree) | |||||
assert ambig_tree.data == '_ambig', ambig_tree | |||||
assert len(ambig_tree.children) == 2 | |||||
@unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") | |||||
def test_ambiguity2(self): | |||||
grammar = """ | |||||
ANY: /[a-zA-Z0-9 ]+/ | |||||
a.2: "A" b+ | |||||
b.2: "B" | |||||
c: ANY | |||||
start: (a|c)* | |||||
""" | |||||
l = Lark(grammar, parser='earley', lexer=LEXER) | |||||
res = l.parse('ABX') | |||||
expected = Tree('start', [ | |||||
Tree('a', [ | |||||
Tree('b', []) | |||||
]), | |||||
Tree('c', [ | |||||
'X' | |||||
]) | |||||
]) | |||||
self.assertEqual(res, expected) | |||||
def test_fruitflies_ambig(self): | def test_fruitflies_ambig(self): | ||||
grammar = """ | grammar = """ | ||||
@@ -272,7 +301,9 @@ def _make_full_earley_test(LEXER): | |||||
%ignore WS | %ignore WS | ||||
""" | """ | ||||
parser = Lark(grammar, ambiguity='explicit', lexer=LEXER) | parser = Lark(grammar, ambiguity='explicit', lexer=LEXER) | ||||
res = parser.parse('fruit flies like bananas') | |||||
root_symbol = parser.parse('fruit flies like bananas') | |||||
tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go() | |||||
# tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree) | |||||
expected = Tree('_ambig', [ | expected = Tree('_ambig', [ | ||||
Tree('comparative', [ | Tree('comparative', [ | ||||
@@ -290,7 +321,9 @@ def _make_full_earley_test(LEXER): | |||||
# print res.pretty() | # print res.pretty() | ||||
# print expected.pretty() | # print expected.pretty() | ||||
self.assertEqual(res, expected) | |||||
# self.assertEqual(tree, expected) | |||||
self.assertEqual(tree.data, expected.data) | |||||
self.assertEqual(set(tree.children), set(expected.children)) | |||||
@unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser") | @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser") | ||||
@@ -303,7 +336,9 @@ def _make_full_earley_test(LEXER): | |||||
text = """cat""" | text = """cat""" | ||||
parser = Lark(grammar, start='start', ambiguity='explicit') | parser = Lark(grammar, start='start', ambiguity='explicit') | ||||
tree = parser.parse(text) | |||||
root_symbol = parser.parse(text) | |||||
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol).go() | |||||
tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree) | |||||
self.assertEqual(tree.data, '_ambig') | self.assertEqual(tree.data, '_ambig') | ||||
combinations = {tuple(str(s) for s in t.children) for t in tree.children} | combinations = {tuple(str(s) for s in t.children) for t in tree.children} | ||||