| @@ -66,7 +66,7 @@ class UnexpectedCharacters(LexError, UnexpectedInput): | |||
| if allowed: | |||
| message += '\nExpecting: %s\n' % allowed | |||
| super(UnexpectedCharacters, self).__init__(message) | |||
| super(UnexpectedCharacters, self).__init__(message.encode('utf-8')) | |||
| @@ -84,6 +84,6 @@ class UnexpectedToken(ParseError, UnexpectedInput): | |||
| "Expected one of: \n\t* %s\n" | |||
| % (token, self.line, self.column, '\n\t* '.join(self.expected))) | |||
| super(UnexpectedToken, self).__init__(message) | |||
| super(UnexpectedToken, self).__init__(message.encode('utf-8')) | |||
| ###} | |||
| @@ -62,14 +62,13 @@ class LarkOptions(object): | |||
| self.profile = o.pop('profile', False) | |||
| self.ambiguity = o.pop('ambiguity', 'auto') | |||
| self.propagate_positions = o.pop('propagate_positions', False) | |||
| self.earley__predict_all = o.pop('earley__predict_all', False) | |||
| self.lexer_callbacks = o.pop('lexer_callbacks', {}) | |||
| assert self.parser in ('earley', 'lalr', 'cyk', None) | |||
| if self.parser == 'earley' and self.transformer: | |||
| raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm. ' | |||
| 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. lalr)') | |||
| if self.ambiguity == 'explicit' and self.transformer: | |||
| raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm for explicit ambiguity. ' | |||
| 'Please use your transformer on the resulting Forest, or use a different algorithm (i.e. LALR)') | |||
| if o: | |||
| raise ValueError("Unknown options: %s" % o.keys()) | |||
| @@ -176,7 +175,7 @@ class Lark: | |||
| def _build_parser(self): | |||
| self.parser_class = get_frontend(self.options.parser, self.options.lexer) | |||
| self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr') | |||
| self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit') | |||
| callback = self._parse_tree_builder.create_callback(self.options.transformer) | |||
| if self.profiler: | |||
| for f in dir(callback): | |||
| @@ -7,6 +7,7 @@ from .visitors import InlineTransformer # XXX Deprecated | |||
| ###{standalone | |||
| from functools import partial, wraps | |||
| from itertools import repeat, product | |||
| class ExpandSingleChild: | |||
| @@ -62,23 +63,11 @@ class PropagatePositions: | |||
| class ChildFilter: | |||
| "Optimized childfilter (assumes no duplication in parse tree, so it's safe to change it)" | |||
| def __init__(self, to_include, node_builder): | |||
| self.node_builder = node_builder | |||
| self.to_include = to_include | |||
| def __call__(self, children): | |||
| filtered = [] | |||
| for i, to_expand in self.to_include: | |||
| if to_expand: | |||
| filtered += children[i].children | |||
| else: | |||
| filtered.append(children[i]) | |||
| return self.node_builder(filtered) | |||
| class ChildFilterLALR(ChildFilter): | |||
| "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" | |||
| def __call__(self, children): | |||
| filtered = [] | |||
| for i, to_expand in self.to_include: | |||
| @@ -89,19 +78,43 @@ class ChildFilterLALR(ChildFilter): | |||
| filtered = children[i].children | |||
| else: | |||
| filtered.append(children[i]) | |||
| return self.node_builder(filtered) | |||
| def _should_expand(sym): | |||
| return not sym.is_term and sym.name.startswith('_') | |||
| def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous): | |||
| def maybe_create_child_filter(expansion, keep_all_tokens): | |||
| to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) | |||
| if keep_all_tokens or not (sym.is_term and sym.filter_out)] | |||
| if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): | |||
| return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include) | |||
| return partial(ChildFilter, to_include) | |||
| class AmbiguousExpander: | |||
| """Deal with the case where we're expanding children ('_rule') into a parent but the children | |||
| are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself | |||
| ambiguous with as many copies as their are ambiguous children, and then copy the ambiguous children | |||
| into the right parents in the right places, essentially shifting the ambiguiuty up the tree.""" | |||
| def __init__(self, to_expand, tree_class, node_builder): | |||
| self.node_builder = node_builder | |||
| self.tree_class = tree_class | |||
| self.to_expand = to_expand | |||
| def __call__(self, children): | |||
| def _is_ambig_tree(child): | |||
| return hasattr(child, 'data') and child.data == '_ambig' | |||
| ambiguous = [i for i in self.to_expand if _is_ambig_tree(children[i])] | |||
| if ambiguous: | |||
| expand = [iter(child.children) if i in ambiguous else repeat(child) for i, child in enumerate(children)] | |||
| return self.tree_class('_ambig', [self.node_builder(list(f[0])) for f in product(zip(*expand))]) | |||
| return self.node_builder(children) | |||
| def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens): | |||
| to_expand = [i for i, sym in enumerate(expansion) | |||
| if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))] | |||
| if to_expand: | |||
| return partial(AmbiguousExpander, to_expand, tree_class) | |||
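To make the AmbiguousExpander docstring above concrete, here is a hand-built before/after illustration (not part of the diff) of the tree shapes it produces, using lark's Tree:

```python
from lark import Tree

# Before: the parent holds an ambiguous '_expand_this_rule' child.
before = Tree('parent', [
    Tree('_ambig', [Tree('_expand_this_rule', ['a']),
                    Tree('_expand_this_rule', ['b'])]),
])

# After expansion: the parent itself becomes ambiguous, one copy per alternative,
# with the underscore-rule's children spliced into each copy.
after = Tree('_ambig', [
    Tree('parent', ['a']),
    Tree('parent', ['b']),
])
```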
| class Callback(object): | |||
| pass | |||
| @@ -113,8 +126,6 @@ def ptb_inline_args(func): | |||
| return func(*children) | |||
| return f | |||
| class ParseTreeBuilder: | |||
| def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False): | |||
| self.tree_class = tree_class | |||
| @@ -133,9 +144,10 @@ class ParseTreeBuilder: | |||
| expand_single_child = options.expand1 if options else False | |||
| wrapper_chain = filter(None, [ | |||
| (expand_single_child and not rule.alias) and ExpandSingleChild, | |||
| maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous), | |||
| self.propagate_positions and PropagatePositions, | |||
| (expand_single_child and not rule.alias) and ExpandSingleChild, | |||
| maybe_create_child_filter(rule.expansion, keep_all_tokens), | |||
| self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), | |||
| ]) | |||
| yield rule, wrapper_chain | |||
| @@ -4,8 +4,7 @@ from functools import partial | |||
| from .utils import get_regexp_width | |||
| from .parsers.grammar_analysis import GrammarAnalyzer | |||
| from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token | |||
| from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk | |||
| from .parsers import lalr_parser, earley, earley_forest, xearley, cyk | |||
| from .tree import Tree | |||
| class WithLexer: | |||
| @@ -56,13 +55,13 @@ class LALR_CustomLexer(WithLexer): | |||
| self.lexer = lexer_cls(lexer_conf) | |||
| def get_ambiguity_resolver(options): | |||
| def get_ambiguity_options(options): | |||
| if not options or options.ambiguity == 'resolve': | |||
| return resolve_ambig.standard_resolve_ambig | |||
| return {} | |||
| elif options.ambiguity == 'resolve__antiscore_sum': | |||
| return resolve_ambig.antiscore_sum_resolve_ambig | |||
| return {'forest_sum_visitor': earley_forest.ForestAntiscoreSumVisitor} | |||
| elif options.ambiguity == 'explicit': | |||
| return None | |||
| return {'resolve_ambiguity': False} | |||
| raise ValueError(options) | |||
| def tokenize_text(text): | |||
| @@ -78,8 +77,7 @@ class Earley(WithLexer): | |||
| def __init__(self, lexer_conf, parser_conf, options=None): | |||
| self.init_traditional_lexer(lexer_conf) | |||
| self.parser = earley.Parser(parser_conf, self.match, | |||
| resolve_ambiguity=get_ambiguity_resolver(options)) | |||
| self.parser = earley.Parser(parser_conf, self.match, **get_ambiguity_options(options)) | |||
| def match(self, term, token): | |||
| return term.name == token.type | |||
| @@ -91,11 +89,10 @@ class XEarley: | |||
| self._prepare_match(lexer_conf) | |||
| kw.update(get_ambiguity_options(options)) | |||
| self.parser = xearley.Parser(parser_conf, | |||
| self.match, | |||
| resolve_ambiguity=get_ambiguity_resolver(options), | |||
| ignore=lexer_conf.ignore, | |||
| predict_all=options.earley__predict_all, | |||
| **kw | |||
| ) | |||
| @@ -1,160 +1,44 @@ | |||
| "This module implements an Earley Parser" | |||
| # The parser uses a parse-forest to keep track of derivations and ambiguations. | |||
| # When the parse ends successfully, a disambiguation stage resolves all ambiguity | |||
| # (right now ambiguity resolution is not developed beyond the needs of lark) | |||
| # Afterwards the parse tree is reduced (transformed) according to user callbacks. | |||
| # I use the no-recursion version of Transformer, because the tree might be | |||
| # deeper than Python's recursion limit (a bit absurd, but that's life) | |||
| # | |||
| # The algorithm keeps track of each state set, using a corresponding Column instance. | |||
| # Column keeps track of new items using NewsList instances. | |||
| # | |||
| """This module implements an scanerless Earley parser. | |||
| The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: | |||
| https://www.sciencedirect.com/science/article/pii/S1571066108001497 | |||
| That is probably the best reference for understanding the algorithm here. | |||
| The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format | |||
| is better documented here: | |||
| http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ | |||
| """ | |||
| # Author: Erez Shinan (2017) | |||
| # Email : erezshin@gmail.com | |||
| from ..tree import Tree | |||
| from ..visitors import Transformer_InPlace, v_args | |||
| from ..exceptions import ParseError, UnexpectedToken | |||
| from .grammar_analysis import GrammarAnalyzer | |||
| from ..grammar import NonTerminal | |||
| from .earley_common import Column, Item | |||
| from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode | |||
| class Derivation(Tree): | |||
| def __init__(self, rule, items=None): | |||
| Tree.__init__(self, 'drv', items or []) | |||
| self.meta.rule = rule | |||
| self._hash = None | |||
| def _pretty_label(self): # Nicer pretty for debugging the parser | |||
| return self.meta.rule.origin.name if self.meta.rule else self.data | |||
| def __hash__(self): | |||
| if self._hash is None: | |||
| self._hash = Tree.__hash__(self) | |||
| return self._hash | |||
| class Item(object): | |||
| "An Earley Item, the atom of the algorithm." | |||
| def __init__(self, rule, ptr, start, tree): | |||
| self.rule = rule | |||
| self.ptr = ptr | |||
| self.start = start | |||
| self.tree = tree if tree is not None else Derivation(self.rule) | |||
| @property | |||
| def expect(self): | |||
| return self.rule.expansion[self.ptr] | |||
| @property | |||
| def is_complete(self): | |||
| return self.ptr == len(self.rule.expansion) | |||
| def advance(self, tree): | |||
| assert self.tree.data == 'drv' | |||
| new_tree = Derivation(self.rule, self.tree.children + [tree]) | |||
| return self.__class__(self.rule, self.ptr+1, self.start, new_tree) | |||
| def __eq__(self, other): | |||
| return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule | |||
| def __hash__(self): | |||
| return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ | |||
| def __repr__(self): | |||
| before = list(map(str, self.rule.expansion[:self.ptr])) | |||
| after = list(map(str, self.rule.expansion[self.ptr:])) | |||
| return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after)) | |||
| class NewsList(list): | |||
| "Keeps track of newly added items (append-only)" | |||
| def __init__(self, initial=None): | |||
| list.__init__(self, initial or []) | |||
| self.last_iter = 0 | |||
| def get_news(self): | |||
| i = self.last_iter | |||
| self.last_iter = len(self) | |||
| return self[i:] | |||
| class Column: | |||
| "An entry in the table, aka Earley Chart. Contains lists of items." | |||
| def __init__(self, i, FIRST, predict_all=False): | |||
| self.i = i | |||
| self.to_reduce = NewsList() | |||
| self.to_predict = NewsList() | |||
| self.to_scan = [] | |||
| self.item_count = 0 | |||
| self.FIRST = FIRST | |||
| self.predicted = set() | |||
| self.completed = {} | |||
| self.predict_all = predict_all | |||
| def add(self, items): | |||
| """Sort items into scan/predict/reduce newslists | |||
| Makes sure only unique items are added. | |||
| """ | |||
| for item in items: | |||
| item_key = item, item.tree # Elsewhere, tree is not part of the comparison | |||
| if item.is_complete: | |||
| # XXX Potential bug: What happens if there's ambiguity in an empty rule? | |||
| if item.rule.expansion and item_key in self.completed: | |||
| old_tree = self.completed[item_key].tree | |||
| if old_tree == item.tree: | |||
| is_empty = not self.FIRST[item.rule.origin] | |||
| if not is_empty: | |||
| continue | |||
| if old_tree.data != '_ambig': | |||
| new_tree = old_tree.copy() | |||
| new_tree.meta.rule = old_tree.meta.rule | |||
| old_tree.set('_ambig', [new_tree]) | |||
| old_tree.meta.rule = None # No longer a 'drv' node | |||
| if item.tree.children[0] is old_tree: # XXX a little hacky! | |||
| raise ParseError("Infinite recursion in grammar! (Rule %s)" % item.rule) | |||
| if item.tree not in old_tree.children: | |||
| old_tree.children.append(item.tree) | |||
| # old_tree.children.append(item.tree) | |||
| else: | |||
| self.completed[item_key] = item | |||
| self.to_reduce.append(item) | |||
| else: | |||
| if item.expect.is_term: | |||
| self.to_scan.append(item) | |||
| else: | |||
| k = item_key if self.predict_all else item | |||
| if k in self.predicted: | |||
| continue | |||
| self.predicted.add(k) | |||
| self.to_predict.append(item) | |||
| self.item_count += 1 # Only count if actually added | |||
| def __bool__(self): | |||
| return bool(self.item_count) | |||
| __nonzero__ = __bool__ # Py2 backwards-compatibility | |||
| from collections import deque, defaultdict | |||
| class Parser: | |||
| def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None): | |||
| def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor): | |||
| analysis = GrammarAnalyzer(parser_conf) | |||
| self.parser_conf = parser_conf | |||
| self.resolve_ambiguity = resolve_ambiguity | |||
| self.forest_sum_visitor = forest_sum_visitor | |||
| self.FIRST = analysis.FIRST | |||
| self.postprocess = {} | |||
| self.callbacks = {} | |||
| self.predictions = {} | |||
| ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than | |||
| # the slow 'isupper' in is_terminal. | |||
| self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } | |||
| self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } | |||
| for rule in parser_conf.rules: | |||
| self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias) | |||
| self.callbacks[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias) | |||
| self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] | |||
| self.term_matcher = term_matcher | |||
| @@ -163,72 +47,163 @@ class Parser: | |||
| def parse(self, stream, start_symbol=None): | |||
| # Define parser functions | |||
| start_symbol = NonTerminal(start_symbol or self.parser_conf.start) | |||
| _Item = Item | |||
| match = self.term_matcher | |||
| def predict(nonterm, column): | |||
| assert not nonterm.is_term, nonterm | |||
| return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | |||
| def complete(item): | |||
| name = item.rule.origin | |||
| return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name] | |||
| def predict_and_complete(column): | |||
| while True: | |||
| to_predict = {x.expect for x in column.to_predict.get_news() | |||
| if x.ptr} # if not part of an already predicted batch | |||
| to_reduce = set(column.to_reduce.get_news()) | |||
| if not (to_predict or to_reduce): | |||
| break | |||
| for nonterm in to_predict: | |||
| column.add( predict(nonterm, column) ) | |||
| for item in to_reduce: | |||
| new_items = list(complete(item)) | |||
| if item in new_items: | |||
| raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) | |||
| column.add(new_items) | |||
| def scan(i, token, column): | |||
| next_set = Column(i, self.FIRST) | |||
| next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token)) | |||
| if not next_set: | |||
| expect = {i.expect.name for i in column.to_scan} | |||
| raise UnexpectedToken(token, expect, considered_rules=set(column.to_scan)) | |||
| return next_set | |||
| held_completions = defaultdict(list) | |||
| node_cache = {} | |||
| token_cache = {} | |||
| def make_symbol_node(s, start, end): | |||
| label = (s, start.i, end.i) | |||
| if label in node_cache: | |||
| node = node_cache[label] | |||
| else: | |||
| node = node_cache[label] = SymbolNode(s, start, end) | |||
| return node | |||
| def predict_and_complete(column, to_scan): | |||
| """The core Earley Predictor and Completer. | |||
| At each stage of the input, we handle any completed items (things | |||
| that matched on the last cycle) and use those to predict what should | |||
| come next in the input stream. The completions and any predicted | |||
| non-terminals are recursively processed until we reach a set of items | |||
| expecting terminals, which can be added to the scan list for the next scanner cycle.""" | |||
| held_completions.clear() | |||
| # R (items) = Ei (column.items) | |||
| items = deque(column.items) | |||
| while items: | |||
| item = items.pop() # remove an element, A say, from R | |||
| ### The Earley completer | |||
| if item.is_complete: ### (item.s == string) | |||
| if item.node is None: | |||
| item.node = make_symbol_node(item.s, item.start, column) | |||
| item.node.add_family(item.s, item.rule, item.start, None, None) | |||
| # Empty has 0 length. If we complete an empty symbol in a particular | |||
| # parse step, we need to be able to use that same empty symbol to complete | |||
| # any predictions that result, that themselves require empty. Avoids | |||
| # infinite recursion on empty symbols. | |||
| # held_completions is 'H' in E.Scott's paper. | |||
| is_empty_item = item.start.i == column.i | |||
| if is_empty_item: | |||
| held_completions[item.rule.origin] = item.node | |||
| originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] | |||
| for originator in originators: | |||
| new_item = originator.advance() | |||
| new_item.node = make_symbol_node(new_item.s, originator.start, column) | |||
| new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) | |||
| if new_item.expect in self.TERMINALS: | |||
| # Add (B :: aC.B, h, y) to Q | |||
| to_scan.add(new_item) | |||
| elif new_item not in column.items: | |||
| # Add (B :: aC.B, h, y) to Ei and R | |||
| column.add(new_item) | |||
| items.append(new_item) | |||
| ### The Earley predictor | |||
| elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) | |||
| new_items = [] | |||
| for rule in self.predictions[item.expect]: | |||
| new_item = Item(rule, 0, column) | |||
| new_items.append(new_item) | |||
| # Process any held completions (H). | |||
| if item.expect in held_completions: | |||
| new_item = item.advance() | |||
| new_item.node = make_symbol_node(new_item.s, item.start, column) | |||
| new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) | |||
| new_items.append(new_item) | |||
| for new_item in new_items: | |||
| if new_item.expect in self.TERMINALS: | |||
| to_scan.add(new_item) | |||
| elif new_item not in column.items: | |||
| column.add(new_item) | |||
| items.append(new_item) | |||
| def scan(i, token, column, to_scan): | |||
| """The core Earley Scanner. | |||
| This is a custom implementation of the scanner that uses the | |||
| Lark lexer to match tokens. The scan list is built by the | |||
| Earley predictor, based on the previously completed tokens. | |||
| This ensures that at each phase of the parse we have a custom | |||
| lexer context, allowing for more complex ambiguities.""" | |||
| next_set = Column(i+1, self.FIRST) | |||
| next_to_scan = set() | |||
| for item in set(to_scan): | |||
| if match(item.expect, token): | |||
| new_item = item.advance() | |||
| new_item.node = make_symbol_node(new_item.s, new_item.start, column) | |||
| new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) | |||
| if new_item.expect in self.TERMINALS: | |||
| # add (B ::= Aai+1.B, h, y) to Q' | |||
| next_to_scan.add(new_item) | |||
| else: | |||
| # add (B ::= Aa+1.B, h, y) to Ei+1 | |||
| next_set.add(new_item) | |||
| if not next_set and not next_to_scan: | |||
| expect = {i.expect.name for i in to_scan} | |||
| raise UnexpectedToken(token, expect, considered_rules = set(to_scan)) | |||
| return next_set, next_to_scan | |||
| # Main loop starts | |||
| column0 = Column(0, self.FIRST) | |||
| column0.add(predict(start_symbol, column0)) | |||
| column = column0 | |||
| ## The scan buffer. 'Q' in E.Scott's paper. | |||
| to_scan = set() | |||
| ## Predict for the start_symbol. | |||
| # Add predicted items to the first Earley set (for the predictor) if they | |||
| # result in a non-terminal, or the scanner if they result in a terminal. | |||
| for rule in self.predictions[start_symbol]: | |||
| item = Item(rule, 0, column0) | |||
| if item.expect in self.TERMINALS: | |||
| to_scan.add(item) | |||
| else: | |||
| column.add(item) | |||
| ## The main Earley loop. | |||
| # Run the Prediction/Completion cycle for any Items in the current Earley set. | |||
| # Completions will be added to the SPPF tree, and predictions will be recursively | |||
| # processed down to terminals/empty nodes to be added to the scanner for the next | |||
| # step. | |||
| for i, token in enumerate(stream): | |||
| predict_and_complete(column) | |||
| column = scan(i, token, column) | |||
| predict_and_complete(column, to_scan) | |||
| # Clear the node_cache and token_cache, which are only relevant for each | |||
| # step in the Earley pass. | |||
| node_cache.clear() | |||
| token_cache.clear() | |||
| column, to_scan = scan(i, token, column, to_scan) | |||
| predict_and_complete(column) | |||
| predict_and_complete(column, to_scan) | |||
| # Parse ended. Now build a parse tree | |||
| solutions = [n.tree for n in column.to_reduce | |||
| if n.rule.origin==start_symbol and n.start is column0] | |||
| ## Column is now the final column in the parse. If the parse was successful, the start | |||
| # symbol should have been completed in the last step of the Earley cycle, and will be in | |||
| # this column. Find the item for the start_symbol, which is the root of the SPPF tree. | |||
| solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] | |||
| if not solutions: | |||
| raise ParseError('Incomplete parse: Could not find a solution to input') | |||
| elif len(solutions) == 1: | |||
| tree = solutions[0] | |||
| else: | |||
| tree = Tree('_ambig', solutions) | |||
| if self.resolve_ambiguity: | |||
| tree = self.resolve_ambiguity(tree) | |||
| elif len(solutions) > 1: | |||
| raise ParseError('Earley should not generate multiple start symbol items!') | |||
| return ApplyCallbacks(self.postprocess).transform(tree) | |||
| ## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller. | |||
| # This means the caller can work directly with the SPPF tree. | |||
| if not self.resolve_ambiguity: | |||
| return solutions[0] | |||
| # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities | |||
| # according to the rules. | |||
| return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go() | |||
| class ApplyCallbacks(Transformer_InPlace): | |||
| def __init__(self, postprocess): | |||
| @@ -0,0 +1,80 @@ | |||
| "This module implements an Earley Parser" | |||
| # The parser uses a parse-forest to keep track of derivations and ambiguations. | |||
| # When the parse ends successfully, a disambiguation stage resolves all ambiguity | |||
| # (right now ambiguity resolution is not developed beyond the needs of lark) | |||
| # Afterwards the parse tree is reduced (transformed) according to user callbacks. | |||
| # I use the no-recursion version of Transformer, because the tree might be | |||
| # deeper than Python's recursion limit (a bit absurd, but that's life) | |||
| # | |||
| # The algorithm keeps track of each state set, using a corresponding Column instance. | |||
| # Column keeps track of new items using NewsList instances. | |||
| # | |||
| # Author: Erez Shinan (2017) | |||
| # Email : erezshin@gmail.com | |||
| ## for recursive repr | |||
| from ..tree import Tree | |||
| class Derivation(Tree): | |||
| def __init__(self, rule, children = None): | |||
| Tree.__init__(self, 'drv', children if children is not None else []) | |||
| self.meta.rule = rule | |||
| self._hash = None | |||
| def __repr__(self, indent = 0): | |||
| return 'Derivation(%s, %s, %s)' % (self.data, self.meta.rule.origin, '...') | |||
| def __hash__(self): | |||
| if self._hash is None: | |||
| self._hash = Tree.__hash__(self) | |||
| return self._hash | |||
| class Item(object): | |||
| "An Earley Item, the atom of the algorithm." | |||
| __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash') | |||
| def __init__(self, rule, ptr, start): | |||
| self.is_complete = len(rule.expansion) == ptr | |||
| self.rule = rule # rule | |||
| self.ptr = ptr # ptr | |||
| self.start = start # j | |||
| self.node = None # w | |||
| if self.is_complete: | |||
| self.s = rule.origin | |||
| self.expect = None | |||
| else: | |||
| self.s = (rule, ptr) | |||
| self.expect = rule.expansion[ptr] | |||
| self._hash = hash((self.s, self.start.i)) | |||
| def advance(self): | |||
| return self.__class__(self.rule, self.ptr + 1, self.start) | |||
| def __eq__(self, other): | |||
| return self is other or (self.s == other.s and self.start.i == other.start.i) | |||
| def __hash__(self): | |||
| return self._hash | |||
| def __repr__(self): | |||
| return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i) | |||
| class Column: | |||
| "An entry in the table, aka Earley Chart. Contains lists of items." | |||
| def __init__(self, i, FIRST): | |||
| self.i = i | |||
| self.items = set() | |||
| self.FIRST = FIRST | |||
| def add(self, item): | |||
| """Sort items into scan/predict/reduce newslists | |||
| Makes sure only unique items are added. | |||
| """ | |||
| self.items.add(item) | |||
| def __bool__(self): | |||
| return bool(self.items) | |||
| __nonzero__ = __bool__ # Py2 backwards-compatibility | |||
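A small sketch of the Item/Column identity semantics defined above. The import path is assumed from this diff (`lark/parsers/earley_common.py`), and the `Rule` namedtuple is a hypothetical stand-in for lark's real Rule:

```python
from collections import namedtuple
from lark.parsers.earley_common import Item, Column   # path assumed from this diff

Rule = namedtuple('Rule', 'origin expansion')          # stand-in for illustration

r = Rule('expr', ('NUMBER',))
col = Column(0, FIRST={})

a = Item(r, 0, col)
b = Item(r, 0, col)
assert a == b and hash(a) == hash(b)   # keyed by (s, start.i), not by object identity

col.add(a)
col.add(b)
assert len(col.items) == 1             # the set keeps a single copy
```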
| @@ -0,0 +1,347 @@ | |||
| """"This module implements an SPPF implementation | |||
| This is used as the primary output mechanism for the Earley parser | |||
| in order to store complex ambiguities. | |||
| Full reference and more details is here: | |||
| http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ | |||
| """ | |||
| from ..tree import Tree | |||
| from ..exceptions import ParseError | |||
| from ..lexer import Token | |||
| from ..utils import Str | |||
| from ..grammar import NonTerminal, Terminal | |||
| from .earley_common import Column, Derivation | |||
| from collections import deque | |||
| class SymbolNode(object): | |||
| """ | |||
| A Symbol Node represents a symbol (or Intermediate LR0). | |||
| Symbol nodes are keyed by the symbol (s). For intermediate nodes | |||
| s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol | |||
| nodes, s will be a string representing the non-terminal origin (i.e. | |||
| the left hand side of the rule). | |||
| The children of a Symbol or Intermediate Node will always be Packed Nodes; | |||
| with each Packed Node child representing a single derivation of a production. | |||
| Hence a Symbol Node with a single child is unambiguous. | |||
| """ | |||
| __slots__ = ('s', 'start', 'end', 'children', 'priority', 'is_intermediate') | |||
| def __init__(self, s, start, end): | |||
| self.s = s | |||
| self.start = start | |||
| self.end = end | |||
| self.children = set() | |||
| self.priority = None | |||
| self.is_intermediate = isinstance(s, tuple) | |||
| def add_family(self, lr0, rule, start, left, right): | |||
| self.children.add(PackedNode(self, lr0, rule, start, left, right)) | |||
| @property | |||
| def is_ambiguous(self): | |||
| return len(self.children) > 1 | |||
| def __iter__(self): | |||
| return iter(self.children) | |||
| def __eq__(self, other): | |||
| if not isinstance(other, SymbolNode): | |||
| return False | |||
| return self is other or (self.s == other.s and self.start == other.start and self.end is other.end) | |||
| def __hash__(self): | |||
| return hash((self.s, self.start.i, self.end.i)) | |||
| def __repr__(self): | |||
| symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name | |||
| return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0) | |||
| class PackedNode(object): | |||
| """ | |||
| A Packed Node represents a single derivation in a symbol node. | |||
| """ | |||
| __slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash') | |||
| def __init__(self, parent, s, rule, start, left, right): | |||
| self.parent = parent | |||
| self.s = s | |||
| self.start = start | |||
| self.rule = rule | |||
| self.left = left | |||
| self.right = right | |||
| self.priority = None | |||
| self._hash = hash((self.s, self.start.i, self.left, self.right)) | |||
| @property | |||
| def is_empty(self): | |||
| return self.left is None and self.right is None | |||
| def __iter__(self): | |||
| return iter([self.left, self.right]) | |||
| def __lt__(self, other): | |||
| if self.is_empty and not other.is_empty: return True | |||
| if self.priority < other.priority: return True | |||
| return False | |||
| def __gt__(self, other): | |||
| if self.is_empty and not other.is_empty: return True | |||
| if self.priority > other.priority: return True | |||
| return False | |||
| def __eq__(self, other): | |||
| if not isinstance(other, PackedNode): | |||
| return False | |||
| return self is other or (self.s == other.s and self.start == other.start and self.left == other.left and self.right == other.right) | |||
| def __hash__(self): | |||
| return self._hash | |||
| def __repr__(self): | |||
| symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name | |||
| return "{%s, %d, %s, %s, %s}" % (symbol, self.start.i, self.left, self.right, self.priority if self.priority is not None else 0) | |||
| class ForestVisitor(object): | |||
| """ | |||
| An abstract base class for building forest visitors. | |||
| Use this as a base when you need to walk the forest. | |||
| """ | |||
| def __init__(self, root): | |||
| self.root = root | |||
| self.result = None | |||
| def visit_token_node(self, node): pass | |||
| def visit_symbol_node_in(self, node): pass | |||
| def visit_symbol_node_out(self, node): pass | |||
| def visit_packed_node_in(self, node): pass | |||
| def visit_packed_node_out(self, node): pass | |||
| def go(self): | |||
| # 'visiting' holds the IDs of all symbol/intermediate nodes currently on | |||
| # the stack. It serves two purposes: to detect when we 'recurse' in and out | |||
| # of a symbol/intermediate node, so that we can process it both on the way | |||
| # down and on the way up; and, since the SPPF can have cycles, to detect if | |||
| # we're trying to recurse into a node that's already on the stack (infinite recursion). | |||
| visiting = set() | |||
| # We do not use recursion here to walk the Forest, due to Python's limited | |||
| # stack size. input_stack therefore serves as our explicit stack. | |||
| input_stack = deque([self.root]) | |||
| # It is much faster to cache these as locals since they are called | |||
| # many times in large parses. | |||
| vpno = getattr(self, 'visit_packed_node_out') | |||
| vpni = getattr(self, 'visit_packed_node_in') | |||
| vsno = getattr(self, 'visit_symbol_node_out') | |||
| vsni = getattr(self, 'visit_symbol_node_in') | |||
| vtn = getattr(self, 'visit_token_node') | |||
| while input_stack: | |||
| current = next(reversed(input_stack)) | |||
| try: | |||
| next_node = next(current) | |||
| except StopIteration: | |||
| input_stack.pop() | |||
| continue | |||
| except TypeError: | |||
| ### If the current object is not an iterator, pass through to Token/SymbolNode | |||
| pass | |||
| else: | |||
| if next_node is None: | |||
| continue | |||
| if id(next_node) in visiting: | |||
| raise ParseError("Infinite recursion in grammar!") | |||
| input_stack.append(next_node) | |||
| continue | |||
| if isinstance(current, Str): | |||
| vtn(current) | |||
| input_stack.pop() | |||
| continue | |||
| current_id = id(current) | |||
| if current_id in visiting: | |||
| if isinstance(current, PackedNode): vpno(current) | |||
| else: vsno(current) | |||
| input_stack.pop() | |||
| visiting.remove(current_id) | |||
| continue | |||
| else: | |||
| visiting.add(current_id) | |||
| if isinstance(current, PackedNode): next_node = vpni(current) | |||
| else: next_node = vsni(current) | |||
| if next_node is None: | |||
| continue | |||
| if id(next_node) in visiting: | |||
| raise ParseError("Infinite recursion in grammar!") | |||
| input_stack.append(next_node) | |||
| continue | |||
| return self.result | |||
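As a concrete (hypothetical) use of the visitor protocol above: a subclass only needs to return iterators for the children it wants to descend into, and go() walks the forest iteratively. A minimal sketch:

```python
class NodeCounter(ForestVisitor):
    """Counts symbol/packed node visits while walking an SPPF (illustrative only)."""
    def __init__(self, root):
        super(NodeCounter, self).__init__(root)
        self.result = 0

    def visit_symbol_node_in(self, node):
        self.result += 1
        return iter(node.children)            # descend into the packed children

    def visit_packed_node_in(self, node):
        self.result += 1
        return iter([node.left, node.right])  # descend into left/right sub-nodes

# count = NodeCounter(sppf_root).go()         # sppf_root: a SymbolNode returned by the parser
```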
| class ForestSumVisitor(ForestVisitor): | |||
| """ | |||
| A visitor for prioritizing ambiguous parts of the Forest. | |||
| This visitor is the default when resolving ambiguity. It pushes the priorities | |||
| from the rules into the SPPF nodes, and then sorts the packed-node children | |||
| of each ambiguous symbol or intermediate node according to those priorities. | |||
| This relies on the custom sort function provided in PackedNode.__lt__, which | |||
| uses these priorities (and other factors) to order the ambiguous packed nodes. | |||
| """ | |||
| def visit_packed_node_in(self, node): | |||
| return iter([node.left, node.right]) | |||
| def visit_symbol_node_in(self, node): | |||
| return iter(node.children) | |||
| def visit_packed_node_out(self, node): | |||
| node.priority = 0 | |||
| if node.rule.options and node.rule.options.priority: node.priority += node.rule.options.priority | |||
| if node.right is not None and hasattr(node.right, 'priority'): node.priority += node.right.priority | |||
| if node.left is not None and hasattr(node.left, 'priority'): node.priority += node.left.priority | |||
| def visit_symbol_node_out(self, node): | |||
| node.priority = max(child.priority for child in node.children) | |||
| node.children = sorted(node.children, reverse = True) | |||
| class ForestAntiscoreSumVisitor(ForestSumVisitor): | |||
| """ | |||
| A visitor for prioritizing ambiguous parts of the Forest. | |||
| This visitor is used when the ambiguity option is 'resolve__antiscore_sum'. | |||
| It pushes the priorities from the rules into the SPPF nodes, and implements | |||
| a 'least cost' mechanism for resolving ambiguity (reverse of the default | |||
| priority mechanism). It uses a custom __lt__ comparator key for sorting | |||
| the packed node children. | |||
| """ | |||
| def visit_symbol_node_out(self, node): | |||
| node.priority = min(child.priority for child in node.children) | |||
| node.children = sorted(node.children, key=AntiscoreSumComparator, reverse = True) | |||
| class AntiscoreSumComparator(object): | |||
| """ | |||
| An antiscore-sum comparator for PackedNode objects. | |||
| This allows 'sorting' an iterable of PackedNode objects so that they | |||
| are arranged lowest priority first. | |||
| """ | |||
| __slots__ = ['obj'] | |||
| def __init__(self, obj, *args): | |||
| self.obj = obj | |||
| def __lt__(self, other): | |||
| if self.obj.is_empty and not other.obj.is_empty: return True | |||
| if self.obj.priority > other.obj.priority: return True | |||
| return False | |||
| def __gt__(self, other): | |||
| if self.obj.is_empty and not other.obj.is_empty: return True | |||
| if self.obj.priority < other.obj.priority: return True | |||
| return False | |||
| class ForestToTreeVisitor(ForestVisitor): | |||
| """ | |||
| A Forest visitor which converts an SPPF forest to an unambiguous AST. | |||
| The implementation in this visitor walks only the first child | |||
| of each symbol node. When it finds an ambiguous symbol node it first | |||
| calls the forest_sum_visitor implementation to sort the children | |||
| into preference order using the algorithms defined there, so the first | |||
| child should always be the highest preference. The forest_sum_visitor | |||
| implementation should be another ForestVisitor which sorts the children | |||
| according to some priority mechanism. | |||
| """ | |||
| def __init__(self, root, forest_sum_visitor = ForestSumVisitor, callbacks = None): | |||
| super(ForestToTreeVisitor, self).__init__(root) | |||
| self.forest_sum_visitor = forest_sum_visitor | |||
| self.output_stack = deque() | |||
| self.callbacks = callbacks | |||
| self.result = None | |||
| def visit_token_node(self, node): | |||
| self.output_stack[-1].append(node) | |||
| def visit_symbol_node_in(self, node): | |||
| if node.is_ambiguous and node.priority is None: | |||
| self.forest_sum_visitor(node).go() | |||
| return next(iter(node.children)) | |||
| def visit_packed_node_in(self, node): | |||
| if not node.parent.is_intermediate: | |||
| self.output_stack.append([]) | |||
| return iter([node.left, node.right]) | |||
| def visit_packed_node_out(self, node): | |||
| if not node.parent.is_intermediate: | |||
| result = self.callbacks[node.rule](self.output_stack.pop()) | |||
| if self.output_stack: | |||
| self.output_stack[-1].append(result) | |||
| else: | |||
| self.result = result | |||
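For reference, this visitor is driven exactly as in the earley.py hunk above; a caller holding an SPPF root and the rule callbacks would invoke it the same way:

```python
# 'root' is the completed start-symbol SymbolNode; 'callbacks' maps each Rule to a tree-building callable.
tree = ForestToTreeVisitor(root, ForestSumVisitor, callbacks).go()
```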
| class ForestToAmbiguousTreeVisitor(ForestVisitor): | |||
| """ | |||
| A Forest visitor which converts an SPPF forest to an ambiguous AST. | |||
| Because of the fundamental disparity between what can be stored in | |||
| an SPPF and what can be stored in a Tree, this implementation is not | |||
| complete. It correctly deals with ambiguities that occur on symbol nodes only, | |||
| and cannot deal with ambiguities that occur on intermediate nodes. | |||
| Usually, most parsers can be rewritten to avoid intermediate node | |||
| ambiguities. This implementation could also be fixed, but the code to | |||
| handle intermediate node ambiguities is messy and | |||
| would not be performant. It is much better not to use this and | |||
| instead to correctly disambiguate the forest and only store unambiguous | |||
| parses in Trees. It is here just to provide some parity with the | |||
| old ambiguity='explicit'. | |||
| This is mainly used by the test framework, to make it simpler to write | |||
| tests ensuring the SPPF contains the right results. | |||
| """ | |||
| def __init__(self, root, callbacks): | |||
| super(ForestToAmbiguousTreeVisitor, self).__init__(root) | |||
| self.output_stack = deque() | |||
| self.callbacks = callbacks | |||
| self.result = None | |||
| def visit_token_node(self, node): | |||
| self.output_stack[-1].children.append(node) | |||
| def visit_symbol_node_in(self, node): | |||
| if not node.is_intermediate and node.is_ambiguous: | |||
| self.output_stack.append(Tree('_ambig', [])) | |||
| return iter(node.children) | |||
| def visit_symbol_node_out(self, node): | |||
| if node.is_ambiguous: | |||
| result = self.output_stack.pop() | |||
| if self.output_stack: | |||
| self.output_stack[-1].children.append(result) | |||
| else: | |||
| self.result = result | |||
| def visit_packed_node_in(self, node): | |||
| #### NOTE: | |||
| ## When an intermediate node (node.parent.s == tuple) has ambiguous children this | |||
| ## forest visitor will break. | |||
| if not node.parent.is_intermediate: | |||
| self.output_stack.append(Tree('drv', [])) | |||
| return iter([node.left, node.right]) | |||
| def visit_packed_node_out(self, node): | |||
| if not node.parent.is_intermediate: | |||
| result = self.callbacks[node.rule](self.output_stack.pop().children) | |||
| if self.output_stack: | |||
| self.output_stack[-1].children.append(result) | |||
| else: | |||
| self.result = result | |||
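As the docstring notes, this ambiguous variant is intended mainly for tests; with `resolve_ambiguity=False` the parser hands back the SPPF root, which can then be turned into an `_ambig`-bearing tree like so (sketch, names assumed):

```python
# 'root' is the SPPF root returned when ambiguity is kept; 'callbacks' as above.
ambig_tree = ForestToAmbiguousTreeVisitor(root, callbacks).go()
```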
| @@ -1,109 +0,0 @@ | |||
| from ..utils import compare | |||
| from functools import cmp_to_key | |||
| from ..tree import Tree | |||
| # Standard ambiguity resolver (uses comparison) | |||
| # | |||
| # Author: Erez Sh | |||
| def _compare_rules(rule1, rule2): | |||
| return -compare( len(rule1.expansion), len(rule2.expansion)) | |||
| def _sum_priority(tree): | |||
| p = 0 | |||
| for n in tree.iter_subtrees(): | |||
| try: | |||
| p += n.meta.rule.options.priority or 0 | |||
| except AttributeError: | |||
| pass | |||
| return p | |||
| def _compare_priority(tree1, tree2): | |||
| tree1.iter_subtrees() | |||
| def _compare_drv(tree1, tree2): | |||
| try: | |||
| rule1 = tree1.meta.rule | |||
| except AttributeError: | |||
| rule1 = None | |||
| try: | |||
| rule2 = tree2.meta.rule | |||
| except AttributeError: | |||
| rule2 = None | |||
| if None == rule1 == rule2: | |||
| return compare(tree1, tree2) | |||
| elif rule1 is None: | |||
| return -1 | |||
| elif rule2 is None: | |||
| return 1 | |||
| assert tree1.data != '_ambig' | |||
| assert tree2.data != '_ambig' | |||
| p1 = _sum_priority(tree1) | |||
| p2 = _sum_priority(tree2) | |||
| c = (p1 or p2) and compare(p1, p2) | |||
| if c: | |||
| return c | |||
| c = _compare_rules(tree1.meta.rule, tree2.meta.rule) | |||
| if c: | |||
| return c | |||
| # rules are "equal", so compare trees | |||
| if len(tree1.children) == len(tree2.children): | |||
| for t1, t2 in zip(tree1.children, tree2.children): | |||
| c = _compare_drv(t1, t2) | |||
| if c: | |||
| return c | |||
| return compare(len(tree1.children), len(tree2.children)) | |||
| def _standard_resolve_ambig(tree): | |||
| assert tree.data == '_ambig' | |||
| key_f = cmp_to_key(_compare_drv) | |||
| best = max(tree.children, key=key_f) | |||
| assert best.data == 'drv' | |||
| tree.set('drv', best.children) | |||
| tree.meta.rule = best.meta.rule # needed for applying callbacks | |||
| def standard_resolve_ambig(tree): | |||
| for ambig in tree.find_data('_ambig'): | |||
| _standard_resolve_ambig(ambig) | |||
| return tree | |||
| # Anti-score Sum | |||
| # | |||
| # Author: Uriva (https://github.com/uriva) | |||
| def _antiscore_sum_drv(tree): | |||
| if not isinstance(tree, Tree): | |||
| return 0 | |||
| assert tree.data != '_ambig' | |||
| return _sum_priority(tree) | |||
| def _antiscore_sum_resolve_ambig(tree): | |||
| assert tree.data == '_ambig' | |||
| best = min(tree.children, key=_antiscore_sum_drv) | |||
| assert best.data == 'drv' | |||
| tree.set('drv', best.children) | |||
| tree.meta.rule = best.meta.rule # needed for applying callbacks | |||
| def antiscore_sum_resolve_ambig(tree): | |||
| for ambig in tree.find_data('_ambig'): | |||
| _antiscore_sum_resolve_ambig(ambig) | |||
| return tree | |||
| @@ -1,107 +1,163 @@ | |||
| "This module implements an experimental Earley Parser with a dynamic lexer" | |||
| # The parser uses a parse-forest to keep track of derivations and ambiguations. | |||
| # When the parse ends successfully, a disambiguation stage resolves all ambiguity | |||
| # (right now ambiguity resolution is not developed beyond the needs of lark) | |||
| # Afterwards the parse tree is reduced (transformed) according to user callbacks. | |||
| # I use the no-recursion version of Transformer and Visitor, because the tree might be | |||
| # deeper than Python's recursion limit (a bit absurd, but that's life) | |||
| # | |||
| # The algorithm keeps track of each state set, using a corresponding Column instance. | |||
| # Column keeps track of new items using NewsList instances. | |||
| # | |||
| # Instead of running a lexer beforehand, or using a costy char-by-char method, this parser | |||
| # uses regular expressions by necessity, achieving high-performance while maintaining all of | |||
| # Earley's power in parsing any CFG. | |||
| # | |||
| # | |||
| """This module implements an experimental Earley parser with a dynamic lexer | |||
| The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: | |||
| https://www.sciencedirect.com/science/article/pii/S1571066108001497 | |||
| That is probably the best reference for understanding the algorithm here. | |||
| The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format | |||
| is better documented here: | |||
| http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ | |||
| Instead of running a lexer beforehand, or using a costly char-by-char method, this parser | |||
| uses regular expressions by necessity, achieving high-performance while maintaining all of | |||
| Earley's power in parsing any CFG. | |||
| """ | |||
| # Author: Erez Shinan (2017) | |||
| # Email : erezshin@gmail.com | |||
| from collections import defaultdict | |||
| from collections import defaultdict, deque | |||
| from ..exceptions import ParseError, UnexpectedCharacters | |||
| from ..lexer import Token | |||
| from ..tree import Tree | |||
| from .grammar_analysis import GrammarAnalyzer | |||
| from ..grammar import NonTerminal, Terminal | |||
| from .earley import ApplyCallbacks, Item, Column | |||
| from .earley import ApplyCallbacks | |||
| from .earley_common import Column, Item | |||
| from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode | |||
| class Parser: | |||
| def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False, complete_lex=False): | |||
| self.analysis = GrammarAnalyzer(parser_conf) | |||
| def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor, ignore = (), complete_lex = False): | |||
| analysis = GrammarAnalyzer(parser_conf) | |||
| self.parser_conf = parser_conf | |||
| self.resolve_ambiguity = resolve_ambiguity | |||
| self.forest_sum_visitor = forest_sum_visitor | |||
| self.ignore = [Terminal(t) for t in ignore] | |||
| self.predict_all = predict_all | |||
| self.complete_lex = complete_lex | |||
| self.FIRST = self.analysis.FIRST | |||
| self.postprocess = {} | |||
| self.FIRST = analysis.FIRST | |||
| self.callbacks = {} | |||
| self.predictions = {} | |||
| ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than | |||
| # the slow 'isupper' in is_terminal. | |||
| self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } | |||
| self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } | |||
| for rule in parser_conf.rules: | |||
| self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) | |||
| self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||
| self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None) | |||
| self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] | |||
| self.term_matcher = term_matcher | |||
| def parse(self, stream, start_symbol=None): | |||
| # Define parser functions | |||
| start_symbol = NonTerminal(start_symbol or self.parser_conf.start) | |||
| delayed_matches = defaultdict(list) | |||
| match = self.term_matcher | |||
| text_line = 1 | |||
| text_column = 1 | |||
| # Held Completions ('H' in E. Scott's paper). | |||
| held_completions = {} | |||
| def predict(nonterm, column): | |||
| assert not nonterm.is_term, nonterm | |||
| return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | |||
| def complete(item): | |||
| name = item.rule.origin | |||
| return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name] | |||
| def predict_and_complete(column): | |||
| while True: | |||
| to_predict = {x.expect for x in column.to_predict.get_news() | |||
| if x.ptr} # if not part of an already predicted batch | |||
| to_reduce = column.to_reduce.get_news() | |||
| if not (to_predict or to_reduce): | |||
| break | |||
| for nonterm in to_predict: | |||
| column.add( predict(nonterm, column) ) | |||
| for item in to_reduce: | |||
| new_items = list(complete(item)) | |||
| if item in new_items: | |||
| raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) | |||
| column.add(new_items) | |||
| def scan(i, column): | |||
| to_scan = column.to_scan | |||
| # Cache for nodes & tokens created in a particular parse step. | |||
| node_cache = {} | |||
| token_cache = {} | |||
| for x in self.ignore: | |||
| m = match(x, stream, i) | |||
| if m: | |||
| delayed_matches[m.end()] += set(to_scan) | |||
| delayed_matches[m.end()] += set(column.to_reduce) | |||
| # TODO add partial matches for ignore too? | |||
| # s = m.group(0) | |||
| # for j in range(1, len(s)): | |||
| # m = x.match(s[:-j]) | |||
| # if m: | |||
| # delayed_matches[m.end()] += to_scan | |||
| text_line = 1 | |||
| text_column = 1 | |||
| for item in to_scan: | |||
| def make_symbol_node(s, start, end): | |||
| label = (s, start.i, end.i) | |||
| if label in node_cache: | |||
| node = node_cache[label] | |||
| else: | |||
| node = node_cache[label] = SymbolNode(s, start, end) | |||
| return node | |||
| def predict_and_complete(column, to_scan): | |||
| """The core Earley Predictor and Completer. | |||
| At each stage of the input, we handle any completed items (things | |||
| that matched on the last cycle) and use those to predict what should | |||
| come next in the input stream. The completions and any predicted | |||
| non-terminals are recursively processed until we reach a set of items | |||
| expecting terminals, which can be added to the scan list for the next scanner cycle.""" | |||
| held_completions.clear() | |||
| # R (items) = Ei (column.items) | |||
| items = deque(column.items) | |||
| while items: | |||
| item = items.pop() # remove an element, A say, from R | |||
| ### The Earley completer | |||
| if item.is_complete: ### (item.s == string) | |||
| if item.node is None: | |||
| item.node = make_symbol_node(item.s, item.start, column) | |||
| item.node.add_family(item.s, item.rule, item.start, None, None) | |||
| # Empty has 0 length. If we complete an empty symbol in a particular | |||
| # parse step, we need to be able to use that same empty symbol to complete | |||
| # any predictions that result, that themselves require empty. Avoids | |||
| # infinite recursion on empty symbols. | |||
| # held_completions is 'H' in E.Scott's paper. | |||
| is_empty_item = item.start.i == column.i | |||
| if is_empty_item: | |||
| held_completions[item.rule.origin] = item.node | |||
| originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] | |||
| for originator in originators: | |||
| new_item = originator.advance() | |||
| new_item.node = make_symbol_node(new_item.s, originator.start, column) | |||
| new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) | |||
| if new_item.expect in self.TERMINALS: | |||
| # Add (B :: aC.B, h, y) to Q | |||
| to_scan.add(new_item) | |||
| elif new_item not in column.items: | |||
| # Add (B :: aC.B, h, y) to Ei and R | |||
| column.add(new_item) | |||
| items.append(new_item) | |||
| ### The Earley predictor | |||
| elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) | |||
| new_items = [] | |||
| for rule in self.predictions[item.expect]: | |||
| new_item = Item(rule, 0, column) | |||
| new_items.append(new_item) | |||
| # Process any held completions (H). | |||
| if item.expect in held_completions: | |||
| new_item = item.advance() | |||
| new_item.node = make_symbol_node(new_item.s, item.start, column) | |||
| new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) | |||
| new_items.append(new_item) | |||
| for new_item in new_items: | |||
| if new_item.expect in self.TERMINALS: | |||
| to_scan.add(new_item) | |||
| elif new_item not in column.items: | |||
| column.add(new_item) | |||
| items.append(new_item) | |||
| def scan(i, column, to_scan): | |||
| """The core Earley Scanner. | |||
| This is a custom implementation of the scanner that uses the | |||
| Lark lexer to match tokens. The scan list is built by the | |||
| Earley predictor, based on the previously completed tokens. | |||
| This ensures that at each phase of the parse we have a custom | |||
| lexer context, allowing for more complex ambiguities.""" | |||
| # 1) Loop the expectations and ask the lexer to match. | |||
| # Since regexp is forward looking on the input stream, and we only | |||
| # want to process tokens when we hit the point in the stream at which | |||
| # they complete, we push all tokens into a buffer (delayed_matches), to | |||
| # be held possibly for a later parse step when we reach the point in the | |||
| # input stream at which they complete. | |||
| for item in set(to_scan): | |||
| m = match(item.expect, stream, i) | |||
| if m: | |||
| t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
| delayed_matches[m.end()].append(item.advance(t)) | |||
| delayed_matches[m.end()].append( (item, column, t) ) | |||
| if self.complete_lex: | |||
| s = m.group(0) | |||
| @@ -109,25 +165,85 @@ class Parser: | |||
| m = match(item.expect, s[:-j]) | |||
| if m: | |||
| t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
| delayed_matches[i+m.end()].append(item.advance(t)) | |||
| delayed_matches[i+m.end()].append( (item, column, t) ) | |||
| # Remove any items that successfully matched in this pass from the to_scan buffer. | |||
| # This ensures we don't carry over tokens that already matched, if we're ignoring below. | |||
| to_scan.remove(item) | |||
| # 3) Process any ignores. This is typically used for e.g. whitespace. | |||
| # We carry over any unmatched items from the to_scan buffer to be matched again after | |||
| # the ignore. This should allow us to use ignored symbols in non-terminals to implement | |||
| # e.g. mandatory spacing. | |||
| for x in self.ignore: | |||
| m = match(x, stream, i) | |||
| if m: | |||
| # Carry over any items still in the scan buffer, to past the end of the ignored items. | |||
| delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ]) | |||
| # If we're ignoring up to the end of the file, carry over the start symbol if it already completed. | |||
| delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol]) | |||
| next_set = Column(i + 1, self.FIRST) # Ei+1 | |||
| next_to_scan = set() | |||
| ## 4) Process Tokens from delayed_matches. | |||
| # This is the core of the Earley scanner. Create an SPPF node for each Token, | |||
| # and create the symbol node in the SPPF tree. Advance the item that completed, | |||
| # and add the resulting new item to either the Earley set (for processing by the | |||
| # completer/predictor) or the to_scan buffer for the next parse step. | |||
| for item, start, token in delayed_matches[i+1]: | |||
| if token is not None: | |||
| new_item = item.advance() | |||
| new_item.node = make_symbol_node(new_item.s, new_item.start, column) | |||
| new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) | |||
| else: | |||
| new_item = item | |||
| if new_item.expect in self.TERMINALS: | |||
| # add (B ::= Aai+1.B, h, y) to Q' | |||
| next_to_scan.add(new_item) | |||
| else: | |||
| # add (B ::= Aa+1.B, h, y) to Ei+1 | |||
| next_set.add(new_item) | |||
| next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) | |||
| next_set.add(delayed_matches[i+1]) | |||
| del delayed_matches[i+1] # No longer needed, so unburden memory | |||
| if not next_set and not delayed_matches: | |||
| if not next_set and not delayed_matches and not next_to_scan: | |||
| raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan)) | |||
| return next_set | |||
| return next_set, next_to_scan | |||
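As a brief aside on the mechanism the scanner comments above describe: every regex match is keyed by the stream position at which it ends, and is only consumed once the main loop reaches that position. Below is a minimal, self-contained sketch of that buffering idea; the terminal names, patterns, and input string are illustrative and are not taken from this patch.

```python
import re
from collections import defaultdict

# Illustrative terminals and input; not part of the patch.
patterns = {'WORD': re.compile(r'[a-z]+'), 'NUM': re.compile(r'[0-9]+')}
stream = 'abc123'

delayed_matches = defaultdict(list)  # end position in the stream -> matches that complete there

pos = 0
while pos < len(stream):
    for name, pattern in patterns.items():
        m = pattern.match(stream, pos)
        if m:
            # Held until the scan step for position m.end() runs, i.e. the point
            # in the input stream at which the token completes.
            delayed_matches[m.end()].append((name, m.group(0)))
    # The real scanner advances one character per Earley step; mimic that here.
    pos += 1

print(dict(delayed_matches))
# e.g. {3: [('WORD', 'abc'), ...], 6: [('NUM', '123'), ...]}
```

Keying on the end position is what lets a match made at step i be consumed several steps later, which is how the dynamic lexer can cope with overlapping and ignored tokens.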
| # Main loop starts | |||
| column0 = Column(0, self.FIRST, predict_all=self.predict_all) | |||
| column0.add(predict(start_symbol, column0)) | |||
| column0 = Column(0, self.FIRST) | |||
| column = column0 | |||
| ## The scan buffer. 'Q' in E.Scott's paper. | |||
| to_scan = set() | |||
| ## Predict for the start_symbol. | |||
| # Add predicted items to the first Earley set (for the predictor) if they | |||
| # result in a non-terminal, or the scanner if they result in a terminal. | |||
| for rule in self.predictions[start_symbol]: | |||
| item = Item(rule, 0, column0) | |||
| if item.expect in self.TERMINALS: | |||
| to_scan.add(item) | |||
| else: | |||
| column.add(item) | |||
| ## The main Earley loop. | |||
| # Run the Prediction/Completion cycle for any Items in the current Earley set. | |||
| # Completions will be added to the SPPF tree, and predictions will be recursively | |||
| # processed down to terminals/empty nodes to be added to the scanner for the next | |||
| # step. | |||
| for i, token in enumerate(stream): | |||
| predict_and_complete(column) | |||
| column = scan(i, column) | |||
| predict_and_complete(column, to_scan) | |||
| # Clear the node_cache and token_cache, which are only relevant for each | |||
| # step in the Earley pass. | |||
| node_cache.clear() | |||
| token_cache.clear() | |||
| column, to_scan = scan(i, column, to_scan) | |||
| if token == '\n': | |||
| text_line += 1 | |||
| @@ -135,24 +251,24 @@ class Parser: | |||
| else: | |||
| text_column += 1 | |||
| predict_and_complete(column) | |||
| predict_and_complete(column, to_scan) | |||
| # Parse ended. Now build a parse tree | |||
| solutions = [n.tree for n in column.to_reduce | |||
| if n.rule.origin==start_symbol and n.start is column0] | |||
| ## Column is now the final column in the parse. If the parse was successful, the start | |||
| # symbol should have been completed in the last step of the Earley cycle, and will be in | |||
| # this column. Find the item for the start_symbol, which is the root of the SPPF tree. | |||
| solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] | |||
| if not solutions: | |||
| expected_tokens = [t.expect for t in column.to_scan] | |||
| expected_tokens = [t.expect for t in to_scan] | |||
| raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens) | |||
| elif len(solutions) > 1: | |||
| raise Exception('Earley should not generate more than one start symbol - bug') | |||
| elif len(solutions) == 1: | |||
| tree = solutions[0] | |||
| else: | |||
| tree = Tree('_ambig', solutions) | |||
| if self.resolve_ambiguity: | |||
| tree = self.resolve_ambiguity(tree) | |||
| return ApplyCallbacks(self.postprocess).transform(tree) | |||
| ## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller. | |||
| # This means the caller can work directly with the SPPF tree. | |||
| if not self.resolve_ambiguity: | |||
| return solutions[0] | |||
| # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities | |||
| # according to the rules. | |||
| return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go() | |||
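With this change, parse() has two exits: when resolve_ambiguity is set, the SPPF is collapsed into a single Tree via ForestToTreeVisitor; otherwise the caller receives the root SPPF symbol node and works with the forest directly. A rough caller-side sketch of the second path, using the same visitor and attribute chain as the updated tests further down; the toy grammar is illustrative and not from the patch:

```python
from lark import Lark
from lark.parsers.earley_forest import ForestToAmbiguousTreeVisitor

# A deliberately ambiguous toy grammar (illustrative only): 'aa' can be
# parsed as two single-'a' x's or as one double-'a' x.
grammar = r"""
start: x+
x: "a"
 | "a" "a"
"""

parser = Lark(grammar, parser='earley', ambiguity='explicit')

# With ambiguity='explicit', parse() now hands back the root SPPF symbol node...
root_symbol = parser.parse('aa')

# ...which can be walked directly, or turned into a Tree in which ambiguities
# appear as '_ambig' nodes, the same way the updated tests below do it.
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go()
print(ambig_tree.pretty())
```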
| @@ -4,7 +4,7 @@ from .tree import Tree | |||
| from .visitors import Transformer_InPlace | |||
| from .common import ParserConf | |||
| from .lexer import Token, PatternStr | |||
| from .parsers import earley, resolve_ambig | |||
| from .parsers import earley | |||
| from .grammar import Rule, Terminal, NonTerminal | |||
| @@ -114,7 +114,7 @@ class Reconstructor: | |||
| def _reconstruct(self, tree): | |||
| # TODO: ambiguity? | |||
| parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=resolve_ambig.standard_resolve_ambig) | |||
| parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=True) | |||
| unreduced_tree = parser.parse(tree.children) # find a full derivation | |||
| assert unreduced_tree.data == tree.data | |||
| res = self.write_tokens.transform(unreduced_tree) | |||
| @@ -21,6 +21,8 @@ from lark.lark import Lark | |||
| from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput | |||
| from lark.tree import Tree | |||
| from lark.visitors import Transformer | |||
| from lark.parsers.earley_forest import ForestToAmbiguousTreeVisitor | |||
| from lark.parsers.earley import ApplyCallbacks | |||
| __path__ = os.path.dirname(__file__) | |||
| def _read(n, *args): | |||
| @@ -236,10 +238,11 @@ def _make_full_earley_test(LEXER): | |||
| """ | |||
| parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit') | |||
| res = parser.parse('ab') | |||
| self.assertEqual( res.data, '_ambig') | |||
| self.assertEqual( len(res.children), 2) | |||
| root_symbol = parser.parse('ab') | |||
| ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go() | |||
| # print(ambig_tree.pretty()) | |||
| self.assertEqual( ambig_tree.data, '_ambig') | |||
| self.assertEqual( len(ambig_tree.children), 2) | |||
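For contrast with the explicit-ambiguity path exercised above, here is a small sketch (not part of the test suite) of the default path: without ambiguity='explicit', parse() still resolves ambiguity internally and returns a plain Tree, so existing callers do not need the forest visitor. The grammar is illustrative only.

```python
from lark import Lark

# Hypothetical minimal grammar, used only to show the default (non-explicit) path.
g = '''
start: "a" "b"
'''
tree = Lark(g, parser='earley').parse('ab')
print(tree.data)  # 'start' -- a plain Tree, no forest handling required
```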
| def test_ambiguity1(self): | |||
| grammar = """ | |||
| @@ -251,9 +254,35 @@ def _make_full_earley_test(LEXER): | |||
| """ | |||
| l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) | |||
| x = l.parse('cde') | |||
| assert x.data == '_ambig', x | |||
| assert len(x.children) == 2 | |||
| root_symbol = l.parse('cde') | |||
| ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, l.parser.parser.callbacks).go() | |||
| # print(ambig_tree.pretty()) | |||
| # tree = ApplyCallbacks(l.parser.parser.postprocess).transform(ambig_tree) | |||
| assert ambig_tree.data == '_ambig', ambig_tree | |||
| assert len(ambig_tree.children) == 2 | |||
| @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") | |||
| def test_ambiguity2(self): | |||
| grammar = """ | |||
| ANY: /[a-zA-Z0-9 ]+/ | |||
| a.2: "A" b+ | |||
| b.2: "B" | |||
| c: ANY | |||
| start: (a|c)* | |||
| """ | |||
| l = Lark(grammar, parser='earley', lexer=LEXER) | |||
| res = l.parse('ABX') | |||
| expected = Tree('start', [ | |||
| Tree('a', [ | |||
| Tree('b', []) | |||
| ]), | |||
| Tree('c', [ | |||
| 'X' | |||
| ]) | |||
| ]) | |||
| self.assertEqual(res, expected) | |||
| def test_fruitflies_ambig(self): | |||
| grammar = """ | |||
| @@ -272,7 +301,9 @@ def _make_full_earley_test(LEXER): | |||
| %ignore WS | |||
| """ | |||
| parser = Lark(grammar, ambiguity='explicit', lexer=LEXER) | |||
| res = parser.parse('fruit flies like bananas') | |||
| root_symbol = parser.parse('fruit flies like bananas') | |||
| tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go() | |||
| # tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree) | |||
| expected = Tree('_ambig', [ | |||
| Tree('comparative', [ | |||
| @@ -290,7 +321,9 @@ def _make_full_earley_test(LEXER): | |||
| # print res.pretty() | |||
| # print expected.pretty() | |||
| self.assertEqual(res, expected) | |||
| # self.assertEqual(tree, expected) | |||
| self.assertEqual(tree.data, expected.data) | |||
| self.assertEqual(set(tree.children), set(expected.children)) | |||
| @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser") | |||
| @@ -303,7 +336,9 @@ def _make_full_earley_test(LEXER): | |||
| text = """cat""" | |||
| parser = Lark(grammar, start='start', ambiguity='explicit') | |||
| tree = parser.parse(text) | |||
| root_symbol = parser.parse(text) | |||
| ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol).go() | |||
| tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree) | |||
| self.assertEqual(tree.data, '_ambig') | |||
| combinations = {tuple(str(s) for s in t.children) for t in tree.children} | |||