Merge remote-tracking branch 'origin/0.7b' into 0.7b
lark/parsers/earley.py
@@ -16,17 +16,19 @@ from ..visitors import Transformer_InPlace, v_args
 from ..exceptions import ParseError, UnexpectedToken
 from .grammar_analysis import GrammarAnalyzer
 from ..grammar import NonTerminal
-from .earley_common import Column, Item
-from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor
+from .earley_common import Item, TransitiveItem
+from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode
+from collections import deque, defaultdict

 class Parser:
     def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor):
         analysis = GrammarAnalyzer(parser_conf)
         self.parser_conf = parser_conf
         self.resolve_ambiguity = resolve_ambiguity
-        self.forest_sum_visitor = forest_sum_visitor

         self.FIRST = analysis.FIRST
+        self.NULLABLE = analysis.NULLABLE
         self.callbacks = {}
         self.predictions = {}

@@ -39,6 +41,7 @@ class Parser:
             self.callbacks[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
             self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

+        self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks)
         self.term_matcher = term_matcher
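The new `self.NULLABLE = analysis.NULLABLE` line pulls the set of nullable nonterminals out of the grammar analysis; `is_quasi_complete` below depends on it. A minimal sketch of how such a set can be computed by fixed-point iteration, assuming rules are plain (origin, expansion) pairs rather than GrammarAnalyzer's internal representation:

    # Illustrative fixed-point computation of a NULLABLE set; the rule
    # representation is an assumption, not GrammarAnalyzer's API.
    def compute_nullable(rules):
        nullable = set()
        changed = True
        while changed:
            changed = False
            for origin, expansion in rules:
                if origin in nullable:
                    continue
                # Nullable if every symbol of some expansion is nullable
                # (an empty expansion is trivially all-nullable).
                if all(sym in nullable for sym in expansion):
                    nullable.add(origin)
                    changed = True
        return nullable

    # 'a' can derive the empty string, 'b' cannot:
    assert compute_nullable([('a', []), ('b', ['X'])]) == {'a'}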
@@ -46,19 +49,78 @@ class Parser:
         # Define parser functions
         start_symbol = NonTerminal(start_symbol or self.parser_conf.start)

         match = self.term_matcher
-        held_completions = defaultdict(list)
+
+        # Held Completions (H in E.Scott's paper).
+        held_completions = {}
+
+        # Cache for nodes & tokens created in a particular parse step.
         node_cache = {}
         token_cache = {}
-
-        def make_symbol_node(s, start, end):
-            label = (s, start.i, end.i)
-            if label in node_cache:
-                node = node_cache[label]
-            else:
-                node = node_cache[label] = SymbolNode(s, start, end)
-            return node
+        columns = []
+        transitives = []
+
+        def is_quasi_complete(item):
+            if item.is_complete:
+                return True
+            quasi = item.advance()
+            while not quasi.is_complete:
+                symbol = quasi.expect
+                if symbol not in self.NULLABLE:
+                    return False
+                if quasi.rule.origin == start_symbol and symbol == start_symbol:
+                    return False
+                quasi = quasi.advance()
+            return True
+
+        def create_leo_transitives(item, trule, previous, visited = None):
+            if visited is None:
+                visited = set()
+
+            if item.rule.origin in transitives[item.start]:
+                previous = trule = transitives[item.start][item.rule.origin]
+                return trule, previous
+
+            is_empty_rule = not self.FIRST[item.rule.origin]
+            if is_empty_rule:
+                return trule, previous
+
+            originator = None
+            for key in columns[item.start]:
+                if key.expect is not None and key.expect == item.rule.origin:
+                    if originator is not None:
+                        return trule, previous
+                    originator = key
+
+            if originator is None:
+                return trule, previous
+
+            if originator in visited:
+                return trule, previous
+
+            visited.add(originator)
+            if not is_quasi_complete(originator):
+                return trule, previous
+
+            trule = originator.advance()
+            if originator.start != item.start:
+                visited.clear()
+
+            trule, previous = create_leo_transitives(originator, trule, previous, visited)
+            if trule is None:
+                return trule, previous
+
+            titem = None
+            if previous is not None:
+                titem = TransitiveItem(item.rule.origin, trule, originator, previous.column)
+                previous.next_titem = titem
+            else:
+                titem = TransitiveItem(item.rule.origin, trule, originator, item.start)
+
+            previous = transitives[item.start][item.rule.origin] = titem
+            return trule, previous
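`is_quasi_complete` and `create_leo_transitives` implement Joop Leo's optimization for right recursion: an item whose remaining symbols are all nullable will complete as soon as its expected nonterminal does, so chains of such completions can be memoized as TransitiveItems instead of being replayed at every step. A toy illustration of the quasi-complete test, with names that are illustrative rather than lark internals:

    # With rule A -> "x" A? and A? nullable, the item A -> "x" . A? is
    # quasi-complete: everything after the dot can derive the empty string.
    NULLABLE = {'A?'}

    def quasi_complete(expansion, ptr):
        return all(sym in NULLABLE for sym in expansion[ptr:])

    assert quasi_complete(['x', 'A?'], 1)        # dot before a nullable tail
    assert not quasi_complete(['x', 'A?'], 0)    # 'x' is a terminal, not nullable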
-        def predict_and_complete(column, to_scan):
+        def predict_and_complete(i, to_scan):
             """The core Earley Predictor and Completer.

             At each stage of the input, we handle any completed items (things
@@ -68,61 +130,90 @@ class Parser:
             which can be added to the scan list for the next scanner cycle."""
             held_completions.clear()
+            column = columns[i]
             # R (items) = Ei (column.items)
-            items = deque(column.items)
+            items = deque(column)
             while items:
                 item = items.pop()    # remove an element, A say, from R

                 ### The Earley completer
                 if item.is_complete:   ### (item.s == string)
                     if item.node is None:
-                        item.node = make_symbol_node(item.s, item.start, column)
+                        label = (item.s, item.start, i)
+                        item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                         item.node.add_family(item.s, item.rule, item.start, None, None)

-                    # Empty has 0 length. If we complete an empty symbol in a particular
-                    # parse step, we need to be able to use that same empty symbol to complete
-                    # any predictions that result, that themselves require empty. Avoids
-                    # infinite recursion on empty symbols.
-                    # held_completions is 'H' in E.Scott's paper.
-                    is_empty_item = item.start.i == column.i
-                    if is_empty_item:
-                        held_completions[item.rule.origin] = item.node
-
-                    originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
-                    for originator in originators:
-                        new_item = originator.advance()
-                        new_item.node = make_symbol_node(new_item.s, originator.start, column)
-                        new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
+                    create_leo_transitives(item, None, None)
+
+                    ###R Joop Leo right recursion Completer
+                    if item.rule.origin in transitives[item.start]:
+                        transitive = transitives[item.start][item.s]
+                        if transitive.previous in transitives[transitive.column]:
+                            root_transitive = transitives[transitive.column][transitive.previous]
+                        else:
+                            root_transitive = transitive
+
+                        label = (root_transitive.s, root_transitive.start, i)
+                        node = vn = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
+                        vn.add_path(root_transitive, item.node)
+
+                        new_item = Item(transitive.rule, transitive.ptr, transitive.start)
+                        new_item.node = vn
                         if new_item.expect in self.TERMINALS:
                             # Add (B :: aC.B, h, y) to Q
                             to_scan.add(new_item)
-                        elif new_item not in column.items:
+                        elif new_item not in column:
                             # Add (B :: aC.B, h, y) to Ei and R
                             column.add(new_item)
                             items.append(new_item)
+                    ###R Regular Earley completer
+                    else:
+                        # Empty has 0 length. If we complete an empty symbol in a particular
+                        # parse step, we need to be able to use that same empty symbol to complete
+                        # any predictions that result, that themselves require empty. Avoids
+                        # infinite recursion on empty symbols.
+                        # held_completions is 'H' in E.Scott's paper.
+                        is_empty_item = item.start == i
+                        if is_empty_item:
+                            held_completions[item.rule.origin] = item.node
+
+                        originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
+                        for originator in originators:
+                            new_item = originator.advance()
+                            label = (new_item.s, originator.start, i)
+                            new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
+                            new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
+                            if new_item.expect in self.TERMINALS:
+                                # Add (B :: aC.B, h, y) to Q
+                                to_scan.add(new_item)
+                            elif new_item not in column:
+                                # Add (B :: aC.B, h, y) to Ei and R
+                                column.add(new_item)
+                                items.append(new_item)

                 ### The Earley predictor
                 elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                     new_items = []
                     for rule in self.predictions[item.expect]:
-                        new_item = Item(rule, 0, column)
+                        new_item = Item(rule, 0, i)
                         new_items.append(new_item)

                     # Process any held completions (H).
                     if item.expect in held_completions:
                         new_item = item.advance()
-                        new_item.node = make_symbol_node(new_item.s, item.start, column)
+                        label = (new_item.s, item.start, i)
+                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                         new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                         new_items.append(new_item)

                     for new_item in new_items:
                         if new_item.expect in self.TERMINALS:
                             to_scan.add(new_item)
-                        elif new_item not in column.items:
+                        elif new_item not in column:
                             column.add(new_item)
                             items.append(new_item)
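The repeated `node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))` expression is a get-or-create keyed on the SPPF label (symbol, start, end); spelling it as a conditional avoids constructing a SymbolNode when the label is already cached. An equivalent helper over a plain dict, as a sketch:

    # Equivalent get-or-create; constructs the node only on a cache miss.
    def get_or_create_node(node_cache, label):
        node = node_cache.get(label)
        if node is None:
            node = node_cache[label] = SymbolNode(*label)
        return node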
-        def scan(i, token, column, to_scan):
+        def scan(i, token, to_scan):
             """The core Earley Scanner.

             This is a custom implementation of the scanner that uses the
@@ -130,12 +221,17 @@ class Parser:
             Earley predictor, based on the previously completed tokens.
             This ensures that at each phase of the parse we have a custom
             lexer context, allowing for more complex ambiguities."""
-            next_set = Column(i+1, self.FIRST)
             next_to_scan = set()
+            next_set = set()
+            columns.append(next_set)
+            next_transitives = dict()
+            transitives.append(next_transitives)

             for item in set(to_scan):
                 if match(item.expect, token):
                     new_item = item.advance()
-                    new_item.node = make_symbol_node(new_item.s, new_item.start, column)
+                    label = (new_item.s, new_item.start, i)
+                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                     new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)

@@ -149,11 +245,11 @@ class Parser:
                 expect = {i.expect.name for i in to_scan}
                 raise UnexpectedToken(token, expect, considered_rules = set(to_scan))

-            return next_set, next_to_scan
+            return next_to_scan
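With the Column class gone, per-position state lives in two parallel lists that grow by one entry per scanner step: `columns[i]` is the Earley set E_i and `transitives[i]` is the Leo memo for position i. The shape, as a sketch rather than lark's API:

    columns = [set()]        # columns[0] is E_0
    transitives = [dict()]   # transitives[0] maps origin -> TransitiveItem

    # Each call to scan(i, ...) appends E_{i+1} and its memo up front,
    # then fills them while matching tokens:
    columns.append(set())
    transitives.append(dict())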
         # Main loop starts
-        column0 = Column(0, self.FIRST)
-        column = column0
+        columns.append(set())
+        transitives.append(dict())

         ## The scan buffer. 'Q' in E.Scott's paper.
         to_scan = set()

@@ -162,32 +258,34 @@ class Parser:
         # Add predicted items to the first Earley set (for the predictor) if they
         # result in a non-terminal, or the scanner if they result in a terminal.
         for rule in self.predictions[start_symbol]:
-            item = Item(rule, 0, column0)
+            item = Item(rule, 0, 0)
             if item.expect in self.TERMINALS:
                 to_scan.add(item)
             else:
-                column.add(item)
+                columns[0].add(item)

         ## The main Earley loop.
         # Run the Prediction/Completion cycle for any Items in the current Earley set.
         # Completions will be added to the SPPF tree, and predictions will be recursively
         # processed down to terminals/empty nodes to be added to the scanner for the next
         # step.
-        for i, token in enumerate(stream):
-            predict_and_complete(column, to_scan)
+        i = 0
+        for token in stream:
+            predict_and_complete(i, to_scan)

             # Clear the node_cache and token_cache, which are only relevant for each
             # step in the Earley pass.
             node_cache.clear()
             token_cache.clear()
-            column, to_scan = scan(i, token, column, to_scan)
+            to_scan = scan(i, token, to_scan)
+            i += 1

-        predict_and_complete(column, to_scan)
+        predict_and_complete(i, to_scan)

         ## Column is now the final column in the parse. If the parse was successful, the start
         # symbol should have been completed in the last step of the Earley cycle, and will be in
         # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
-        solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]
+        solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]

         if not solutions:
             raise ParseError('Incomplete parse: Could not find a solution to input')

@@ -201,7 +299,7 @@ class Parser:
         # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities
         # according to the rules.
-        return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go()
+        return self.forest_tree_visitor.go(solutions[0])

 class ApplyCallbacks(Transformer_InPlace):
     def __init__(self, postprocess):
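None of this is visible to callers; the Earley machinery above sits behind the ordinary lark entry point. A usage sketch against the public API of this era (the grammar itself is illustrative):

    from lark import Lark

    # Right recursion is exactly the shape Leo's optimization targets:
    # without it, completing "x" * n replays O(n) completions per step.
    parser = Lark('''
        start: a
        a: "x" a
         |
    ''', parser='earley')

    tree = parser.parse('xxxx')
    print(tree.pretty())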
lark/parsers/earley_common.py
@@ -13,27 +13,12 @@
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com

-## for recursive repr
-from ..tree import Tree
-
-class Derivation(Tree):
-    def __init__(self, rule, children = None):
-        Tree.__init__(self, 'drv', children if children is not None else [])
-        self.meta.rule = rule
-        self._hash = None
-
-    def __repr__(self, indent = 0):
-        return 'Derivation(%s, %s, %s)' % (self.data, self.rule.origin, '...')
-
-    def __hash__(self):
-        if self._hash is None:
-            self._hash = Tree.__hash__(self)
-        return self._hash
-
-from ..grammar import NonTerminal, Terminal
-
 class Item(object):
     "An Earley Item, the atom of the algorithm."

-    __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash')
+    __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash')
     def __init__(self, rule, ptr, start):
         self.is_complete = len(rule.expansion) == ptr
         self.rule = rule    # rule
@@ -43,38 +28,48 @@ class Item(object):
         if self.is_complete:
             self.s = rule.origin
             self.expect = None
+            self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
         else:
             self.s = (rule, ptr)
             self.expect = rule.expansion[ptr]
-        self._hash = hash((self.s, self.start.i))
+            self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
+        self._hash = hash((self.s, self.start))

     def advance(self):
-        return self.__class__(self.rule, self.ptr + 1, self.start)
+        return Item(self.rule, self.ptr + 1, self.start)

     def __eq__(self, other):
-        return self is other or (self.s == other.s and self.start.i == other.start.i)
+        return self is other or (self.s == other.s and self.start == other.start)

     def __hash__(self):
         return self._hash

     def __repr__(self):
-        return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i)
-
-class Column:
-    "An entry in the table, aka Earley Chart. Contains lists of items."
-    def __init__(self, i, FIRST):
-        self.i = i
-        self.items = set()
-        self.FIRST = FIRST
-
-    def add(self, item):
-        """Sort items into scan/predict/reduce newslists
-
-        Makes sure only unique items are added.
-        """
-        self.items.add(item)
-
-    def __bool__(self):
-        return bool(self.items)
-
-    __nonzero__ = __bool__   # Py2 backwards-compatibility
+        before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] )
+        after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] )
+        symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after))
+        return '%s (%d)' % (symbol, self.start)
+
+class TransitiveItem(Item):
+    __slots__ = ('recognized', 'reduction', 'column', 'next_titem')
+    def __init__(self, recognized, trule, originator, start):
+        super(TransitiveItem, self).__init__(trule.rule, trule.ptr, trule.start)
+        self.recognized = recognized
+        self.reduction = originator
+        self.column = start
+        self.next_titem = None
+        self._hash = hash((self.s, self.start, self.recognized))
+
+    def __eq__(self, other):
+        if not isinstance(other, TransitiveItem):
+            return False
+        return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.recognized == other.recognized)
+
+    def __hash__(self):
+        return self._hash
+
+    def __repr__(self):
+        before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] )
+        after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] )
+        return '{} : {} -> {}* {} ({}, {})'.format(self.recognized.name, self.rule.origin.name, ' '.join(before), ' '.join(after), self.column, self.start)
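Item's new `__repr__` (and TransitiveItem's) render items as dotted rules, with `*` marking the parse position. A toy reconstruction of the format, using plain strings instead of lark's Rule and Symbol objects:

    origin, expansion, ptr, start = 'a', ['X', 'a'], 1, 0
    before = ' '.join(expansion[:ptr])
    after = ' '.join(expansion[ptr:])
    symbol = "{} ::= {}* {}".format(origin, before, after)
    print('%s (%d)' % (symbol, start))   # -> a ::= X* a (0)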
lark/parsers/earley_forest.py
@@ -7,14 +7,15 @@ Full reference and more details is here:
 http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
 """

+from random import randint
 from ..tree import Tree
 from ..exceptions import ParseError
 from ..lexer import Token
 from ..utils import Str
-from ..grammar import NonTerminal, Terminal
-from .earley_common import Column, Derivation
+from ..grammar import NonTerminal, Terminal, Symbol
 from collections import deque
+from importlib import import_module

 class ForestNode(object):
     pass
@@ -33,36 +34,65 @@ class SymbolNode(ForestNode):
     Hence a Symbol Node with a single child is unambiguous.
     """
-    __slots__ = ('s', 'start', 'end', 'children', 'priority', 'is_intermediate')
+    __slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash')
     def __init__(self, s, start, end):
         self.s = s
         self.start = start
         self.end = end
-        self.children = set()
+        self._children = set()
+        self.paths = set()
+        self.paths_loaded = False
         self.priority = None
         self.is_intermediate = isinstance(s, tuple)
+        self._hash = hash((self.s, self.start, self.end))

     def add_family(self, lr0, rule, start, left, right):
-        self.children.add(PackedNode(self, lr0, rule, start, left, right))
+        self._children.add(PackedNode(self, lr0, rule, start, left, right))

+    def add_path(self, transitive, node):
+        self.paths.add((transitive, node))
+
+    def load_paths(self):
+        for transitive, node in self.paths:
+            if transitive.next_titem is not None:
+                vn = SymbolNode(transitive.next_titem.s, transitive.next_titem.start, self.end)
+                vn.add_path(transitive.next_titem, node)
+                self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, vn)
+            else:
+                self.add_family(transitive.reduction.rule.origin, transitive.reduction.rule, transitive.reduction.start, transitive.reduction.node, node)
+        self.paths_loaded = True
+
     @property
     def is_ambiguous(self):
         return len(self.children) > 1

+    @property
+    def children(self):
+        if not self.paths_loaded:
+            self.load_paths()
+        return self._children
+
     def __iter__(self):
-        return iter(self.children)
+        return iter(self._children)

     def __eq__(self, other):
         if not isinstance(other, SymbolNode):
             return False
-        return self is other or (self.s == other.s and self.start == other.start and self.end is other.end)
+        return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.end is other.end)

     def __hash__(self):
-        return hash((self.s, self.start.i, self.end.i))
+        return self._hash

     def __repr__(self):
-        symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name
-        return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0)
+        if self.is_intermediate:
+            rule = self.s[0]
+            ptr = self.s[1]
+            before = ( expansion.name for expansion in rule.expansion[:ptr] )
+            after = ( expansion.name for expansion in rule.expansion[ptr:] )
+            symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
+        else:
+            symbol = self.s.name
+        return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority if self.priority is not None else 0)
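SymbolNode now records Leo paths cheaply via `add_path` and only expands them into packed families when `children` is first read; `load_paths` walks the chain of `next_titem` links at that point. The lazy-property guard, in isolation (a toy class, not lark's ForestNode):

    class LazyChildren(object):
        __slots__ = ('_children', 'paths_loaded')

        def __init__(self):
            self._children = set()
            self.paths_loaded = False

        def load_paths(self):
            # Expensive construction deferred until first access.
            self._children.add('derivation family')
            self.paths_loaded = True

        @property
        def children(self):
            if not self.paths_loaded:
                self.load_paths()
            return self._children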
 class PackedNode(ForestNode):
     """
@@ -77,7 +107,7 @@ class PackedNode(ForestNode):
         self.left = left
         self.right = right
         self.priority = None
-        self._hash = hash((self.s, self.start.i, self.left, self.right))
+        self._hash = hash((self.s, self.start, self.left, self.right))

     @property
     def is_empty(self):

@@ -105,8 +135,15 @@ class PackedNode(ForestNode):
         return self._hash

     def __repr__(self):
-        symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name
-        return "{%s, %d, %s, %s, %s}" % (symbol, self.start.i, self.left, self.right, self.priority if self.priority is not None else 0)
+        if isinstance(self.s, tuple):
+            rule = self.s[0]
+            ptr = self.s[1]
+            before = ( expansion.name for expansion in rule.expansion[:ptr] )
+            after = ( expansion.name for expansion in rule.expansion[ptr:] )
+            symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
+        else:
+            symbol = self.s.name
+        return "({}, {}, {})".format(symbol, self.start, self.priority)
 class ForestVisitor(object):
     """
@@ -114,9 +151,7 @@ class ForestVisitor(object):
     Use this as a base when you need to walk the forest.
     """
-    def __init__(self, root):
-        self.root = root
-        self.result = None
+    __slots__ = ['result']

     def visit_token_node(self, node): pass
     def visit_symbol_node_in(self, node): pass
@@ -124,7 +159,8 @@ class ForestVisitor(object):
     def visit_packed_node_in(self, node): pass
     def visit_packed_node_out(self, node): pass

-    def go(self):
+    def go(self, root):
+        self.result = None
         # Visiting is a list of IDs of all symbol/intermediate nodes currently in
         # the stack. It serves two purposes: to detect when we 'recurse' in and out
         # of a symbol/intermediate so that we can process both up and down. Also,
@@ -134,7 +170,7 @@ class ForestVisitor(object):
         # We do not use recursion here to walk the Forest due to the limited
         # stack size in python. Therefore input_stack is essentially our stack.
-        input_stack = deque([self.root])
+        input_stack = deque([root])

         # It is much faster to cache these as locals since they are called
         # many times in large parses.
@@ -170,8 +206,8 @@ class ForestVisitor(object):
             current_id = id(current)
             if current_id in visiting:
-                if isinstance(current, PackedNode): vpno(current)
-                else: vsno(current)
+                if isinstance(current, PackedNode): vpno(current)
+                else: vsno(current)
                 input_stack.pop()
                 visiting.remove(current_id)
                 continue
@@ -214,7 +250,7 @@ class ForestSumVisitor(ForestVisitor):
     def visit_symbol_node_out(self, node):
         node.priority = max(child.priority for child in node.children)
-        node.children = sorted(node.children, reverse = True)
+        node._children = sorted(node.children, reverse = True)

 class ForestAntiscoreSumVisitor(ForestSumVisitor):
     """
@@ -228,7 +264,7 @@ class ForestAntiscoreSumVisitor(ForestSumVisitor):
     """
     def visit_symbol_node_out(self, node):
         node.priority = min(child.priority for child in node.children)
-        node.children = sorted(node.children, key=AntiscoreSumComparator, reverse = True)
+        node._children = sorted(node.children, key=AntiscoreSumComparator, reverse = True)
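Both sum visitors now assign the sorted result to the `_children` slot, since `children` has become a read-only property that would otherwise re-trigger `load_paths`. The comparator trick behind ForestAntiscoreSumVisitor is an inverted sort key: wrapping each child so that `sorted(..., reverse=True)` ends up preferring the lowest priority. A sketch of such a wrapper (illustrative, not lark's exact class):

    class InvertedKey(object):
        def __init__(self, child):
            self.child = child

        def __lt__(self, other):
            # Inverted comparison: "smaller" means higher priority, so
            # sorted(children, key=InvertedKey, reverse=True) puts the
            # lowest-priority child first.
            return self.child.priority > other.child.priority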
 class AntiscoreSumComparator(object):
     """
@@ -263,19 +299,21 @@ class ForestToTreeVisitor(ForestVisitor):
     implementation should be another ForestVisitor which sorts the children
     according to some priority mechanism.
     """
-    def __init__(self, root, forest_sum_visitor = ForestSumVisitor, callbacks = None):
-        super(ForestToTreeVisitor, self).__init__(root)
-        self.forest_sum_visitor = forest_sum_visitor
-        self.output_stack = deque()
+    __slots__ = ['forest_sum_visitor', 'output_stack', 'callbacks']
+    def __init__(self, forest_sum_visitor = ForestSumVisitor, callbacks = None):
+        self.forest_sum_visitor = forest_sum_visitor()
         self.callbacks = callbacks
-        self.result = None
+
+    def go(self, root):
+        self.output_stack = deque()
+        return super(ForestToTreeVisitor, self).go(root)

     def visit_token_node(self, node):
         self.output_stack[-1].append(node)

     def visit_symbol_node_in(self, node):
         if node.is_ambiguous and node.priority is None:
-            self.forest_sum_visitor(node).go()
+            self.forest_sum_visitor.go(node)
         return next(iter(node.children))

     def visit_packed_node_in(self, node):
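The visitors now keep the root out of `__init__`, so a single instance can be constructed once (as `self.forest_tree_visitor` in Parser.__init__) and reused across parses; anything per-traversal is reset inside `go`. The pattern in isolation, with a toy node type:

    from collections import deque

    class ToyNode(object):
        def __init__(self, name, children=()):
            self.name = name
            self.children = list(children)

    class ReusableVisitor(object):
        __slots__ = ['result']

        def go(self, root):
            # Per-traversal state lives here, not in __init__.
            self.result = []
            stack = deque([root])
            while stack:
                node = stack.pop()
                self.result.append(node.name)
                stack.extend(node.children)
            return self.result

    v = ReusableVisitor()
    assert v.go(ToyNode('a', [ToyNode('b')])) == ['a', 'b']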
@@ -311,11 +349,13 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor):
     This is mainly used by the test framework, to make it simpler to write
     tests ensuring the SPPF contains the right results.
     """
-    def __init__(self, root, callbacks):
-        super(ForestToAmbiguousTreeVisitor, self).__init__(root)
-        self.output_stack = deque()
+    __slots__ = ['output_stack', 'callbacks']
+    def __init__(self, callbacks):
         self.callbacks = callbacks
-        self.result = None
+
+    def go(self, root):
+        self.output_stack = deque([])
+        return super(ForestToAmbiguousTreeVisitor, self).go(root)

     def visit_token_node(self, node):
         self.output_stack[-1].children.append(node)

@@ -326,7 +366,7 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor):
         return iter(node.children)

     def visit_symbol_node_out(self, node):
-        if node.is_ambiguous:
+        if not node.is_intermediate and node.is_ambiguous:
             result = self.output_stack.pop()
             if self.output_stack:
                 self.output_stack[-1].children.append(result)

@@ -347,4 +387,78 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor):
         if self.output_stack:
             self.output_stack[-1].children.append(result)
         else:
-            self.result = result
+            self.result = result
+
+class ForestToPyDotVisitor(ForestVisitor):
+    """
+    A Forest visitor which writes the SPPF to a PNG.
+
+    The SPPF can get really large, really quickly because
+    of the amount of meta-data it stores, so this is probably
+    only useful for trivial trees and learning how the SPPF
+    is structured.
+    """
+    def __init__(self, rankdir="TB"):
+        self.pydot = import_module('pydot')
+        self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir)
+
+    def go(self, root, filename):
+        super(ForestToPyDotVisitor, self).go(root)
+        self.graph.write_png(filename)
+
+    def visit_token_node(self, node):
+        graph_node_id = str(id(node))
+        graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"'))
+        graph_node_color = 0x808080
+        graph_node_style = "\"filled,rounded\""
+        graph_node_shape = "diamond"
+        graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
+        self.graph.add_node(graph_node)
+
+    def visit_packed_node_in(self, node):
+        graph_node_id = str(id(node))
+        graph_node_label = repr(node)
+        graph_node_color = 0x808080
+        graph_node_style = "filled"
+        graph_node_shape = "diamond"
+        graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
+        self.graph.add_node(graph_node)
+        return iter([node.left, node.right])
+
+    def visit_packed_node_out(self, node):
+        graph_node_id = str(id(node))
+        graph_node = self.graph.get_node(graph_node_id)[0]
+        for child in [node.left, node.right]:
+            if child is not None:
+                child_graph_node_id = str(id(child))
+                child_graph_node = self.graph.get_node(child_graph_node_id)[0]
+                self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
+            else:
+                #### Try and be above the Python object ID range; probably impl. specific, but maybe this is okay.
+                child_graph_node_id = str(randint(100000000000000000000000000000,123456789012345678901234567890))
+                child_graph_node_style = "invis"
+                child_graph_node = self.pydot.Node(child_graph_node_id, style=child_graph_node_style, label="None")
+                child_edge_style = "invis"
+                self.graph.add_node(child_graph_node)
+                self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node, style=child_edge_style))
+
+    def visit_symbol_node_in(self, node):
+        graph_node_id = str(id(node))
+        graph_node_label = repr(node)
+        graph_node_color = 0x808080
+        graph_node_style = "\"filled\""
+        if node.is_intermediate:
+            graph_node_shape = "ellipse"
+        else:
+            graph_node_shape = "rectangle"
+        graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
+        self.graph.add_node(graph_node)
+        return iter(node.children)
+
+    def visit_symbol_node_out(self, node):
+        graph_node_id = str(id(node))
+        graph_node = self.graph.get_node(graph_node_id)[0]
+        for child in node.children:
+            child_graph_node_id = str(id(child))
+            child_graph_node = self.graph.get_node(child_graph_node_id)[0]
+            self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
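Rendering requires pydot (and Graphviz) to be installed; the class loads it lazily via import_module so the dependency stays optional. Usage sketch, where `root` is an SPPF SymbolNode such as `solutions[0]` above:

    visitor = ForestToPyDotVisitor()
    visitor.go(root, "sppf.png")   # writes the rendered forest to disk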
lark/parsers/xearley.py
@@ -22,7 +22,8 @@ from ..exceptions import ParseError, UnexpectedCharacters
 from ..lexer import Token
 from .grammar_analysis import GrammarAnalyzer
 from ..grammar import NonTerminal, Terminal
-from .earley_common import Column, Item
+from .earley import ApplyCallbacks
+from .earley_common import Item, TransitiveItem
 from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor

@@ -31,11 +32,11 @@ class Parser:
         analysis = GrammarAnalyzer(parser_conf)
         self.parser_conf = parser_conf
         self.resolve_ambiguity = resolve_ambiguity
-        self.forest_sum_visitor = forest_sum_visitor
         self.ignore = [Terminal(t) for t in ignore]
         self.complete_lex = complete_lex

         self.FIRST = analysis.FIRST
+        self.NULLABLE = analysis.NULLABLE
         self.callbacks = {}
         self.predictions = {}

@@ -43,10 +44,12 @@ class Parser:
         # the slow 'isupper' in is_terminal.
         self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
         self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }

         for rule in parser_conf.rules:
             self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None)
             self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

+        self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks)
         self.term_matcher = term_matcher

     def parse(self, stream, start_symbol=None):
@@ -60,19 +63,74 @@ class Parser:
         # Cache for nodes & tokens created in a particular parse step.
         node_cache = {}
         token_cache = {}
+        columns = []
+        transitives = []

         text_line = 1
         text_column = 1

-        def make_symbol_node(s, start, end):
-            label = (s, start.i, end.i)
-            if label in node_cache:
-                node = node_cache[label]
-            else:
-                node = node_cache[label] = SymbolNode(s, start, end)
-            return node
+        def is_quasi_complete(item):
+            if item.is_complete:
+                return True
+            quasi = item.advance()
+            while not quasi.is_complete:
+                symbol = quasi.expect
+                if symbol not in self.NULLABLE:
+                    return False
+                if quasi.rule.origin == start_symbol and symbol == start_symbol:
+                    return False
+                quasi = quasi.advance()
+            return True
+
+        def create_leo_transitives(item, trule, previous, visited = None):
+            if visited is None:
+                visited = set()
+
+            if item.rule.origin in transitives[item.start]:
+                previous = trule = transitives[item.start][item.rule.origin]
+                return trule, previous
+
+            is_empty_rule = not self.FIRST[item.rule.origin]
+            if is_empty_rule:
+                return trule, previous
+
+            originator = None
+            for key in columns[item.start]:
+                if key.expect is not None and key.expect == item.rule.origin:
+                    if originator is not None:
+                        return trule, previous
+                    originator = key
+
+            if originator is None:
+                return trule, previous
+
+            if originator in visited:
+                return trule, previous
+
+            visited.add(originator)
+            if not is_quasi_complete(originator):
+                return trule, previous
+
+            trule = originator.advance()
+            if originator.start != item.start:
+                visited.clear()
+
+            trule, previous = create_leo_transitives(originator, trule, previous, visited)
+            if trule is None:
+                return trule, previous
+
+            titem = None
+            if previous is not None:
+                titem = TransitiveItem(item.rule.origin, trule, originator, previous.column)
+                previous.next_titem = titem
+            else:
+                titem = TransitiveItem(item.rule.origin, trule, originator, item.start)
+
+            previous = transitives[item.start][item.rule.origin] = titem
+            return trule, previous
-        def predict_and_complete(column, to_scan):
+        def predict_and_complete(i, to_scan):
             """The core Earley Predictor and Completer.

             At each stage of the input, we handle any completed items (things
@@ -82,61 +140,90 @@ class Parser:
             which can be added to the scan list for the next scanner cycle."""
             held_completions.clear()
+            column = columns[i]
             # R (items) = Ei (column.items)
-            items = deque(column.items)
+            items = deque(column)
             while items:
                 item = items.pop()    # remove an element, A say, from R

                 ### The Earley completer
                 if item.is_complete:   ### (item.s == string)
                     if item.node is None:
-                        item.node = make_symbol_node(item.s, item.start, column)
+                        label = (item.s, item.start, i)
+                        item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                         item.node.add_family(item.s, item.rule, item.start, None, None)

-                    # Empty has 0 length. If we complete an empty symbol in a particular
-                    # parse step, we need to be able to use that same empty symbol to complete
-                    # any predictions that result, that themselves require empty. Avoids
-                    # infinite recursion on empty symbols.
-                    # held_completions is 'H' in E.Scott's paper.
-                    is_empty_item = item.start.i == column.i
-                    if is_empty_item:
-                        held_completions[item.rule.origin] = item.node
-
-                    originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
-                    for originator in originators:
-                        new_item = originator.advance()
-                        new_item.node = make_symbol_node(new_item.s, originator.start, column)
-                        new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
+                    create_leo_transitives(item, None, None)
+
+                    ###R Joop Leo right recursion Completer
+                    if item.rule.origin in transitives[item.start]:
+                        transitive = transitives[item.start][item.s]
+                        if transitive.previous in transitives[transitive.column]:
+                            root_transitive = transitives[transitive.column][transitive.previous]
+                        else:
+                            root_transitive = transitive
+
+                        label = (root_transitive.s, root_transitive.start, i)
+                        node = vn = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
+                        vn.add_path(root_transitive, item.node)
+
+                        new_item = Item(transitive.rule, transitive.ptr, transitive.start)
+                        new_item.node = vn
                         if new_item.expect in self.TERMINALS:
                             # Add (B :: aC.B, h, y) to Q
                             to_scan.add(new_item)
-                        elif new_item not in column.items:
+                        elif new_item not in column:
                             # Add (B :: aC.B, h, y) to Ei and R
                             column.add(new_item)
                             items.append(new_item)
+                    ###R Regular Earley completer
+                    else:
+                        # Empty has 0 length. If we complete an empty symbol in a particular
+                        # parse step, we need to be able to use that same empty symbol to complete
+                        # any predictions that result, that themselves require empty. Avoids
+                        # infinite recursion on empty symbols.
+                        # held_completions is 'H' in E.Scott's paper.
+                        is_empty_item = item.start == i
+                        if is_empty_item:
+                            held_completions[item.rule.origin] = item.node
+
+                        originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
+                        for originator in originators:
+                            new_item = originator.advance()
+                            label = (new_item.s, originator.start, i)
+                            new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
+                            new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
+                            if new_item.expect in self.TERMINALS:
+                                # Add (B :: aC.B, h, y) to Q
+                                to_scan.add(new_item)
+                            elif new_item not in column:
+                                # Add (B :: aC.B, h, y) to Ei and R
+                                column.add(new_item)
+                                items.append(new_item)

                 ### The Earley predictor
                 elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
                     new_items = []
                     for rule in self.predictions[item.expect]:
-                        new_item = Item(rule, 0, column)
+                        new_item = Item(rule, 0, i)
                         new_items.append(new_item)

                     # Process any held completions (H).
                     if item.expect in held_completions:
                         new_item = item.advance()
-                        new_item.node = make_symbol_node(new_item.s, item.start, column)
+                        label = (new_item.s, item.start, i)
+                        new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                         new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
                         new_items.append(new_item)

                     for new_item in new_items:
                         if new_item.expect in self.TERMINALS:
                             to_scan.add(new_item)
-                        elif new_item not in column.items:
+                        elif new_item not in column:
                             column.add(new_item)
                             items.append(new_item)
-        def scan(i, column, to_scan):
+        def scan(i, to_scan):
             """The core Earley Scanner.

             This is a custom implementation of the scanner that uses the
@@ -155,7 +242,7 @@ class Parser:
                 m = match(item.expect, stream, i)
                 if m:
                     t = Token(item.expect.name, m.group(0), i, text_line, text_column)
-                    delayed_matches[m.end()].append( (item, column, t) )
+                    delayed_matches[m.end()].append( (item, i, t) )

                     if self.complete_lex:
                         s = m.group(0)
@@ -163,7 +250,7 @@ class Parser:
                             m = match(item.expect, s[:-j])
                             if m:
                                 t = Token(item.expect.name, m.group(0), i, text_line, text_column)
-                                delayed_matches[i+m.end()].append( (item, column, t) )
+                                delayed_matches[i+m.end()].append( (item, i, t) )

             # Remove any items that successfully matched in this pass from the to_scan buffer.
             # This ensures we don't carry over tokens that already matched, if we're ignoring below.
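`delayed_matches` (keyed by end position) is what lets this scannerless scanner handle multi-character terminals: a match found while processing position i is parked under its end offset and only consumed when the main loop reaches it. The queue discipline, as a toy sketch:

    import re
    from collections import defaultdict

    # Toy version of the delayed-match queue: matches are filed under
    # the position where they end, not where they start.
    delayed_matches = defaultdict(list)
    stream = 'abab'
    for i in range(len(stream)):
        m = re.compile('ab').match(stream, i)
        if m:
            delayed_matches[m.end()].append((i, m.group(0)))

    assert delayed_matches[2] == [(0, 'ab')]
    assert delayed_matches[4] == [(2, 'ab')]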
@@ -177,13 +264,16 @@ class Parser:
                 m = match(x, stream, i)
                 if m:
                     # Carry over any items still in the scan buffer, to past the end of the ignored items.
-                    delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ])
+                    delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ])

                     # If we're ignoring up to the end of the file, carry over the start symbol if it already completed.
-                    delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol])
+                    delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])

-            next_set = Column(i + 1, self.FIRST) # Ei+1
             next_to_scan = set()
+            next_set = set()
+            columns.append(next_set)
+            next_transitives = dict()
+            transitives.append(next_transitives)

             ## 4) Process Tokens from delayed_matches.
             # This is the core of the Earley scanner. Create an SPPF node for each Token,
@@ -193,7 +283,8 @@ class Parser:
             for item, start, token in delayed_matches[i+1]:
                 if token is not None:
                     new_item = item.advance()
-                    new_item.node = make_symbol_node(new_item.s, new_item.start, column)
+                    label = (new_item.s, new_item.start, i)
+                    new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
                     new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)
                 else:
                     new_item = item

@@ -210,11 +301,11 @@ class Parser:
             if not next_set and not delayed_matches and not next_to_scan:
                 raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan))

-            return next_set, next_to_scan
+            return next_to_scan

         # Main loop starts
-        column0 = Column(0, self.FIRST)
-        column = column0
+        columns.append(set())
+        transitives.append(dict())

         ## The scan buffer. 'Q' in E.Scott's paper.
         to_scan = set()
@@ -223,38 +314,41 @@ class Parser:
         # Add predicted items to the first Earley set (for the predictor) if they
         # result in a non-terminal, or the scanner if they result in a terminal.
         for rule in self.predictions[start_symbol]:
-            item = Item(rule, 0, column0)
+            item = Item(rule, 0, 0)
             if item.expect in self.TERMINALS:
                 to_scan.add(item)
             else:
-                column.add(item)
+                columns[0].add(item)

         ## The main Earley loop.
         # Run the Prediction/Completion cycle for any Items in the current Earley set.
         # Completions will be added to the SPPF tree, and predictions will be recursively
         # processed down to terminals/empty nodes to be added to the scanner for the next
         # step.
-        for i, token in enumerate(stream):
-            predict_and_complete(column, to_scan)
+        i = 0
+        for token in stream:
+            predict_and_complete(i, to_scan)

             # Clear the node_cache and token_cache, which are only relevant for each
             # step in the Earley pass.
             node_cache.clear()
             token_cache.clear()
-            column, to_scan = scan(i, column, to_scan)
+            node_cache.clear()
+            to_scan = scan(i, to_scan)

             if token == '\n':
                 text_line += 1
                 text_column = 1
             else:
                 text_column += 1
+            i += 1

-        predict_and_complete(column, to_scan)
+        predict_and_complete(i, to_scan)

         ## Column is now the final column in the parse. If the parse was successful, the start
         # symbol should have been completed in the last step of the Earley cycle, and will be in
         # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
-        solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]
+        solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]

         if not solutions:
             expected_tokens = [t.expect for t in to_scan]

@@ -265,9 +359,8 @@ class Parser:
         ## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller.
         # This means the caller can work directly with the SPPF tree.
         if not self.resolve_ambiguity:
-            return ForestToAmbiguousTreeVisitor(solutions[0], self.callbacks).go()
+            return ForestToAmbiguousTreeVisitor(self.callbacks).go(solutions[0])

         # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities
         # according to the rules.
-        return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go()
+        return self.forest_tree_visitor.go(solutions[0])
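The refactored ForestToAmbiguousTreeVisitor is what backs `ambiguity='explicit'`: instead of resolving, it surfaces each ambiguity as an `_ambig` node in the returned tree. A usage sketch with a deliberately ambiguous grammar (illustrative):

    from lark import Lark

    parser = Lark('''
        start: x x
        x: "a"
         | "aa"
    ''', parser='earley', ambiguity='explicit')

    # "aaa" splits as a+aa or aa+a; the tree contains an _ambig node
    # holding both derivations.
    print(parser.parse('aaa').pretty())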