Browse Source

Added Chris' changes, Dec 2018

Merge remote-tracking branch 'origin/0.7b' into 0.7b
tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.6.6
Erez Shinan 6 years ago
parent
commit
c968e212ff
4 changed files with 466 additions and 166 deletions
  1. +144
    -46
      lark/parsers/earley.py
  2. +32
    -37
      lark/parsers/earley_common.py
  3. +148
    -34
      lark/parsers/earley_forest.py
  4. +142
    -49
      lark/parsers/xearley.py

+ 144
- 46
lark/parsers/earley.py View File

@@ -16,17 +16,19 @@ from ..visitors import Transformer_InPlace, v_args
from ..exceptions import ParseError, UnexpectedToken
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Column, Item
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor
from .earley_common import Item, TransitiveItem
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode

from collections import deque, defaultdict

class Parser:
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor):
analysis = GrammarAnalyzer(parser_conf)
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
self.forest_sum_visitor = forest_sum_visitor

self.FIRST = analysis.FIRST
self.NULLABLE = analysis.NULLABLE
self.callbacks = {}
self.predictions = {}

@@ -39,6 +41,7 @@ class Parser:
self.callbacks[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks)
self.term_matcher = term_matcher


@@ -46,19 +49,78 @@ class Parser:
# Define parser functions
start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
match = self.term_matcher
held_completions = defaultdict(list)

# Held Completions (H in E. Scott's paper).
held_completions = {}

# Cache for nodes & tokens created in a particular parse step.
node_cache = {}
token_cache = {}

def make_symbol_node(s, start, end):
label = (s, start.i, end.i)
if label in node_cache:
node = node_cache[label]
columns = []
transitives = []

def is_quasi_complete(item):
    """Tell whether `item` is complete, or would be once one more symbol is matched.

    Returns True when `item` is already complete, or when every symbol
    remaining after the one `item` currently expects is in self.NULLABLE.
    Returns False as soon as a remaining symbol is not nullable, or is the
    start symbol inside a rule whose origin is also the start symbol.
    """
    if item.is_complete:
        return True

    # Step over the symbol the item is currently waiting for, then inspect the rest.
    candidate = item.advance()
    while True:
        if candidate.is_complete:
            return True
        expected = candidate.expect
        if expected not in self.NULLABLE or (candidate.rule.origin == start_symbol and expected == start_symbol):
            return False
        candidate = candidate.advance()

def create_leo_transitives(item, trule, previous, visited = None):
if visited is None:
visited = set()

if item.rule.origin in transitives[item.start]:
previous = trule = transitives[item.start][item.rule.origin]
return trule, previous

is_empty_rule = not self.FIRST[item.rule.origin]
if is_empty_rule:
return trule, previous

originator = None
for key in columns[item.start]:
if key.expect is not None and key.expect == item.rule.origin:
if originator is not None:
return trule, previous
originator = key

if originator is None:
return trule, previous

if originator in visited:
return trule, previous

visited.add(originator)
if not is_quasi_complete(originator):
return trule, previous

trule = originator.advance()
if originator.start != item.start:
visited.clear()

trule, previous = create_leo_transitives(originator, trule, previous, visited)
if trule is None:
return trule, previous

titem = None
if previous is not None:
titem = TransitiveItem(item.rule.origin, trule, originator, previous.column)
previous.next_titem = titem
else:
node = node_cache[label] = SymbolNode(s, start, end)
return node
titem = TransitiveItem(item.rule.origin, trule, originator, item.start)

previous = transitives[item.start][item.rule.origin] = titem
return trule, previous

def predict_and_complete(column, to_scan):
def predict_and_complete(i, to_scan):
"""The core Earley Predictor and Completer.

At each stage of the input, we handle any completed items (things
@@ -68,61 +130,90 @@ class Parser:
which can be added to the scan list for the next scanner cycle."""
held_completions.clear()

column = columns[i]
# R (items) = Ei (column.items)
items = deque(column.items)
items = deque(column)
while items:
item = items.pop() # remove an element, A say, from R

### The Earley completer
if item.is_complete: ### (item.s == string)
if item.node is None:
item.node = make_symbol_node(item.s, item.start, column)
label = (item.s, item.start, i)
item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
item.node.add_family(item.s, item.rule, item.start, None, None)

# Empty has 0 length. If we complete an empty symbol in a particular
# parse step, we need to be able to use that same empty symbol to complete
# any predictions that result, that themselves require empty. Avoids
# infinite recursion on empty symbols.
# held_completions is 'H' in E.Scott's paper.
is_empty_item = item.start.i == column.i
if is_empty_item:
held_completions[item.rule.origin] = item.node

originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
for originator in originators:
new_item = originator.advance()
new_item.node = make_symbol_node(new_item.s, originator.start, column)
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
create_leo_transitives(item, None, None)

###R Joop Leo right recursion Completer
if item.rule.origin in transitives[item.start]:
transitive = transitives[item.start][item.s]
if transitive.previous in transitives[transitive.column]:
root_transitive = transitives[transitive.column][transitive.previous]
else:
root_transitive = transitive

label = (root_transitive.s, root_transitive.start, i)
node = vn = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
vn.add_path(root_transitive, item.node)

new_item = Item(transitive.rule, transitive.ptr, transitive.start)
new_item.node = vn
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
elif new_item not in column.items:
elif new_item not in column:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)
###R Regular Earley completer
else:
# Empty has 0 length. If we complete an empty symbol in a particular
# parse step, we need to be able to use that same empty symbol to complete
# any predictions that result, that themselves require empty. Avoids
# infinite recursion on empty symbols.
# held_completions is 'H' in E.Scott's paper.
is_empty_item = item.start == i
if is_empty_item:
held_completions[item.rule.origin] = item.node

originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
for originator in originators:
new_item = originator.advance()
label = (new_item.s, originator.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
elif new_item not in column:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)

### The Earley predictor
elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
new_items = []
for rule in self.predictions[item.expect]:
new_item = Item(rule, 0, column)
new_item = Item(rule, 0, i)
new_items.append(new_item)

# Process any held completions (H).
if item.expect in held_completions:
new_item = item.advance()
new_item.node = make_symbol_node(new_item.s, item.start, column)
label = (new_item.s, item.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
new_items.append(new_item)

for new_item in new_items:
if new_item.expect in self.TERMINALS:
to_scan.add(new_item)
elif new_item not in column.items:
elif new_item not in column:
column.add(new_item)
items.append(new_item)

def scan(i, token, column, to_scan):
def scan(i, token, to_scan):
"""The core Earley Scanner.

This is a custom implementation of the scanner that uses the
@@ -130,12 +221,17 @@ class Parser:
Earley predictor, based on the previously completed tokens.
This ensures that at each phase of the parse we have a custom
lexer context, allowing for more complex ambiguities."""
next_set = Column(i+1, self.FIRST)
next_to_scan = set()
next_set = set()
columns.append(next_set)
next_transitives = dict()
transitives.append(next_transitives)

for item in set(to_scan):
if match(item.expect, token):
new_item = item.advance()
new_item.node = make_symbol_node(new_item.s, new_item.start, column)
label = (new_item.s, new_item.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)

if new_item.expect in self.TERMINALS:
@@ -149,11 +245,11 @@ class Parser:
expect = {i.expect.name for i in to_scan}
raise UnexpectedToken(token, expect, considered_rules = set(to_scan))

return next_set, next_to_scan
return next_to_scan

# Main loop starts
column0 = Column(0, self.FIRST)
column = column0
columns.append(set())
transitives.append(dict())

## The scan buffer. 'Q' in E.Scott's paper.
to_scan = set()
@@ -162,32 +258,34 @@ class Parser:
# Add predicted items to the first Earley set (for the predictor) if they
# result in a non-terminal, or the scanner if they result in a terminal.
for rule in self.predictions[start_symbol]:
item = Item(rule, 0, column0)
item = Item(rule, 0, 0)
if item.expect in self.TERMINALS:
to_scan.add(item)
else:
column.add(item)
columns[0].add(item)

## The main Earley loop.
# Run the Prediction/Completion cycle for any Items in the current Earley set.
# Completions will be added to the SPPF tree, and predictions will be recursively
# processed down to terminals/empty nodes to be added to the scanner for the next
# step.
for i, token in enumerate(stream):
predict_and_complete(column, to_scan)
i = 0
for token in stream:
predict_and_complete(i, to_scan)

# Clear the node_cache and token_cache, which are only relevant for each
# step in the Earley pass.
node_cache.clear()
token_cache.clear()
column, to_scan = scan(i, token, column, to_scan)
to_scan = scan(i, token, to_scan)
i += 1

predict_and_complete(column, to_scan)
predict_and_complete(i, to_scan)

## Column is now the final column in the parse. If the parse was successful, the start
# symbol should have been completed in the last step of the Earley cycle, and will be in
# this column. Find the item for the start_symbol, which is the root of the SPPF tree.
solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]
solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]

if not solutions:
raise ParseError('Incomplete parse: Could not find a solution to input')
@@ -201,7 +299,7 @@ class Parser:

# ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities
# according to the rules.
return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go()
return self.forest_tree_visitor.go(solutions[0])

class ApplyCallbacks(Transformer_InPlace):
def __init__(self, postprocess):


+ 32
- 37
lark/parsers/earley_common.py View File

@@ -13,27 +13,12 @@
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com

## for recursive repr
from ..tree import Tree

class Derivation(Tree):
def __init__(self, rule, children = None):
Tree.__init__(self, 'drv', children if children is not None else [])
self.meta.rule = rule
self._hash = None

def __repr__(self, indent = 0):
return 'Derivation(%s, %s, %s)' % (self.data, self.rule.origin, '...')

def __hash__(self):
if self._hash is None:
self._hash = Tree.__hash__(self)
return self._hash
from ..grammar import NonTerminal, Terminal

class Item(object):
"An Earley Item, the atom of the algorithm."

__slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash')
__slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash')
def __init__(self, rule, ptr, start):
self.is_complete = len(rule.expansion) == ptr
self.rule = rule # rule
@@ -43,38 +28,48 @@ class Item(object):
if self.is_complete:
self.s = rule.origin
self.expect = None
self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
else:
self.s = (rule, ptr)
self.expect = rule.expansion[ptr]
self._hash = hash((self.s, self.start.i))
self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
self._hash = hash((self.s, self.start))

def advance(self):
return self.__class__(self.rule, self.ptr + 1, self.start)
return Item(self.rule, self.ptr + 1, self.start)

def __eq__(self, other):
return self is other or (self.s == other.s and self.start.i == other.start.i)
return self is other or (self.s == other.s and self.start == other.start)

def __hash__(self):
return self._hash

def __repr__(self):
return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i)

class Column:
"An entry in the table, aka Earley Chart. Contains lists of items."
def __init__(self, i, FIRST):
self.i = i
self.items = set()
self.FIRST = FIRST
before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] )
after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] )
symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after))
return '%s (%d)' % (symbol, self.start)


class TransitiveItem(Item):
__slots__ = ('recognized', 'reduction', 'column', 'next_titem')
def __init__(self, recognized, trule, originator, start):
    """Build a transitive item from the rule, pointer and start of `trule`.

    recognized -- stored as `self.recognized`; it also takes part in the hash.
    trule      -- item whose `rule`, `ptr` and `start` initialise the base Item.
    originator -- stored as `self.reduction`.
    start      -- stored as `self.column`.
    """
    base = super(TransitiveItem, self)
    base.__init__(trule.rule, trule.ptr, trule.start)
    self.next_titem = None
    self.column = start
    self.reduction = originator
    self.recognized = recognized
    # Replace the base-class hash so that `recognized` is part of the identity.
    self._hash = hash((self.s, self.start, recognized))

def add(self, item):
"""Sort items into scan/predict/reduce newslists

Makes sure only unique items are added.
"""
self.items.add(item)
def __eq__(self, other):
    """Compare with another TransitiveItem on `s` (and its type), `start` and `recognized`.

    Anything that is not a TransitiveItem compares unequal.
    """
    if not isinstance(other, TransitiveItem):
        return False
    if self is other:
        return True
    if type(self.s) != type(other.s):
        return False
    return self.s == other.s and self.start == other.start and self.recognized == other.recognized

def __bool__(self):
return bool(self.items)
def __hash__(self):
return self._hash

__nonzero__ = __bool__ # Py2 backwards-compatibility
def __repr__(self):
    """Render as 'recognized : origin -> before* after (column, start)'."""
    symbols = self.rule.expansion
    seen = ' '.join(sym.name for sym in symbols[:self.ptr])
    pending = ' '.join(sym.name for sym in symbols[self.ptr:])
    return '{} : {} -> {}* {} ({}, {})'.format(self.recognized.name, self.rule.origin.name, seen, pending, self.column, self.start)

+ 148
- 34
lark/parsers/earley_forest.py View File

@@ -7,14 +7,15 @@ Full reference and more details is here:
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
"""

from random import randint
from ..tree import Tree
from ..exceptions import ParseError
from ..lexer import Token
from ..utils import Str
from ..grammar import NonTerminal, Terminal
from .earley_common import Column, Derivation
from ..grammar import NonTerminal, Terminal, Symbol

from collections import deque
from importlib import import_module

class ForestNode(object):
pass
@@ -33,36 +34,65 @@ class SymbolNode(ForestNode):

Hence a Symbol Node with a single child is unambiguous.
"""
__slots__ = ('s', 'start', 'end', 'children', 'priority', 'is_intermediate')
__slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash')
def __init__(self, s, start, end):
self.s = s
self.start = start
self.end = end
self.children = set()
self._children = set()
self.paths = set()
self.paths_loaded = False
self.priority = None
self.is_intermediate = isinstance(s, tuple)
self._hash = hash((self.s, self.start, self.end))

def add_family(self, lr0, rule, start, left, right):
self.children.add(PackedNode(self, lr0, rule, start, left, right))
self._children.add(PackedNode(self, lr0, rule, start, left, right))

def add_path(self, transitive, node):
    """Record a (transitive, node) pair in `self.paths`; `load_paths` turns these into families later."""
    entry = (transitive, node)
    self.paths.add(entry)

def load_paths(self):
    """Turn every recorded (transitive, node) path into a family of this node.

    For a transitive item that has a `next_titem`, a new SymbolNode ending at
    `self.end` is created for that next item, given the path, and used as the
    right child; otherwise `node` itself is the right child. The left child
    and rule come from the transitive item's `reduction`. Sets
    `paths_loaded` once all paths have been processed.
    """
    for transitive, node in self.paths:
        successor = transitive.next_titem
        if successor is None:
            right = node
        else:
            right = SymbolNode(successor.s, successor.start, self.end)
            right.add_path(successor, node)
        reduction = transitive.reduction
        self.add_family(reduction.rule.origin, reduction.rule, reduction.start, reduction.node, right)
    self.paths_loaded = True

@property
def is_ambiguous(self):
    """True when this node has more than one child."""
    family_count = len(self.children)
    return family_count > 1

@property
def children(self):
    """Return `_children`, calling `load_paths()` first if `paths_loaded` is still false."""
    if self.paths_loaded:
        return self._children
    self.load_paths()
    return self._children

def __iter__(self):
return iter(self.children)
return iter(self._children)

def __eq__(self, other):
if not isinstance(other, SymbolNode):
return False
return self is other or (self.s == other.s and self.start == other.start and self.end is other.end)
return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.end is other.end)

def __hash__(self):
return hash((self.s, self.start.i, self.end.i))
return self._hash

def __repr__(self):
symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name
return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0)
if self.is_intermediate:
rule = self.s[0]
ptr = self.s[1]
before = ( expansion.name for expansion in rule.expansion[:ptr] )
after = ( expansion.name for expansion in rule.expansion[ptr:] )
symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
else:
symbol = self.s.name
return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority if self.priority is not None else 0)

class PackedNode(ForestNode):
"""
@@ -77,7 +107,7 @@ class PackedNode(ForestNode):
self.left = left
self.right = right
self.priority = None
self._hash = hash((self.s, self.start.i, self.left, self.right))
self._hash = hash((self.s, self.start, self.left, self.right))

@property
def is_empty(self):
@@ -105,8 +135,15 @@ class PackedNode(ForestNode):
return self._hash

def __repr__(self):
symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name
return "{%s, %d, %s, %s, %s}" % (symbol, self.start.i, self.left, self.right, self.priority if self.priority is not None else 0)
if isinstance(self.s, tuple):
rule = self.s[0]
ptr = self.s[1]
before = ( expansion.name for expansion in rule.expansion[:ptr] )
after = ( expansion.name for expansion in rule.expansion[ptr:] )
symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
else:
symbol = self.s.name
return "({}, {}, {})".format(symbol, self.start, self.priority)

class ForestVisitor(object):
"""
@@ -114,9 +151,7 @@ class ForestVisitor(object):

Use this as a base when you need to walk the forest.
"""
def __init__(self, root):
self.root = root
self.result = None
__slots__ = ['result']

def visit_token_node(self, node): pass
def visit_symbol_node_in(self, node): pass
@@ -124,7 +159,8 @@ class ForestVisitor(object):
def visit_packed_node_in(self, node): pass
def visit_packed_node_out(self, node): pass

def go(self):
def go(self, root):
self.result = None
# Visiting is a list of IDs of all symbol/intermediate nodes currently in
# the stack. It serves two purposes: to detect when we 'recurse' in and out
# of a symbol/intermediate so that we can process both up and down. Also,
@@ -134,7 +170,7 @@ class ForestVisitor(object):

# We do not use recursion here to walk the Forest due to the limited
# stack size in python. Therefore input_stack is essentially our stack.
input_stack = deque([self.root])
input_stack = deque([root])

# It is much faster to cache these as locals since they are called
# many times in large parses.
@@ -170,8 +206,8 @@ class ForestVisitor(object):

current_id = id(current)
if current_id in visiting:
if isinstance(current, PackedNode): vpno(current)
else: vsno(current)
if isinstance(current, PackedNode): vpno(current)
else: vsno(current)
input_stack.pop()
visiting.remove(current_id)
continue
@@ -214,7 +250,7 @@ class ForestSumVisitor(ForestVisitor):

def visit_symbol_node_out(self, node):
node.priority = max(child.priority for child in node.children)
node.children = sorted(node.children, reverse = True)
node._children = sorted(node.children, reverse = True)

class ForestAntiscoreSumVisitor(ForestSumVisitor):
"""
@@ -228,7 +264,7 @@ class ForestAntiscoreSumVisitor(ForestSumVisitor):
"""
def visit_symbol_node_out(self, node):
node.priority = min(child.priority for child in node.children)
node.children = sorted(node.children, key=AntiscoreSumComparator, reverse = True)
node._children = sorted(node.children, key=AntiscoreSumComparator, reverse = True)

class AntiscoreSumComparator(object):
"""
@@ -263,19 +299,21 @@ class ForestToTreeVisitor(ForestVisitor):
implementation should be another ForestVisitor which sorts the children
according to some priority mechanism.
"""
def __init__(self, root, forest_sum_visitor = ForestSumVisitor, callbacks = None):
super(ForestToTreeVisitor, self).__init__(root)
self.forest_sum_visitor = forest_sum_visitor
self.output_stack = deque()
__slots__ = ['forest_sum_visitor', 'output_stack', 'callbacks']
def __init__(self, forest_sum_visitor = ForestSumVisitor, callbacks = None):
self.forest_sum_visitor = forest_sum_visitor()
self.callbacks = callbacks
self.result = None

def go(self, root):
    """Start from a fresh output stack, then run the inherited traversal from `root` and return its result."""
    self.output_stack = deque()
    walk = super(ForestToTreeVisitor, self).go
    return walk(root)

def visit_token_node(self, node):
self.output_stack[-1].append(node)

def visit_symbol_node_in(self, node):
if node.is_ambiguous and node.priority is None:
self.forest_sum_visitor(node).go()
self.forest_sum_visitor.go(node)
return next(iter(node.children))

def visit_packed_node_in(self, node):
@@ -311,11 +349,13 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor):
This is mainly used by the test framework, to make it simpler to write
tests ensuring the SPPF contains the right results.
"""
def __init__(self, root, callbacks):
super(ForestToAmbiguousTreeVisitor, self).__init__(root)
self.output_stack = deque()
__slots__ = ['output_stack', 'callbacks']
def __init__(self, callbacks):
self.callbacks = callbacks
self.result = None

def go(self, root):
    """Start from an empty output stack, then run the inherited traversal from `root` and return its result."""
    self.output_stack = deque()
    walk = super(ForestToAmbiguousTreeVisitor, self).go
    return walk(root)

def visit_token_node(self, node):
self.output_stack[-1].children.append(node)
@@ -326,7 +366,7 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor):
return iter(node.children)

def visit_symbol_node_out(self, node):
if node.is_ambiguous:
if not node.is_intermediate and node.is_ambiguous:
result = self.output_stack.pop()
if self.output_stack:
self.output_stack[-1].children.append(result)
@@ -347,4 +387,78 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor):
if self.output_stack:
self.output_stack[-1].children.append(result)
else:
self.result = result
self.result = result

class ForestToPyDotVisitor(ForestVisitor):
    """
    A Forest visitor which writes the SPPF to a PNG.

    The SPPF can get really large, really quickly because
    of the amount of meta-data it stores, so this is probably
    only useful for trivial trees and learning how the SPPF
    is structured.
    """
    def __init__(self, rankdir="TB"):
        # pydot is imported here rather than at module level, so the module
        # can be imported without pydot installed.
        self.pydot = import_module('pydot')
        self.graph = self.pydot.Dot(graph_type='digraph', rankdir=rankdir)

    def go(self, root, filename):
        """Walk the forest from `root`, then write the resulting graph to `filename` as a PNG."""
        super(ForestToPyDotVisitor, self).go(root)
        self.graph.write_png(filename)

    def _add_graph_node(self, node, label, style, shape):
        """Add a grey-filled graph node for `node`, named after id(node)."""
        graph_node = self.pydot.Node(str(id(node)), style=style, fillcolor="#{:06x}".format(0x808080), shape=shape, label=label)
        self.graph.add_node(graph_node)

    def visit_token_node(self, node):
        # Double quotes in the token text are escaped because the label is itself quoted.
        label = "\"{}\"".format(node.value.replace('"', '\\"'))
        self._add_graph_node(node, label, "\"filled,rounded\"", "diamond")

    def visit_packed_node_in(self, node):
        self._add_graph_node(node, repr(node), "filled", "diamond")
        return iter([node.left, node.right])

    def visit_packed_node_out(self, node):
        graph_node_id = str(id(node))
        graph_node = self.graph.get_node(graph_node_id)[0]
        for position, child in enumerate([node.left, node.right]):
            if child is not None:
                child_graph_node = self.graph.get_node(str(id(child)))[0]
                self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))
            else:
                # A missing child gets an invisible placeholder. Its name is built
                # from the parent's id and the child position, so it is unique and
                # deterministic. The "none_" prefix keeps it apart from the purely
                # numeric names of real nodes, and no random numbers are needed.
                placeholder_id = "none_{}_{}".format(graph_node_id, position)
                placeholder = self.pydot.Node(placeholder_id, style="invis", label="None")
                self.graph.add_node(placeholder)
                self.graph.add_edge(self.pydot.Edge(graph_node, placeholder, style="invis"))

    def visit_symbol_node_in(self, node):
        # Intermediate nodes are drawn as ellipses, all other symbol nodes as rectangles.
        shape = "ellipse" if node.is_intermediate else "rectangle"
        self._add_graph_node(node, repr(node), "\"filled\"", shape)
        return iter(node.children)

    def visit_symbol_node_out(self, node):
        graph_node = self.graph.get_node(str(id(node)))[0]
        for child in node.children:
            child_graph_node = self.graph.get_node(str(id(child)))[0]
            self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))

+ 142
- 49
lark/parsers/xearley.py View File

@@ -22,7 +22,8 @@ from ..exceptions import ParseError, UnexpectedCharacters
from ..lexer import Token
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal, Terminal
from .earley_common import Column, Item
from .earley import ApplyCallbacks
from .earley_common import Item, TransitiveItem
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor


@@ -31,11 +32,11 @@ class Parser:
analysis = GrammarAnalyzer(parser_conf)
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
self.forest_sum_visitor = forest_sum_visitor
self.ignore = [Terminal(t) for t in ignore]
self.complete_lex = complete_lex

self.FIRST = analysis.FIRST
self.NULLABLE = analysis.NULLABLE
self.callbacks = {}
self.predictions = {}

@@ -43,10 +44,12 @@ class Parser:
# the slow 'isupper' in is_terminal.
self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }

for rule in parser_conf.rules:
self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None)
self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

self.forest_tree_visitor = ForestToTreeVisitor(forest_sum_visitor, self.callbacks)
self.term_matcher = term_matcher

def parse(self, stream, start_symbol=None):
@@ -60,19 +63,74 @@ class Parser:
# Cache for nodes & tokens created in a particular parse step.
node_cache = {}
token_cache = {}
columns = []
transitives = []

text_line = 1
text_column = 1

def make_symbol_node(s, start, end):
label = (s, start.i, end.i)
if label in node_cache:
node = node_cache[label]
def is_quasi_complete(item):
    """Tell whether `item` is complete, or would be once one more symbol is matched.

    Returns True when `item` is already complete, or when every symbol
    remaining after the one `item` currently expects is in self.NULLABLE.
    Returns False as soon as a remaining symbol is not nullable, or is the
    start symbol inside a rule whose origin is also the start symbol.
    """
    if item.is_complete:
        return True

    # Step over the symbol the item is currently waiting for, then inspect the rest.
    candidate = item.advance()
    while True:
        if candidate.is_complete:
            return True
        expected = candidate.expect
        if expected not in self.NULLABLE or (candidate.rule.origin == start_symbol and expected == start_symbol):
            return False
        candidate = candidate.advance()

def create_leo_transitives(item, trule, previous, visited = None):
if visited is None:
visited = set()

if item.rule.origin in transitives[item.start]:
previous = trule = transitives[item.start][item.rule.origin]
return trule, previous

is_empty_rule = not self.FIRST[item.rule.origin]
if is_empty_rule:
return trule, previous

originator = None
for key in columns[item.start]:
if key.expect is not None and key.expect == item.rule.origin:
if originator is not None:
return trule, previous
originator = key

if originator is None:
return trule, previous

if originator in visited:
return trule, previous

visited.add(originator)
if not is_quasi_complete(originator):
return trule, previous

trule = originator.advance()
if originator.start != item.start:
visited.clear()

trule, previous = create_leo_transitives(originator, trule, previous, visited)
if trule is None:
return trule, previous

titem = None
if previous is not None:
titem = TransitiveItem(item.rule.origin, trule, originator, previous.column)
previous.next_titem = titem
else:
node = node_cache[label] = SymbolNode(s, start, end)
return node
titem = TransitiveItem(item.rule.origin, trule, originator, item.start)

previous = transitives[item.start][item.rule.origin] = titem
return trule, previous

def predict_and_complete(column, to_scan):
def predict_and_complete(i, to_scan):
"""The core Earley Predictor and Completer.

At each stage of the input, we handle any completed items (things
@@ -82,61 +140,90 @@ class Parser:
which can be added to the scan list for the next scanner cycle."""
held_completions.clear()

column = columns[i]
# R (items) = Ei (column.items)
items = deque(column.items)
items = deque(column)
while items:
item = items.pop() # remove an element, A say, from R

### The Earley completer
if item.is_complete: ### (item.s == string)
if item.node is None:
item.node = make_symbol_node(item.s, item.start, column)
label = (item.s, item.start, i)
item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
item.node.add_family(item.s, item.rule, item.start, None, None)

# Empty has 0 length. If we complete an empty symbol in a particular
# parse step, we need to be able to use that same empty symbol to complete
# any predictions that result, that themselves require empty. Avoids
# infinite recursion on empty symbols.
# held_completions is 'H' in E.Scott's paper.
is_empty_item = item.start.i == column.i
if is_empty_item:
held_completions[item.rule.origin] = item.node

originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
for originator in originators:
new_item = originator.advance()
new_item.node = make_symbol_node(new_item.s, originator.start, column)
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
create_leo_transitives(item, None, None)

###R Joop Leo right recursion Completer
if item.rule.origin in transitives[item.start]:
transitive = transitives[item.start][item.s]
if transitive.previous in transitives[transitive.column]:
root_transitive = transitives[transitive.column][transitive.previous]
else:
root_transitive = transitive

label = (root_transitive.s, root_transitive.start, i)
node = vn = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
vn.add_path(root_transitive, item.node)

new_item = Item(transitive.rule, transitive.ptr, transitive.start)
new_item.node = vn
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
elif new_item not in column.items:
elif new_item not in column:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)
###R Regular Earley completer
else:
# Empty has 0 length. If we complete an empty symbol in a particular
# parse step, we need to be able to use that same empty symbol to complete
# any predictions that result, that themselves require empty. Avoids
# infinite recursion on empty symbols.
# held_completions is 'H' in E.Scott's paper.
is_empty_item = item.start == i
if is_empty_item:
held_completions[item.rule.origin] = item.node

originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
for originator in originators:
new_item = originator.advance()
label = (new_item.s, originator.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
elif new_item not in column:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)

### The Earley predictor
elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
new_items = []
for rule in self.predictions[item.expect]:
new_item = Item(rule, 0, column)
new_item = Item(rule, 0, i)
new_items.append(new_item)

# Process any held completions (H).
if item.expect in held_completions:
new_item = item.advance()
new_item.node = make_symbol_node(new_item.s, item.start, column)
label = (new_item.s, item.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
new_items.append(new_item)

for new_item in new_items:
if new_item.expect in self.TERMINALS:
to_scan.add(new_item)
elif new_item not in column.items:
elif new_item not in column:
column.add(new_item)
items.append(new_item)

def scan(i, column, to_scan):
def scan(i, to_scan):
"""The core Earley Scanner.

This is a custom implementation of the scanner that uses the
@@ -155,7 +242,7 @@ class Parser:
m = match(item.expect, stream, i)
if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
delayed_matches[m.end()].append( (item, column, t) )
delayed_matches[m.end()].append( (item, i, t) )

if self.complete_lex:
s = m.group(0)
@@ -163,7 +250,7 @@ class Parser:
m = match(item.expect, s[:-j])
if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
delayed_matches[i+m.end()].append( (item, column, t) )
delayed_matches[i+m.end()].append( (item, i, t) )

# Remove any items that successfully matched in this pass from the to_scan buffer.
# This ensures we don't carry over tokens that already matched, if we're ignoring below.
@@ -177,13 +264,16 @@ class Parser:
m = match(x, stream, i)
if m:
# Carry over any items still in the scan buffer, to past the end of the ignored items.
delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ])
delayed_matches[m.end()].extend([(item, i, None) for item in to_scan ])

# If we're ignoring up to the end of the file, carry over the start symbol if it already completed.
delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol])
delayed_matches[m.end()].extend([(item, i, None) for item in columns[i] if item.is_complete and item.s == start_symbol])

next_set = Column(i + 1, self.FIRST) # Ei+1
next_to_scan = set()
next_set = set()
columns.append(next_set)
next_transitives = dict()
transitives.append(next_transitives)

## 4) Process Tokens from delayed_matches.
# This is the core of the Earley scanner. Create an SPPF node for each Token,
@@ -193,7 +283,8 @@ class Parser:
for item, start, token in delayed_matches[i+1]:
if token is not None:
new_item = item.advance()
new_item.node = make_symbol_node(new_item.s, new_item.start, column)
label = (new_item.s, new_item.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)
else:
new_item = item
@@ -210,11 +301,11 @@ class Parser:
if not next_set and not delayed_matches and not next_to_scan:
raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan))

return next_set, next_to_scan
return next_to_scan

# Main loop starts
column0 = Column(0, self.FIRST)
column = column0
columns.append(set())
transitives.append(dict())

## The scan buffer. 'Q' in E.Scott's paper.
to_scan = set()
@@ -223,38 +314,41 @@ class Parser:
# Add predicted items to the first Earley set (for the predictor) if they
# result in a non-terminal, or the scanner if they result in a terminal.
for rule in self.predictions[start_symbol]:
item = Item(rule, 0, column0)
item = Item(rule, 0, 0)
if item.expect in self.TERMINALS:
to_scan.add(item)
else:
column.add(item)
columns[0].add(item)

## The main Earley loop.
# Run the Prediction/Completion cycle for any Items in the current Earley set.
# Completions will be added to the SPPF tree, and predictions will be recursively
# processed down to terminals/empty nodes to be added to the scanner for the next
# step.
for i, token in enumerate(stream):
predict_and_complete(column, to_scan)
i = 0
for token in stream:
predict_and_complete(i, to_scan)

# Clear the node_cache and token_cache, which are only relevant for each
# step in the Earley pass.
node_cache.clear()
token_cache.clear()
column, to_scan = scan(i, column, to_scan)
node_cache.clear()
to_scan = scan(i, to_scan)

if token == '\n':
text_line += 1
text_column = 1
else:
text_column += 1
i += 1

predict_and_complete(column, to_scan)
predict_and_complete(i, to_scan)

## Column is now the final column in the parse. If the parse was successful, the start
# symbol should have been completed in the last step of the Earley cycle, and will be in
# this column. Find the item for the start_symbol, which is the root of the SPPF tree.
solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]
solutions = [n.node for n in columns[i] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]

if not solutions:
expected_tokens = [t.expect for t in to_scan]
@@ -265,9 +359,8 @@ class Parser:
## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller.
# This means the caller can work directly with the SPPF tree.
if not self.resolve_ambiguity:
return ForestToAmbiguousTreeVisitor(solutions[0], self.callbacks).go()
return ForestToAmbiguousTreeVisitor(self.callbacks).go(solutions[0])

# ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities
# according to the rules.
return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go()

return self.forest_tree_visitor.go(solutions[0])

Loading…
Cancel
Save