Browse Source

Implement Joop Leo's optimizations for right recursion performance

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.6.6
night199uk 6 years ago
parent
commit
04d90fa916
4 changed files with 303 additions and 76 deletions
  1. +112
    -25
      lark/parsers/earley.py
  2. +34
    -4
      lark/parsers/earley_common.py
  3. +45
    -22
      lark/parsers/earley_forest.py
  4. +112
    -25
      lark/parsers/xearley.py

+ 112
- 25
lark/parsers/earley.py View File

@@ -16,7 +16,7 @@ from ..visitors import Transformer_InPlace, v_args
from ..exceptions import ParseError, UnexpectedToken
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Item
from .earley_common import Item, TransitiveItem
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode

from collections import deque, defaultdict
@@ -28,6 +28,7 @@ class Parser:
self.resolve_ambiguity = resolve_ambiguity

self.FIRST = analysis.FIRST
self.NULLABLE = analysis.NULLABLE
self.callbacks = {}
self.predictions = {}

@@ -56,14 +57,68 @@ class Parser:
node_cache = {}
token_cache = {}
columns = []

def make_symbol_node(s, start, end):
label = (s, start, end)
if label in node_cache:
node = node_cache[label]
transitives = []

def is_quasi_complete(item):
    """Report whether ``item`` is complete, or only nullable symbols remain.

    A complete item is accepted immediately. Otherwise the item is advanced
    once past the symbol it currently expects, and every symbol still
    expected after that must be a member of ``self.NULLABLE``.

    Returns:
        True if ``item`` is complete or the advanced item reaches completion
        through NULLABLE symbols only; False as soon as a remaining symbol is
        not in ``self.NULLABLE``, or when both the rule's origin and the
        remaining symbol equal ``start_symbol``.
    """
    if item.is_complete:
        return True

    candidate = item.advance()
    while True:
        if candidate.is_complete:
            return True
        expected = candidate.expect
        if expected not in self.NULLABLE:
            return False
        # A start-symbol rule that is still waiting on the start symbol is rejected.
        if candidate.rule.origin == start_symbol and expected == start_symbol:
            return False
        candidate = candidate.advance()

def create_leo_transitives(item, trule, previous, visited = None):
# Build (or reuse) the TransitiveItem for item.rule.origin in column item.start,
# recursing through the chain of originators. Always returns a (trule, previous)
# pair; every early exit below hands the incoming pair back unchanged.
# `visited` collects the originators already walked so a cycle ends the recursion.
if visited is None:
visited = set()

# A transitive is already recorded for this origin in this column: reuse it
# as both trule and previous.
if item.rule.origin in transitives[item.start]:
previous = trule = transitives[item.start][item.rule.origin]
return trule, previous

# Nothing to do when the FIRST entry for the origin is empty/falsy.
is_empty_rule = not self.FIRST[item.rule.origin]
if is_empty_rule:
return trule, previous

# Look for exactly one item in the start column that expects this origin;
# a second match aborts, because the originator must be unique.
originator = None
for key in columns[item.start]:
if key.expect is not None and key.expect == item.rule.origin:
if originator is not None:
return trule, previous
originator = key

if originator is None:
return trule, previous

# Stop if this originator was already walked during this chain.
if originator in visited:
return trule, previous

visited.add(originator)
if not is_quasi_complete(originator):
return trule, previous

trule = originator.advance()
# The visited set is reset once the chain moves to an originator that
# started in a different column than the current item.
if originator.start != item.start:
visited.clear()

# Recurse on the originator first, so transitives further along its chain
# are created (or found) before this one.
trule, previous = create_leo_transitives(originator, trule, previous, visited)
if trule is None:
return trule, previous

titem = None
if previous is not None:
# Chain onto the transitive returned by the recursion: inherit its column
# and link it forward through next_titem.
titem = TransitiveItem(item.rule.origin, trule, originator, previous.column)
previous.next_titem = titem
else:
# NOTE(review): the next two lines use `node`, `node_cache[label]`, `s`, `start`
# and `end`, none of which are bound in this function. They look like leftover
# removed-side lines of the old make_symbol_node helper from the diff rendering;
# confirm against the real file before relying on them.
node = node_cache[label] = SymbolNode(s, start, end)
return node
titem = TransitiveItem(item.rule.origin, trule, originator, item.start)

# Record the new transitive for this origin in the item's start column.
previous = transitives[item.start][item.rule.origin] = titem
return trule, previous

def predict_and_complete(i, to_scan):
"""The core Earley Predictor and Completer.
@@ -84,23 +139,26 @@ class Parser:
### The Earley completer
if item.is_complete: ### (item.s == string)
if item.node is None:
item.node = make_symbol_node(item.s, item.start, i)
label = (item.s, item.start, i)
item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
item.node.add_family(item.s, item.rule, item.start, None, None)

# Empty has 0 length. If we complete an empty symbol in a particular
# parse step, we need to be able to use that same empty symbol to complete
# any predictions that result, that themselves require empty. Avoids
# infinite recursion on empty symbols.
# held_completions is 'H' in E.Scott's paper.
is_empty_item = item.start == i
if is_empty_item:
held_completions[item.rule.origin] = item.node

originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
for originator in originators:
new_item = originator.advance()
new_item.node = make_symbol_node(new_item.s, originator.start, i)
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
create_leo_transitives(item, None, None)

###R Joop Leo right recursion Completer
if item.rule.origin in transitives[item.start]:
transitive = transitives[item.start][item.s]
if transitive.previous in transitives[transitive.column]:
root_transitive = transitives[transitive.column][transitive.previous]
else:
root_transitive = transitive

label = (root_transitive.s, root_transitive.start, i)
node = vn = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
vn.add_path(root_transitive, item.node)

new_item = Item(transitive.rule, transitive.ptr, transitive.start)
new_item.node = vn
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
@@ -108,6 +166,30 @@ class Parser:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)
###R Regular Earley completer
else:
# Empty has 0 length. If we complete an empty symbol in a particular
# parse step, we need to be able to use that same empty symbol to complete
# any predictions that result, that themselves require empty. Avoids
# infinite recursion on empty symbols.
# held_completions is 'H' in E.Scott's paper.
is_empty_item = item.start == i
if is_empty_item:
held_completions[item.rule.origin] = item.node

originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
for originator in originators:
new_item = originator.advance()
label = (new_item.s, originator.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
elif new_item not in column:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)

### The Earley predictor
elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
@@ -119,7 +201,8 @@ class Parser:
# Process any held completions (H).
if item.expect in held_completions:
new_item = item.advance()
new_item.node = make_symbol_node(new_item.s, item.start, i)
label = (new_item.s, item.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
new_items.append(new_item)

@@ -141,11 +224,14 @@ class Parser:
next_to_scan = set()
next_set = set()
columns.append(next_set)
next_transitives = dict()
transitives.append(next_transitives)

for item in set(to_scan):
if match(item.expect, token):
new_item = item.advance()
new_item.node = make_symbol_node(new_item.s, new_item.start, i)
label = (new_item.s, new_item.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)

if new_item.expect in self.TERMINALS:
@@ -163,6 +249,7 @@ class Parser:

# Main loop starts
columns.append(set())
transitives.append(dict())

## The scan buffer. 'Q' in E.Scott's paper.
to_scan = set()


+ 34
- 4
lark/parsers/earley_common.py View File

@@ -13,12 +13,13 @@
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com

from ..grammar import NonTerminal, Terminal

class Item(object):
"An Earley Item, the atom of the algorithm."

__slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash')
__slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'previous', 'node', '_hash')
def __init__(self, rule, ptr, start):
assert isinstance(start, int), "start is not an int"
self.is_complete = len(rule.expansion) == ptr
self.rule = rule # rule
self.ptr = ptr # ptr
@@ -27,13 +28,15 @@ class Item(object):
if self.is_complete:
self.s = rule.origin
self.expect = None
self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
else:
self.s = (rule, ptr)
self.expect = rule.expansion[ptr]
self.previous = rule.expansion[ptr - 1] if ptr > 0 and len(rule.expansion) else None
self._hash = hash((self.s, self.start))

def advance(self):
return self.__class__(self.rule, self.ptr + 1, self.start)
return Item(self.rule, self.ptr + 1, self.start)

def __eq__(self, other):
return self is other or (self.s == other.s and self.start == other.start)
@@ -42,4 +45,31 @@ class Item(object):
return self._hash

def __repr__(self):
return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start)
before = ( expansion.name for expansion in self.rule.expansion[:self.ptr] )
after = ( expansion.name for expansion in self.rule.expansion[self.ptr:] )
symbol = "{} ::= {}* {}".format(self.rule.origin.name, ' '.join(before), ' '.join(after))
return '%s (%d)' % (symbol, self.start)


class TransitiveItem(Item):
    """An Item that carries the extra fields used by the Leo right-recursion completer.

    The underlying Item is built from ``trule`` (its rule, ptr and start).
    On top of that the instance stores:

    * ``recognized`` - the first constructor argument; part of the hash and of equality.
    * ``reduction``  - the originator item passed to the constructor.
    * ``column``     - the ``start`` constructor argument.
    * ``next_titem`` - link to a following TransitiveItem, initially None.
    """

    __slots__ = ('recognized', 'reduction', 'column', 'next_titem')

    def __init__(self, recognized, trule, originator, start):
        super(TransitiveItem, self).__init__(trule.rule, trule.ptr, trule.start)
        self.recognized = recognized
        self.reduction = originator
        self.column = start
        self.next_titem = None
        # Replace the hash computed by Item so that `recognized` takes part in it.
        self._hash = hash((self.s, self.start, self.recognized))

    def __eq__(self, other):
        if not isinstance(other, TransitiveItem):
            return False
        if self is other:
            return True
        # Compare the types of `s` first so differently-typed `s` values are never compared.
        return (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.recognized == other.recognized)

    def __hash__(self):
        return self._hash

    def __repr__(self):
        symbols = self.rule.expansion
        matched = ' '.join(sym.name for sym in symbols[:self.ptr])
        pending = ' '.join(sym.name for sym in symbols[self.ptr:])
        return '{} : {} -> {}* {} ({}, {})'.format(self.recognized.name, self.rule.origin.name, matched, pending, self.column, self.start)

+ 45
- 22
lark/parsers/earley_forest.py View File

@@ -12,7 +12,7 @@ from ..tree import Tree
from ..exceptions import ParseError
from ..lexer import Token
from ..utils import Str
from ..grammar import NonTerminal, Terminal
from ..grammar import NonTerminal, Terminal, Symbol

from collections import deque
from importlib import import_module
@@ -34,42 +34,65 @@ class SymbolNode(ForestNode):

Hence a Symbol Node with a single child is unambiguous.
"""
__slots__ = ('s', 'start', 'end', 'children', 'priority', 'is_intermediate')
__slots__ = ('s', 'start', 'end', '_children', 'paths', 'paths_loaded', 'priority', 'is_intermediate', '_hash')
def __init__(self, s, start, end):
self.s = s
self.start = start
self.end = end
self.children = set()
self._children = set()
self.paths = set()
self.paths_loaded = False
self.priority = None
self.is_intermediate = isinstance(s, tuple)
self._hash = hash((self.s, self.start, self.end))

def add_family(self, lr0, rule, start, left, right):
self.children.add(PackedNode(self, lr0, rule, start, left, right))
self._children.add(PackedNode(self, lr0, rule, start, left, right))

def add_path(self, transitive, node):
    """Record a ``(transitive, node)`` pair in ``self.paths`` for later expansion by ``load_paths``."""
    entry = (transitive, node)
    self.paths.add(entry)

def load_paths(self):
    """Turn every recorded ``(transitive, node)`` path into a family on this node.

    For each path, the left child is the node of the transitive's reduction
    item. The right child is ``node`` itself when the transitive has no
    ``next_titem``; otherwise it is a new SymbolNode built from the next
    transitive (ending at ``self.end``) that is given the path
    ``(next_titem, node)``. ``paths_loaded`` is set to True afterwards.
    """
    for titem, child in self.paths:
        following = titem.next_titem
        if following is None:
            right = child
        else:
            # Defer the rest of the chain to a fresh node carrying the next transitive.
            right = SymbolNode(following.s, following.start, self.end)
            right.add_path(following, child)
        reduction = titem.reduction
        self.add_family(reduction.rule.origin, reduction.rule, reduction.start, reduction.node, right)
    self.paths_loaded = True

@property
def is_ambiguous(self):
return len(self.children) > 1

@property
def children(self):
    """Return ``self._children``, calling ``load_paths()`` first if paths are not yet loaded."""
    if self.paths_loaded:
        return self._children
    self.load_paths()
    return self._children

def __iter__(self):
return iter(self.children)
return iter(self._children)

def __eq__(self, other):
if not isinstance(other, SymbolNode):
return False
return self is other or (self.s == other.s and self.start == other.start and self.end is other.end)
return self is other or (type(self.s) == type(other.s) and self.s == other.s and self.start == other.start and self.end is other.end)

def __hash__(self):
return hash((self.s, self.start, self.end))
return self._hash

def __repr__(self):
if self.is_intermediate:
rule = self.s[0]
ptr = self.s[1]
names = [ "{}*".format(expansion.name) if index == ptr else expansion.name for index, expansion in enumerate(rule.expansion) ]
symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names))
before = ( expansion.name for expansion in rule.expansion[:ptr] )
after = ( expansion.name for expansion in rule.expansion[ptr:] )
symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
else:
symbol = self.s.name
return "(%s, %d, %d, %d)" % (symbol, self.start, self.end, self.priority if self.priority is not None else 0)
return "({}, {}, {}, {})".format(symbol, self.start, self.end, self.priority if self.priority is not None else 0)

class PackedNode(ForestNode):
"""
@@ -115,11 +138,12 @@ class PackedNode(ForestNode):
if isinstance(self.s, tuple):
rule = self.s[0]
ptr = self.s[1]
names = [ "{}*".format(expansion.name) if index == ptr else expansion.name for index, expansion in enumerate(rule.expansion) ]
symbol = "{} ::= {}".format(rule.origin.name, ' '.join(names))
before = ( expansion.name for expansion in rule.expansion[:ptr] )
after = ( expansion.name for expansion in rule.expansion[ptr:] )
symbol = "{} ::= {}* {}".format(rule.origin.name, ' '.join(before), ' '.join(after))
else:
symbol = self.s.name
return "{%s, %d, %d}" % (symbol, self.start, self.priority if self.priority is not None else 0)
return "({}, {}, {})".format(symbol, self.start, self.priority)

class ForestVisitor(object):
"""
@@ -182,8 +206,8 @@ class ForestVisitor(object):

current_id = id(current)
if current_id in visiting:
if isinstance(current, PackedNode): vpno(current)
else: vsno(current)
if isinstance(current, PackedNode): vpno(current)
else: vsno(current)
input_stack.pop()
visiting.remove(current_id)
continue
@@ -226,7 +250,7 @@ class ForestSumVisitor(ForestVisitor):

def visit_symbol_node_out(self, node):
node.priority = max(child.priority for child in node.children)
node.children = sorted(node.children, reverse = True)
node._children = sorted(node.children, reverse = True)

class ForestAntiscoreSumVisitor(ForestSumVisitor):
"""
@@ -240,7 +264,7 @@ class ForestAntiscoreSumVisitor(ForestSumVisitor):
"""
def visit_symbol_node_out(self, node):
node.priority = min(child.priority for child in node.children)
node.children = sorted(node.children, key=AntiscoreSumComparator, reverse = True)
node._children = sorted(node.children, key=AntiscoreSumComparator, reverse = True)

class AntiscoreSumComparator(object):
"""
@@ -342,7 +366,7 @@ class ForestToAmbiguousTreeVisitor(ForestVisitor):
return iter(node.children)

def visit_symbol_node_out(self, node):
if node.is_ambiguous:
if not node.is_intermediate and node.is_ambiguous:
result = self.output_stack.pop()
if self.output_stack:
self.output_stack[-1].children.append(result)
@@ -386,8 +410,8 @@ class ForestToPyDotVisitor(ForestVisitor):
graph_node_id = str(id(node))
graph_node_label = "\"{}\"".format(node.value.replace('"', '\\"'))
graph_node_color = 0x808080
graph_node_style = "filled"
graph_node_shape = "polygon"
graph_node_style = "\"filled,rounded\""
graph_node_shape = "diamond"
graph_node = self.pydot.Node(graph_node_id, style=graph_node_style, fillcolor="#{:06x}".format(graph_node_color), shape=graph_node_shape, label=graph_node_label)
self.graph.add_node(graph_node)

@@ -422,7 +446,7 @@ class ForestToPyDotVisitor(ForestVisitor):
graph_node_id = str(id(node))
graph_node_label = repr(node)
graph_node_color = 0x808080
graph_node_style = "filled"
graph_node_style = "\"filled\""
if node.is_intermediate:
graph_node_shape = "ellipse"
else:
@@ -438,4 +462,3 @@ class ForestToPyDotVisitor(ForestVisitor):
child_graph_node_id = str(id(child))
child_graph_node = self.graph.get_node(child_graph_node_id)[0]
self.graph.add_edge(self.pydot.Edge(graph_node, child_graph_node))


+ 112
- 25
lark/parsers/xearley.py View File

@@ -24,7 +24,7 @@ from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal, Terminal
from .earley import ApplyCallbacks
from .earley_common import Item
from .earley_common import Item, TransitiveItem
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode


@@ -37,6 +37,7 @@ class Parser:
self.complete_lex = complete_lex

self.FIRST = analysis.FIRST
self.NULLABLE = analysis.NULLABLE
self.callbacks = {}
self.predictions = {}

@@ -64,17 +65,71 @@ class Parser:
node_cache = {}
token_cache = {}
columns = []
transitives = []

text_line = 1
text_column = 1

def make_symbol_node(s, start, end):
label = (s, start, end)
if label in node_cache:
node = node_cache[label]
def is_quasi_complete(item):
    """Report whether ``item`` is complete, or only nullable symbols remain.

    A complete item is accepted immediately. Otherwise the item is advanced
    once past the symbol it currently expects, and every symbol still
    expected after that must be a member of ``self.NULLABLE``.

    Returns:
        True if ``item`` is complete or the advanced item reaches completion
        through NULLABLE symbols only; False as soon as a remaining symbol is
        not in ``self.NULLABLE``, or when both the rule's origin and the
        remaining symbol equal ``start_symbol``.
    """
    if item.is_complete:
        return True

    candidate = item.advance()
    while True:
        if candidate.is_complete:
            return True
        expected = candidate.expect
        if expected not in self.NULLABLE:
            return False
        # A start-symbol rule that is still waiting on the start symbol is rejected.
        if candidate.rule.origin == start_symbol and expected == start_symbol:
            return False
        candidate = candidate.advance()

def create_leo_transitives(item, trule, previous, visited = None):
# Build (or reuse) the TransitiveItem for item.rule.origin in column item.start,
# recursing through the chain of originators. Always returns a (trule, previous)
# pair; every early exit below hands the incoming pair back unchanged.
# `visited` collects the originators already walked so a cycle ends the recursion.
if visited is None:
visited = set()

# A transitive is already recorded for this origin in this column: reuse it
# as both trule and previous.
if item.rule.origin in transitives[item.start]:
previous = trule = transitives[item.start][item.rule.origin]
return trule, previous

# Nothing to do when the FIRST entry for the origin is empty/falsy.
is_empty_rule = not self.FIRST[item.rule.origin]
if is_empty_rule:
return trule, previous

# Look for exactly one item in the start column that expects this origin;
# a second match aborts, because the originator must be unique.
originator = None
for key in columns[item.start]:
if key.expect is not None and key.expect == item.rule.origin:
if originator is not None:
return trule, previous
originator = key

if originator is None:
return trule, previous

# Stop if this originator was already walked during this chain.
if originator in visited:
return trule, previous

visited.add(originator)
if not is_quasi_complete(originator):
return trule, previous

trule = originator.advance()
# The visited set is reset once the chain moves to an originator that
# started in a different column than the current item.
if originator.start != item.start:
visited.clear()

# Recurse on the originator first, so transitives further along its chain
# are created (or found) before this one.
trule, previous = create_leo_transitives(originator, trule, previous, visited)
if trule is None:
return trule, previous

titem = None
if previous is not None:
# Chain onto the transitive returned by the recursion: inherit its column
# and link it forward through next_titem.
titem = TransitiveItem(item.rule.origin, trule, originator, previous.column)
previous.next_titem = titem
else:
# NOTE(review): the next two lines use `node`, `node_cache[label]`, `s`, `start`
# and `end`, none of which are bound in this function. They look like leftover
# removed-side lines of the old make_symbol_node helper from the diff rendering;
# confirm against the real file before relying on them.
node = node_cache[label] = SymbolNode(s, start, end)
return node
titem = TransitiveItem(item.rule.origin, trule, originator, item.start)

# Record the new transitive for this origin in the item's start column.
previous = transitives[item.start][item.rule.origin] = titem
return trule, previous

def predict_and_complete(i, to_scan):
"""The core Earley Predictor and Completer.
@@ -95,23 +150,26 @@ class Parser:
### The Earley completer
if item.is_complete: ### (item.s == string)
if item.node is None:
item.node = make_symbol_node(item.s, item.start, i)
label = (item.s, item.start, i)
item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
item.node.add_family(item.s, item.rule, item.start, None, None)

# Empty has 0 length. If we complete an empty symbol in a particular
# parse step, we need to be able to use that same empty symbol to complete
# any predictions that result, that themselves require empty. Avoids
# infinite recursion on empty symbols.
# held_completions is 'H' in E.Scott's paper.
is_empty_item = item.start == i
if is_empty_item:
held_completions[item.rule.origin] = item.node

originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
for originator in originators:
new_item = originator.advance()
new_item.node = make_symbol_node(new_item.s, originator.start, i)
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
create_leo_transitives(item, None, None)

###R Joop Leo right recursion Completer
if item.rule.origin in transitives[item.start]:
transitive = transitives[item.start][item.s]
if transitive.previous in transitives[transitive.column]:
root_transitive = transitives[transitive.column][transitive.previous]
else:
root_transitive = transitive

label = (root_transitive.s, root_transitive.start, i)
node = vn = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
vn.add_path(root_transitive, item.node)

new_item = Item(transitive.rule, transitive.ptr, transitive.start)
new_item.node = vn
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
@@ -119,6 +177,30 @@ class Parser:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)
###R Regular Earley completer
else:
# Empty has 0 length. If we complete an empty symbol in a particular
# parse step, we need to be able to use that same empty symbol to complete
# any predictions that result, that themselves require empty. Avoids
# infinite recursion on empty symbols.
# held_completions is 'H' in E.Scott's paper.
is_empty_item = item.start == i
if is_empty_item:
held_completions[item.rule.origin] = item.node

originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s]
for originator in originators:
new_item = originator.advance()
label = (new_item.s, originator.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node)
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
elif new_item not in column:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)

### The Earley predictor
elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
@@ -130,7 +212,8 @@ class Parser:
# Process any held completions (H).
if item.expect in held_completions:
new_item = item.advance()
new_item.node = make_symbol_node(new_item.s, item.start, i)
label = (new_item.s, item.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
new_items.append(new_item)

@@ -190,6 +273,8 @@ class Parser:
next_to_scan = set()
next_set = set()
columns.append(next_set)
next_transitives = dict()
transitives.append(next_transitives)

## 4) Process Tokens from delayed_matches.
# This is the core of the Earley scanner. Create an SPPF node for each Token,
@@ -199,8 +284,8 @@ class Parser:
for item, start, token in delayed_matches[i+1]:
if token is not None:
new_item = item.advance()
# new_item.start = start # Should we update this to account for gaps due to ignores?
new_item.node = make_symbol_node(new_item.s, new_item.start, i)
label = (new_item.s, new_item.start, i)
new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label))
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)
else:
new_item = item
@@ -221,6 +306,7 @@ class Parser:

# Main loop starts
columns.append(set())
transitives.append(dict())

## The scan buffer. 'Q' in E.Scott's paper.
to_scan = set()
@@ -248,6 +334,7 @@ class Parser:
# step in the Earley pass.
node_cache.clear()
token_cache.clear()
node_cache.clear()
to_scan = scan(i, to_scan)

if token == '\n':


Loading…
Cancel
Save