
Heavy modifications to the Earley parser to try and make it handle more types of ambiguity.

 - Rewritten along the lines of Elizabeth Scott's parser:
   https://www.sciencedirect.com/science/article/pii/S1571066108001497
 - Implement SPPF trees per Elizabeth Scott and Bram van der Sanden's work:
   http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.6.6
night199uk · 6 years ago
parent / commit 1d5fd7301a
10 changed files with 893 additions and 443 deletions:
  1. lark/lark.py (+4, -5)
  2. lark/parse_tree_builder.py (+31, -19)
  3. lark/parser_frontends.py (+7, -10)
  4. lark/parsers/earley.py (+168, -193)
  5. lark/parsers/earley_common.py (+80, -0)
  6. lark/parsers/earley_forest.py (+347, -0)
  7. lark/parsers/resolve_ambig.py (+0, -109)
  8. lark/parsers/xearley.py (+212, -96)
  9. lark/reconstruct.py (+1, -1)
  10. tests/test_parser.py (+43, -10)
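A minimal usage sketch of the new explicit-ambiguity flow, based on the updated tests below (the grammar is illustrative, and the attribute path to the callbacks follows the tests; it may differ by frontend): with ambiguity='explicit', parse() now returns the root SymbolNode of the SPPF, and ForestToAmbiguousTreeVisitor converts it into a Tree whose ambiguities appear as '_ambig' nodes.

from lark import Lark
from lark.parsers.earley_forest import ForestToAmbiguousTreeVisitor

# An illustrative ambiguous grammar: 'ab' can be derived two different ways.
parser = Lark(r"""
    start: ab
    ab: a "b"
      | "a" b
    a: "a"
    b: "b"
""", parser='earley', ambiguity='explicit')

root_symbol = parser.parse('ab')                       # SPPF root (SymbolNode)
ambig_tree = ForestToAmbiguousTreeVisitor(
    root_symbol, parser.parser.parser.callbacks).go()  # Tree with '_ambig' nodes
print(ambig_tree.pretty())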

lark/lark.py  (+4, -5)

@@ -62,14 +62,13 @@ class LarkOptions(object):
self.profile = o.pop('profile', False)
self.ambiguity = o.pop('ambiguity', 'auto')
self.propagate_positions = o.pop('propagate_positions', False)
self.earley__predict_all = o.pop('earley__predict_all', False)
self.lexer_callbacks = o.pop('lexer_callbacks', {})

assert self.parser in ('earley', 'lalr', 'cyk', None)

if self.parser == 'earley' and self.transformer:
raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm.'
'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. lalr)')
if self.ambiguity == 'explicit' and self.transformer:
raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm for explicit ambiguity.'
'Please use your transformer on the resulting Forest, or use a different algorithm (i.e. LALR)')

if o:
raise ValueError("Unknown options: %s" % o.keys())
@@ -176,7 +175,7 @@ class Lark:
def _build_parser(self):
self.parser_class = get_frontend(self.options.parser, self.options.lexer)

self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr')
self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit')
callback = self._parse_tree_builder.create_callback(self.options.transformer)
if self.profiler:
for f in dir(callback):


lark/parse_tree_builder.py  (+31, -19)

@@ -7,6 +7,7 @@ from .visitors import InlineTransformer # XXX Deprecated

###{standalone
from functools import partial, wraps
from itertools import repeat, product


class ExpandSingleChild:
@@ -62,23 +63,11 @@ class PropagatePositions:


class ChildFilter:
"Optimized childfilter (assumes no duplication in parse tree, so it's safe to change it)"
def __init__(self, to_include, node_builder):
self.node_builder = node_builder
self.to_include = to_include

def __call__(self, children):
filtered = []
for i, to_expand in self.to_include:
if to_expand:
filtered += children[i].children
else:
filtered.append(children[i])

return self.node_builder(filtered)

class ChildFilterLALR(ChildFilter):
"Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"

def __call__(self, children):
filtered = []
for i, to_expand in self.to_include:
@@ -89,19 +78,43 @@ class ChildFilterLALR(ChildFilter):
filtered = children[i].children
else:
filtered.append(children[i])

return self.node_builder(filtered)

def _should_expand(sym):
return not sym.is_term and sym.name.startswith('_')

def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous):
def maybe_create_child_filter(expansion, keep_all_tokens):
to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion)
if keep_all_tokens or not (sym.is_term and sym.filter_out)]

if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include)
return partial(ChildFilter, to_include)

class AmbiguousExpander:
"""Deal with the case where we're expanding children ('_rule') into a parent but the children
are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself
ambiguous with as many copies as there are ambiguous children, and then copy the ambiguous children
into the right parents in the right places, essentially shifting the ambiguity up the tree."""
def __init__(self, to_expand, tree_class, node_builder):
self.node_builder = node_builder
self.tree_class = tree_class
self.to_expand = to_expand

def __call__(self, children):
def _is_ambig_tree(child):
return hasattr(child, 'data') and child.data == '_ambig'

ambiguous = [i for i in self.to_expand if _is_ambig_tree(children[i])]
if ambiguous:
expand = [iter(child.children) if i in ambiguous else repeat(child) for i, child in enumerate(children)]
return self.tree_class('_ambig', [self.node_builder(list(f[0])) for f in product(zip(*expand))])
return self.node_builder(children)

def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens):
to_expand = [i for i, sym in enumerate(expansion)
if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))]
if to_expand:
return partial(AmbiguousExpander, to_expand, tree_class)

class Callback(object):
pass
@@ -113,8 +126,6 @@ def ptb_inline_args(func):
return func(*children)
return f



class ParseTreeBuilder:
def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False):
self.tree_class = tree_class
@@ -135,7 +146,8 @@ class ParseTreeBuilder:
wrapper_chain = filter(None, [
self.propagate_positions and PropagatePositions,
(expand_single_child and not rule.alias) and ExpandSingleChild,
maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous),
maybe_create_child_filter(rule.expansion, keep_all_tokens),
self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens),
])

yield rule, wrapper_chain
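The cartesian-product trick behind AmbiguousExpander, restated as a standalone sketch (the helper name and signature are illustrative, not part of this diff): each child contributes either its list of alternatives, if it is an '_ambig' node, or itself as the only choice, and the parent's child list is rebuilt once per combination.

from itertools import product

def expand_ambiguous_children(children, is_ambig):
    # One list of alternatives per child: the child's own alternatives if it
    # is an '_ambig' node, otherwise the child itself as the only choice.
    choices = [child.children if is_ambig(child) else [child] for child in children]
    # One copy of the parent's child list per combination; the caller wraps
    # these under a new '_ambig' node, shifting the ambiguity up the tree.
    return [list(combo) for combo in product(*choices)]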


lark/parser_frontends.py  (+7, -10)

@@ -4,8 +4,7 @@ from functools import partial
from .utils import get_regexp_width
from .parsers.grammar_analysis import GrammarAnalyzer
from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token

from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk
from .parsers import lalr_parser, earley, earley_forest, xearley, cyk
from .tree import Tree

class WithLexer:
@@ -54,13 +53,13 @@ class LALR_CustomLexer(WithLexer):
self.lexer = lexer_cls(lexer_conf)


def get_ambiguity_resolver(options):
def get_ambiguity_options(options):
if not options or options.ambiguity == 'resolve':
return resolve_ambig.standard_resolve_ambig
return {}
elif options.ambiguity == 'resolve__antiscore_sum':
return resolve_ambig.antiscore_sum_resolve_ambig
return {'forest_sum_visitor': earley_forest.ForestAntiscoreSumVisitor}
elif options.ambiguity == 'explicit':
return None
return {'resolve_ambiguity': False}
raise ValueError(options)

def tokenize_text(text):
@@ -76,8 +75,7 @@ class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
self.init_traditional_lexer(lexer_conf)

self.parser = earley.Parser(parser_conf, self.match,
resolve_ambiguity=get_ambiguity_resolver(options))
self.parser = earley.Parser(parser_conf, self.match, **get_ambiguity_options(options))

def match(self, term, token):
return term.name == token.type
@@ -89,11 +87,10 @@ class XEarley:

self._prepare_match(lexer_conf)

kw.update(get_ambiguity_options(options))
self.parser = xearley.Parser(parser_conf,
self.match,
resolve_ambiguity=get_ambiguity_resolver(options),
ignore=lexer_conf.ignore,
predict_all=options.earley__predict_all,
**kw
)



lark/parsers/earley.py  (+168, -193)

@@ -1,160 +1,44 @@
"This module implements an Earley Parser"

# The parser uses a parse-forest to keep track of derivations and ambiguations.
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
# (right now ambiguity resolution is not developed beyond the needs of lark)
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
# I use the no-recursion version of Transformer, because the tree might be
# deeper than Python's recursion limit (a bit absurd, but that's life)
#
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
"""This module implements an scanerless Earley parser.

The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
https://www.sciencedirect.com/science/article/pii/S1571066108001497

That is probably the best reference for understanding the algorithm here.

The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
is better documented here:
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
"""
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com

from ..tree import Tree
from ..visitors import Transformer_InPlace, v_args
from ..exceptions import ParseError, UnexpectedToken
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal
from .earley_common import Column, Item
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode


class Derivation(Tree):
def __init__(self, rule, items=None):
Tree.__init__(self, 'drv', items or [])
self.meta.rule = rule
self._hash = None

def _pretty_label(self): # Nicer pretty for debugging the parser
return self.meta.rule.origin.name if self.meta.rule else self.data

def __hash__(self):
if self._hash is None:
self._hash = Tree.__hash__(self)
return self._hash

class Item(object):
"An Earley Item, the atom of the algorithm."

def __init__(self, rule, ptr, start, tree):
self.rule = rule
self.ptr = ptr
self.start = start
self.tree = tree if tree is not None else Derivation(self.rule)

@property
def expect(self):
return self.rule.expansion[self.ptr]

@property
def is_complete(self):
return self.ptr == len(self.rule.expansion)

def advance(self, tree):
assert self.tree.data == 'drv'
new_tree = Derivation(self.rule, self.tree.children + [tree])
return self.__class__(self.rule, self.ptr+1, self.start, new_tree)

def __eq__(self, other):
return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule

def __hash__(self):
return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__

def __repr__(self):
before = list(map(str, self.rule.expansion[:self.ptr]))
after = list(map(str, self.rule.expansion[self.ptr:]))
return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))

class NewsList(list):
"Keeps track of newly added items (append-only)"

def __init__(self, initial=None):
list.__init__(self, initial or [])
self.last_iter = 0

def get_news(self):
i = self.last_iter
self.last_iter = len(self)
return self[i:]



class Column:
"An entry in the table, aka Earley Chart. Contains lists of items."
def __init__(self, i, FIRST, predict_all=False):
self.i = i
self.to_reduce = NewsList()
self.to_predict = NewsList()
self.to_scan = []
self.item_count = 0
self.FIRST = FIRST

self.predicted = set()
self.completed = {}
self.predict_all = predict_all

def add(self, items):
"""Sort items into scan/predict/reduce newslists

Makes sure only unique items are added.
"""
for item in items:

item_key = item, item.tree # Elsewhere, tree is not part of the comparison
if item.is_complete:
# XXX Potential bug: What happens if there's ambiguity in an empty rule?
if item.rule.expansion and item_key in self.completed:
old_tree = self.completed[item_key].tree
if old_tree == item.tree:
is_empty = not self.FIRST[item.rule.origin]
if not is_empty:
continue

if old_tree.data != '_ambig':
new_tree = old_tree.copy()
new_tree.meta.rule = old_tree.meta.rule
old_tree.set('_ambig', [new_tree])
old_tree.meta.rule = None # No longer a 'drv' node

if item.tree.children[0] is old_tree: # XXX a little hacky!
raise ParseError("Infinite recursion in grammar! (Rule %s)" % item.rule)

if item.tree not in old_tree.children:
old_tree.children.append(item.tree)
# old_tree.children.append(item.tree)
else:
self.completed[item_key] = item
self.to_reduce.append(item)
else:
if item.expect.is_term:
self.to_scan.append(item)
else:
k = item_key if self.predict_all else item
if k in self.predicted:
continue
self.predicted.add(k)
self.to_predict.append(item)

self.item_count += 1 # Only count if actually added


def __bool__(self):
return bool(self.item_count)
__nonzero__ = __bool__ # Py2 backwards-compatibility
from collections import deque, defaultdict

class Parser:
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None):
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor):
analysis = GrammarAnalyzer(parser_conf)
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
self.forest_sum_visitor = forest_sum_visitor

self.FIRST = analysis.FIRST
self.postprocess = {}
self.callbacks = {}
self.predictions = {}

## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
# the slow 'isupper' in is_terminal.
self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }

for rule in parser_conf.rules:
self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None)
self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

self.term_matcher = term_matcher
@@ -163,72 +47,163 @@ class Parser:
def parse(self, stream, start_symbol=None):
# Define parser functions
start_symbol = NonTerminal(start_symbol or self.parser_conf.start)

_Item = Item
match = self.term_matcher

def predict(nonterm, column):
assert not nonterm.is_term, nonterm
return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]]

def complete(item):
name = item.rule.origin
return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]

def predict_and_complete(column):
while True:
to_predict = {x.expect for x in column.to_predict.get_news()
if x.ptr} # if not part of an already predicted batch
to_reduce = set(column.to_reduce.get_news())
if not (to_predict or to_reduce):
break

for nonterm in to_predict:
column.add( predict(nonterm, column) )

for item in to_reduce:
new_items = list(complete(item))
if item in new_items:
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
column.add(new_items)

def scan(i, token, column):
next_set = Column(i, self.FIRST)
next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token))

if not next_set:
expect = {i.expect.name for i in column.to_scan}
raise UnexpectedToken(token, expect, considered_rules=set(column.to_scan))

return next_set
held_completions = defaultdict(list)
node_cache = {}
token_cache = {}

def make_symbol_node(s, start, end):
label = (s, start.i, end.i)
if label in node_cache:
node = node_cache[label]
else:
node = node_cache[label] = SymbolNode(s, start, end)
return node

def predict_and_complete(column, to_scan):
"""The core Earley Predictor and Completer.

At each stage of the input, we handle any completed items (things
that matched on the last cycle) and use those to predict what should
come next in the input stream. The completions and any predicted
non-terminals are recursively processed until we reach a set of items
expecting terminals, which can be added to the scan list for the next scanner cycle."""
held_completions.clear()

# R (items) = Ei (column.items)
items = deque(column.items)
while items:
item = items.pop() # remove an element, A say, from R

### The Earley completer
if item.is_complete: ### (item.s == string)
if item.node is None:
item.node = make_symbol_node(item.s, item.start, column)
item.node.add_family(item.s, item.rule, item.start, None, None)

# Empty has 0 length. If we complete an empty symbol in a particular
# parse step, we need to be able to use that same empty symbol to complete
# any predictions that result, that themselves require empty. Avoids
# infinite recursion on empty symbols.
# held_completions is 'H' in E.Scott's paper.
is_empty_item = item.start.i == column.i
if is_empty_item:
held_completions[item.rule.origin] = item.node

originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
for originator in originators:
new_item = originator.advance()
new_item.node = make_symbol_node(new_item.s, originator.start, column)
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
elif new_item not in column.items:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)

### The Earley predictor
elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
new_items = []
for rule in self.predictions[item.expect]:
new_item = Item(rule, 0, column)
new_items.append(new_item)

# Process any held completions (H).
if item.expect in held_completions:
new_item = item.advance()
new_item.node = make_symbol_node(new_item.s, item.start, column)
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
new_items.append(new_item)

for new_item in new_items:
if new_item.expect in self.TERMINALS:
to_scan.add(new_item)
elif new_item not in column.items:
column.add(new_item)
items.append(new_item)

def scan(i, token, column, to_scan):
"""The core Earley Scanner.

This is a custom implementation of the scanner that uses the
Lark lexer to match tokens. The scan list is built by the
Earley predictor, based on the previously completed tokens.
This ensures that at each phase of the parse we have a custom
lexer context, allowing for more complex ambiguities."""
next_set = Column(i+1, self.FIRST)
next_to_scan = set()
for item in set(to_scan):
if match(item.expect, token):
new_item = item.advance()
new_item.node = make_symbol_node(new_item.s, new_item.start, column)
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)

if new_item.expect in self.TERMINALS:
# add (B ::= Aai+1.B, h, y) to Q'
next_to_scan.add(new_item)
else:
# add (B ::= Aa+1.B, h, y) to Ei+1
next_set.add(new_item)

if not next_set and not next_to_scan:
expect = {i.expect.name for i in to_scan}
raise UnexpectedToken(token, expect, considered_rules = set(to_scan))

return next_set, next_to_scan

# Main loop starts
column0 = Column(0, self.FIRST)
column0.add(predict(start_symbol, column0))

column = column0

## The scan buffer. 'Q' in E.Scott's paper.
to_scan = set()

## Predict for the start_symbol.
# Add predicted items to the first Earley set (for the predictor) if they
# result in a non-terminal, or the scanner if they result in a terminal.
for rule in self.predictions[start_symbol]:
item = Item(rule, 0, column0)
if item.expect in self.TERMINALS:
to_scan.add(item)
else:
column.add(item)

## The main Earley loop.
# Run the Prediction/Completion cycle for any Items in the current Earley set.
# Completions will be added to the SPPF tree, and predictions will be recursively
# processed down to terminals/empty nodes to be added to the scanner for the next
# step.
for i, token in enumerate(stream):
predict_and_complete(column)
column = scan(i, token, column)
predict_and_complete(column, to_scan)

# Clear the node_cache and token_cache, which are only relevant for each
# step in the Earley pass.
node_cache.clear()
token_cache.clear()
column, to_scan = scan(i, token, column, to_scan)

predict_and_complete(column)
predict_and_complete(column, to_scan)

# Parse ended. Now build a parse tree
solutions = [n.tree for n in column.to_reduce
if n.rule.origin==start_symbol and n.start is column0]
## Column is now the final column in the parse. If the parse was successful, the start
# symbol should have been completed in the last step of the Earley cycle, and will be in
# this column. Find the item for the start_symbol, which is the root of the SPPF tree.
solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]

if not solutions:
raise ParseError('Incomplete parse: Could not find a solution to input')
elif len(solutions) == 1:
tree = solutions[0]
else:
tree = Tree('_ambig', solutions)

if self.resolve_ambiguity:
tree = self.resolve_ambiguity(tree)
elif len(solutions) > 1:
raise ParseError('Earley should not generate multiple start symbol items!')

return ApplyCallbacks(self.postprocess).transform(tree)
## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller.
# This means the caller can work directly with the SPPF tree.
if not self.resolve_ambiguity:
return solutions[0]

# ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities
# according to the rules.
return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go()

class ApplyCallbacks(Transformer_InPlace):
def __init__(self, postprocess):


lark/parsers/earley_common.py  (+80, -0)

@@ -0,0 +1,80 @@
"This module implements an Earley Parser"

# The parser uses a parse-forest to keep track of derivations and ambiguations.
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
# (right now ambiguity resolution is not developed beyond the needs of lark)
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
# I use the no-recursion version of Transformer, because the tree might be
# deeper than Python's recursion limit (a bit absurd, but that's life)
#
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com

## for recursive repr
from ..tree import Tree

class Derivation(Tree):
def __init__(self, rule, children = None):
Tree.__init__(self, 'drv', children if children is not None else [])
self.meta.rule = rule
self._hash = None

def __repr__(self, indent = 0):
return 'Derivation(%s, %s, %s)' % (self.data, self.rule.origin, '...')

def __hash__(self):
if self._hash is None:
self._hash = Tree.__hash__(self)
return self._hash

class Item(object):
"An Earley Item, the atom of the algorithm."

__slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash')
def __init__(self, rule, ptr, start):
self.is_complete = len(rule.expansion) == ptr
self.rule = rule # rule
self.ptr = ptr # ptr
self.start = start # j
self.node = None # w
if self.is_complete:
self.s = rule.origin
self.expect = None
else:
self.s = (rule, ptr)
self.expect = rule.expansion[ptr]
self._hash = hash((self.s, self.start.i))

def advance(self):
return self.__class__(self.rule, self.ptr + 1, self.start)

def __eq__(self, other):
return self is other or (self.s == other.s and self.start.i == other.start.i)

def __hash__(self):
return self._hash

def __repr__(self):
return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i)

class Column:
"An entry in the table, aka Earley Chart. Contains lists of items."
def __init__(self, i, FIRST):
self.i = i
self.items = set()
self.FIRST = FIRST

def add(self, item):
"""Sort items into scan/predict/reduce newslists

Makes sure only unique items are added.
"""
self.items.add(item)

def __bool__(self):
return bool(self.items)

__nonzero__ = __bool__ # Py2 backwards-compatibility

lark/parsers/earley_forest.py  (+347, -0)

@@ -0,0 +1,347 @@
""""This module implements an SPPF implementation

This is used as the primary output mechanism for the Earley parser
in order to store complex ambiguities.

Full reference and more details are available here:
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/
"""

from ..tree import Tree
from ..exceptions import ParseError
from ..lexer import Token
from ..utils import Str
from ..grammar import NonTerminal, Terminal
from .earley_common import Column, Derivation

from collections import deque

class SymbolNode(object):
"""
A Symbol Node represents a symbol (or Intermediate LR0).

Symbol nodes are keyed by the symbol (s). For intermediate nodes
s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol
nodes, s will be a string representing the non-terminal origin (i.e.
the left hand side of the rule).

The children of a Symbol or Intermediate Node will always be Packed Nodes;
with each Packed Node child representing a single derivation of a production.

Hence a Symbol Node with a single child is unambiguous.
"""
__slots__ = ('s', 'start', 'end', 'children', 'priority', 'is_intermediate')
def __init__(self, s, start, end):
self.s = s
self.start = start
self.end = end
self.children = set()
self.priority = None
self.is_intermediate = isinstance(s, tuple)

def add_family(self, lr0, rule, start, left, right):
self.children.add(PackedNode(self, lr0, rule, start, left, right))

@property
def is_ambiguous(self):
return len(self.children) > 1

def __iter__(self):
return iter(self.children)

def __eq__(self, other):
if not isinstance(other, SymbolNode):
return False
return self is other or (self.s == other.s and self.start == other.start and self.end is other.end)

def __hash__(self):
return hash((self.s, self.start.i, self.end.i))

def __repr__(self):
symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name
return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0)

class PackedNode(object):
"""
A Packed Node represents a single derivation in a symbol node.
"""
__slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash')
def __init__(self, parent, s, rule, start, left, right):
self.parent = parent
self.s = s
self.start = start
self.rule = rule
self.left = left
self.right = right
self.priority = None
self._hash = hash((self.s, self.start.i, self.left, self.right))

@property
def is_empty(self):
return self.left is None and self.right is None

def __iter__(self):
return iter([self.left, self.right])

def __lt__(self, other):
if self.is_empty and not other.is_empty: return True
if self.priority < other.priority: return True
return False

def __gt__(self, other):
if self.is_empty and not other.is_empty: return True
if self.priority > other.priority: return True
return False

def __eq__(self, other):
if not isinstance(other, PackedNode):
return False
return self is other or (self.s == other.s and self.start == other.start and self.left == other.left and self.right == other.right)

def __hash__(self):
return self._hash

def __repr__(self):
symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name
return "{%s, %d, %s, %s, %s}" % (symbol, self.start.i, self.left, self.right, self.priority if self.priority is not None else 0)

class ForestVisitor(object):
"""
An abstract base class for building forest visitors.

Use this as a base when you need to walk the forest.
"""
def __init__(self, root):
self.root = root
self.result = None

def visit_token_node(self, node): pass
def visit_symbol_node_in(self, node): pass
def visit_symbol_node_out(self, node): pass
def visit_packed_node_in(self, node): pass
def visit_packed_node_out(self, node): pass

def go(self):
# Visiting is a list of IDs of all symbol/intermediate nodes currently in
# the stack. It serves two purposes: to detect when we 'recurse' in and out
# of a symbol/intermediate so that we can process both up and down. Also,
# since the SPPF can have cycles it allows us to detect if we're trying
# to recurse into a node that's already on the stack (infinite recursion).
visiting = set()

# We do not use recursion here to walk the Forest due to the limited
# stack size in python. Therefore input_stack is essentially our stack.
input_stack = deque([self.root])

# It is much faster to cache these as locals since they are called
# many times in large parses.
vpno = getattr(self, 'visit_packed_node_out')
vpni = getattr(self, 'visit_packed_node_in')
vsno = getattr(self, 'visit_symbol_node_out')
vsni = getattr(self, 'visit_symbol_node_in')
vtn = getattr(self, 'visit_token_node')
while input_stack:
current = next(reversed(input_stack))
try:
next_node = next(current)
except StopIteration:
input_stack.pop()
continue
except TypeError:
### If the current object is not an iterator, pass through to Token/SymbolNode
pass
else:
if next_node is None:
continue

if id(next_node) in visiting:
raise ParseError("Infinite recursion in grammar!")

input_stack.append(next_node)
continue

if isinstance(current, Str):
vtn(current)
input_stack.pop()
continue

current_id = id(current)
if current_id in visiting:
if isinstance(current, PackedNode): vpno(current)
else: vsno(current)
input_stack.pop()
visiting.remove(current_id)
continue
else:
visiting.add(current_id)
if isinstance(current, PackedNode): next_node = vpni(current)
else: next_node = vsni(current)
if next_node is None:
continue

if id(next_node) in visiting:
raise ParseError("Infinite recursion in grammar!")

input_stack.append(next_node)
continue

return self.result

class ForestSumVisitor(ForestVisitor):
"""
A visitor for prioritizing ambiguous parts of the Forest.

This visitor is the default when resolving ambiguity. It pushes the priorities
from the rules into the SPPF nodes, and then sorts the packed-node children
of ambiguous symbol or intermediate nodes according to those priorities.
This relies on the custom sort function provided in PackedNode.__lt__, which
uses these properties (and other factors) to sort the ambiguous packed nodes.
"""
def visit_packed_node_in(self, node):
return iter([node.left, node.right])

def visit_symbol_node_in(self, node):
return iter(node.children)

def visit_packed_node_out(self, node):
node.priority = 0
if node.rule.options and node.rule.options.priority: node.priority += node.rule.options.priority
if node.right is not None and hasattr(node.right, 'priority'): node.priority += node.right.priority
if node.left is not None and hasattr(node.left, 'priority'): node.priority += node.left.priority

def visit_symbol_node_out(self, node):
node.priority = max(child.priority for child in node.children)
node.children = sorted(node.children, reverse = True)

class ForestAntiscoreSumVisitor(ForestSumVisitor):
"""
A visitor for prioritizing ambiguous parts of the Forest.

This visitor is used when resolve_ambiguity == 'resolve__antiscore_sum'.
It pushes the priorities from the rules into the SPPF nodes, and implements
a 'least cost' mechanism for resolving ambiguity (reverse of the default
priority mechanism). It uses a custom __lt__ comparator key for sorting
the packed node children.
"""
def visit_symbol_node_out(self, node):
node.priority = min(child.priority for child in node.children)
node.children = sorted(node.children, key=AntiscoreSumComparator, reverse = True)

class AntiscoreSumComparator(object):
"""
An antiscore-sum comparator for PackedNode objects.

This allows 'sorting' an iterable of PackedNode objects so that they
are arranged lowest priority first.
"""
__slots__ = ['obj']
def __init__(self, obj, *args):
self.obj = obj

def __lt__(self, other):
if self.obj.is_empty and not other.obj.is_empty: return True
if self.obj.priority > other.obj.priority: return True
return False

def __gt__(self, other):
if self.obj.is_empty and not other.obj.is_empty: return True
if self.obj.priority < other.obj.priority: return True
return False

class ForestToTreeVisitor(ForestVisitor):
"""
A Forest visitor which converts an SPPF forest to an unambiguous AST.

The implementation in this visitor walks only the first child of each
symbol node. When it finds an ambiguous symbol node it first
calls the forest_sum_visitor implementation to sort the children
into preference order using the algorithms defined there; so the first
child should always be the highest preference. The forest_sum_visitor
implementation should be another ForestVisitor which sorts the children
according to some priority mechanism.
"""
def __init__(self, root, forest_sum_visitor = ForestSumVisitor, callbacks = None):
super(ForestToTreeVisitor, self).__init__(root)
self.forest_sum_visitor = forest_sum_visitor
self.output_stack = deque()
self.callbacks = callbacks
self.result = None

def visit_token_node(self, node):
self.output_stack[-1].append(node)

def visit_symbol_node_in(self, node):
if node.is_ambiguous and node.priority is None:
self.forest_sum_visitor(node).go()
return next(iter(node.children))

def visit_packed_node_in(self, node):
if not node.parent.is_intermediate:
self.output_stack.append([])
return iter([node.left, node.right])

def visit_packed_node_out(self, node):
if not node.parent.is_intermediate:
result = self.callbacks[node.rule](self.output_stack.pop())
if self.output_stack:
self.output_stack[-1].append(result)
else:
self.result = result

class ForestToAmbiguousTreeVisitor(ForestVisitor):
"""
A Forest visitor which converts an SPPF forest to an ambiguous AST.

Because of the fundamental disparity between what can be stored in
an SPPF and what can be stored in a Tree, this implementation is not
complete. It correctly deals with ambiguities that occur on symbol nodes only,
and cannot deal with ambiguities that occur on intermediate nodes.

Usually, most parsers can be rewritten to avoid intermediate node
ambiguities. This implementation could also be fixed; however,
the code to handle intermediate node ambiguities is messy and
would not be performant. It is much better not to use this and
instead to correctly disambiguate the forest and only store unambiguous
parses in Trees. It is here just to provide some parity with the
old ambiguity='explicit'.

This is mainly used by the test framework, to make it simpler to write
tests ensuring the SPPF contains the right results.
"""
def __init__(self, root, callbacks):
super(ForestToAmbiguousTreeVisitor, self).__init__(root)
self.output_stack = deque()
self.callbacks = callbacks
self.result = None

def visit_token_node(self, node):
self.output_stack[-1].children.append(node)

def visit_symbol_node_in(self, node):
if not node.is_intermediate and node.is_ambiguous:
self.output_stack.append(Tree('_ambig', []))
return iter(node.children)

def visit_symbol_node_out(self, node):
if node.is_ambiguous:
result = self.output_stack.pop()
if self.output_stack:
self.output_stack[-1].children.append(result)
else:
self.result = result

def visit_packed_node_in(self, node):
#### NOTE:
## When an intermediate node (node.parent.s == tuple) has ambiguous children this
## forest visitor will break.
if not node.parent.is_intermediate:
self.output_stack.append(Tree('drv', []))
return iter([node.left, node.right])

def visit_packed_node_out(self, node):
if not node.parent.is_intermediate:
result = self.callbacks[node.rule](self.output_stack.pop().children)
if self.output_stack:
self.output_stack[-1].children.append(result)
else:
self.result = result
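The ForestVisitor base class can be reused for custom SPPF analyses. A minimal sketch under the API shown above (the class name and its purpose are illustrative): the visit_*_in hooks return the children to descend into, and go() drives the non-recursive walk.

from lark.parsers.earley_forest import ForestVisitor

class AmbiguityCounter(ForestVisitor):
    """Illustrative visitor: counts visits to ambiguous symbol/intermediate
    nodes (shared nodes reached via several paths may be counted more than once)."""
    def __init__(self, root):
        super(AmbiguityCounter, self).__init__(root)
        self.result = 0

    def visit_symbol_node_in(self, node):
        if node.is_ambiguous:
            self.result += 1
        return iter(node.children)            # descend into every packed child

    def visit_packed_node_in(self, node):
        return iter([node.left, node.right])  # None children are skipped by go()

# count = AmbiguityCounter(root_symbol).go()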

lark/parsers/resolve_ambig.py  (+0, -109)

@@ -1,109 +0,0 @@
from ..utils import compare
from functools import cmp_to_key

from ..tree import Tree


# Standard ambiguity resolver (uses comparison)
#
# Author: Erez Sh

def _compare_rules(rule1, rule2):
return -compare( len(rule1.expansion), len(rule2.expansion))

def _sum_priority(tree):
p = 0

for n in tree.iter_subtrees():
try:
p += n.meta.rule.options.priority or 0
except AttributeError:
pass

return p

def _compare_priority(tree1, tree2):
tree1.iter_subtrees()

def _compare_drv(tree1, tree2):
try:
rule1 = tree1.meta.rule
except AttributeError:
rule1 = None

try:
rule2 = tree2.meta.rule
except AttributeError:
rule2 = None

if None == rule1 == rule2:
return compare(tree1, tree2)
elif rule1 is None:
return -1
elif rule2 is None:
return 1

assert tree1.data != '_ambig'
assert tree2.data != '_ambig'

p1 = _sum_priority(tree1)
p2 = _sum_priority(tree2)
c = (p1 or p2) and compare(p1, p2)
if c:
return c

c = _compare_rules(tree1.meta.rule, tree2.meta.rule)
if c:
return c

# rules are "equal", so compare trees
if len(tree1.children) == len(tree2.children):
for t1, t2 in zip(tree1.children, tree2.children):
c = _compare_drv(t1, t2)
if c:
return c

return compare(len(tree1.children), len(tree2.children))


def _standard_resolve_ambig(tree):
assert tree.data == '_ambig'
key_f = cmp_to_key(_compare_drv)
best = max(tree.children, key=key_f)
assert best.data == 'drv'
tree.set('drv', best.children)
tree.meta.rule = best.meta.rule # needed for applying callbacks

def standard_resolve_ambig(tree):
for ambig in tree.find_data('_ambig'):
_standard_resolve_ambig(ambig)

return tree




# Anti-score Sum
#
# Author: Uriva (https://github.com/uriva)

def _antiscore_sum_drv(tree):
if not isinstance(tree, Tree):
return 0

assert tree.data != '_ambig'

return _sum_priority(tree)

def _antiscore_sum_resolve_ambig(tree):
assert tree.data == '_ambig'
best = min(tree.children, key=_antiscore_sum_drv)
assert best.data == 'drv'
tree.set('drv', best.children)
tree.meta.rule = best.meta.rule # needed for applying callbacks

def antiscore_sum_resolve_ambig(tree):
for ambig in tree.find_data('_ambig'):
_antiscore_sum_resolve_ambig(ambig)

return tree

lark/parsers/xearley.py  (+212, -96)

@@ -1,107 +1,163 @@
"This module implements an experimental Earley Parser with a dynamic lexer"

# The parser uses a parse-forest to keep track of derivations and ambiguations.
# When the parse ends successfully, a disambiguation stage resolves all ambiguity
# (right now ambiguity resolution is not developed beyond the needs of lark)
# Afterwards the parse tree is reduced (transformed) according to user callbacks.
# I use the no-recursion version of Transformer and Visitor, because the tree might be
# deeper than Python's recursion limit (a bit absurd, but that's life)
#
# The algorithm keeps track of each state set, using a corresponding Column instance.
# Column keeps track of new items using NewsList instances.
#
# Instead of running a lexer beforehand, or using a costy char-by-char method, this parser
# uses regular expressions by necessity, achieving high-performance while maintaining all of
# Earley's power in parsing any CFG.
#
#
"""This module implements an experimental Earley parser with a dynamic lexer

The core Earley algorithm used here is based on Elizabeth Scott's implementation, here:
https://www.sciencedirect.com/science/article/pii/S1571066108001497

That is probably the best reference for understanding the algorithm here.

The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format
is better documented here:
http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/

Instead of running a lexer beforehand, or using a costly char-by-char method, this parser
uses regular expressions by necessity, achieving high-performance while maintaining all of
Earley's power in parsing any CFG.
"""
# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com

from collections import defaultdict
from collections import defaultdict, deque

from ..exceptions import ParseError, UnexpectedCharacters
from ..lexer import Token
from ..tree import Tree
from .grammar_analysis import GrammarAnalyzer
from ..grammar import NonTerminal, Terminal

from .earley import ApplyCallbacks, Item, Column
from .earley import ApplyCallbacks
from .earley_common import Column, Item
from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode


class Parser:
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False, complete_lex=False):
self.analysis = GrammarAnalyzer(parser_conf)
def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor, ignore = (), complete_lex = False):
analysis = GrammarAnalyzer(parser_conf)
self.parser_conf = parser_conf
self.resolve_ambiguity = resolve_ambiguity
self.forest_sum_visitor = forest_sum_visitor
self.ignore = [Terminal(t) for t in ignore]
self.predict_all = predict_all
self.complete_lex = complete_lex

self.FIRST = self.analysis.FIRST
self.postprocess = {}
self.FIRST = analysis.FIRST
self.callbacks = {}
self.predictions = {}

## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than
# the slow 'isupper' in is_terminal.
self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term }
self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term }
for rule in parser_conf.rules:
self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None)
self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]

self.term_matcher = term_matcher


def parse(self, stream, start_symbol=None):
# Define parser functions
start_symbol = NonTerminal(start_symbol or self.parser_conf.start)
delayed_matches = defaultdict(list)
match = self.term_matcher

text_line = 1
text_column = 1

def predict(nonterm, column):
assert not nonterm.is_term, nonterm
return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]

def complete(item):
name = item.rule.origin
return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name]

def predict_and_complete(column):
while True:
to_predict = {x.expect for x in column.to_predict.get_news()
if x.ptr} # if not part of an already predicted batch
to_reduce = column.to_reduce.get_news()
if not (to_predict or to_reduce):
break

for nonterm in to_predict:
column.add( predict(nonterm, column) )
for item in to_reduce:
new_items = list(complete(item))
if item in new_items:
raise ParseError('Infinite recursion detected! (rule %s)' % item.rule)
column.add(new_items)

def scan(i, column):
to_scan = column.to_scan
# Held Completions (H in E. Scott's paper).
held_completions = {}

for x in self.ignore:
m = match(x, stream, i)
if m:
delayed_matches[m.end()] += set(to_scan)
delayed_matches[m.end()] += set(column.to_reduce)
# Cache for nodes & tokens created in a particular parse step.
node_cache = {}
token_cache = {}

# TODO add partial matches for ignore too?
# s = m.group(0)
# for j in range(1, len(s)):
# m = x.match(s[:-j])
# if m:
# delayed_matches[m.end()] += to_scan
text_line = 1
text_column = 0

for item in to_scan:
def make_symbol_node(s, start, end):
label = (s, start.i, end.i)
if label in node_cache:
node = node_cache[label]
else:
node = node_cache[label] = SymbolNode(s, start, end)
return node

def predict_and_complete(column, to_scan):
"""The core Earley Predictor and Completer.

At each stage of the input, we handle any completed items (things
that matched on the last cycle) and use those to predict what should
come next in the input stream. The completions and any predicted
non-terminals are recursively processed until we reach a set of items
expecting terminals, which can be added to the scan list for the next scanner cycle."""
held_completions.clear()

# R (items) = Ei (column.items)
items = deque(column.items)
while items:
item = items.pop() # remove an element, A say, from R

### The Earley completer
if item.is_complete: ### (item.s == string)
if item.node is None:
item.node = make_symbol_node(item.s, item.start, column)
item.node.add_family(item.s, item.rule, item.start, None, None)

# Empty has 0 length. If we complete an empty symbol in a particular
# parse step, we need to be able to use that same empty symbol to complete
# any predictions that result, that themselves require empty. Avoids
# infinite recursion on empty symbols.
# held_completions is 'H' in E.Scott's paper.
is_empty_item = item.start.i == column.i
if is_empty_item:
held_completions[item.rule.origin] = item.node

originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s]
for originator in originators:
new_item = originator.advance()
new_item.node = make_symbol_node(new_item.s, originator.start, column)
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node)
if new_item.expect in self.TERMINALS:
# Add (B :: aC.B, h, y) to Q
to_scan.add(new_item)
elif new_item not in column.items:
# Add (B :: aC.B, h, y) to Ei and R
column.add(new_item)
items.append(new_item)

### The Earley predictor
elif item.expect in self.NON_TERMINALS: ### (item.s == lr0)
new_items = []
for rule in self.predictions[item.expect]:
new_item = Item(rule, 0, column)
new_items.append(new_item)

# Process any held completions (H).
if item.expect in held_completions:
new_item = item.advance()
new_item.node = make_symbol_node(new_item.s, item.start, column)
new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect])
new_items.append(new_item)

for new_item in new_items:
if new_item.expect in self.TERMINALS:
to_scan.add(new_item)
elif new_item not in column.items:
column.add(new_item)
items.append(new_item)

def scan(i, column, to_scan):
"""The core Earley Scanner.

This is a custom implementation of the scanner that uses the
Lark lexer to match tokens. The scan list is built by the
Earley predictor, based on the previously completed tokens.
This ensures that at each phase of the parse we have a custom
lexer context, allowing for more complex ambiguities."""

# 1) Loop the expectations and ask the lexer to match.
# Since regexp is forward looking on the input stream, and we only
# want to process tokens when we hit the point in the stream at which
# they complete, we push all tokens into a buffer (delayed_matches), to
# be held possibly for a later parse step when we reach the point in the
# input stream at which they complete.
for item in set(to_scan):
m = match(item.expect, stream, i)
if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
delayed_matches[m.end()].append(item.advance(t))
delayed_matches[m.end()].append( (item, column, t) )

if self.complete_lex:
s = m.group(0)
@@ -109,25 +165,85 @@ class Parser:
m = match(item.expect, s[:-j])
if m:
t = Token(item.expect.name, m.group(0), i, text_line, text_column)
delayed_matches[i+m.end()].append(item.advance(t))
delayed_matches[i+m.end()].append( (item, column, t) )

# Remove any items that successfully matched in this pass from the to_scan buffer.
# This ensures we don't carry over tokens that already matched, if we're ignoring below.
to_scan.remove(item)

# 3) Process any ignores. This is typically used for e.g. whitespace.
# We carry over any unmatched items from the to_scan buffer to be matched again after
# the ignore. This should allow us to use ignored symbols in non-terminals to implement
# e.g. mandatory spacing.
for x in self.ignore:
m = match(x, stream, i)
if m:
# Carry over any items still in the scan buffer, to past the end of the ignored items.
delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ])

# If we're ignoring up to the end of the file, carry over the start symbol if it already completed.
delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol])

next_set = Column(i + 1, self.FIRST) # Ei+1
next_to_scan = set()

## 4) Process Tokens from delayed_matches.
# This is the core of the Earley scanner. Create an SPPF node for each Token,
# and create the symbol node in the SPPF tree. Advance the item that completed,
# and add the resulting new item to either the Earley set (for processing by the
# completer/predictor) or the to_scan buffer for the next parse step.
for item, start, token in delayed_matches[i+1]:
if token is not None:
new_item = item.advance()
new_item.node = make_symbol_node(new_item.s, new_item.start, column)
new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token)
else:
new_item = item

if new_item.expect in self.TERMINALS:
# add (B ::= Aai+1.B, h, y) to Q'
next_to_scan.add(new_item)
else:
# add (B ::= Aa+1.B, h, y) to Ei+1
next_set.add(new_item)

next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
next_set.add(delayed_matches[i+1])
del delayed_matches[i+1] # No longer needed, so unburden memory

if not next_set and not delayed_matches:
if not next_set and not delayed_matches and not next_to_scan:
raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan))

return next_set
return next_set, next_to_scan

# Main loop starts
column0 = Column(0, self.FIRST, predict_all=self.predict_all)
column0.add(predict(start_symbol, column0))

column0 = Column(0, self.FIRST)
column = column0

## The scan buffer. 'Q' in E.Scott's paper.
to_scan = set()

## Predict for the start_symbol.
# Add predicted items to the first Earley set (for the predictor) if they
# result in a non-terminal, or the scanner if they result in a terminal.
for rule in self.predictions[start_symbol]:
item = Item(rule, 0, column0)
if item.expect in self.TERMINALS:
to_scan.add(item)
else:
column.add(item)

## The main Earley loop.
# Run the Prediction/Completion cycle for any Items in the current Earley set.
# Completions will be added to the SPPF tree, and predictions will be recursively
# processed down to terminals/empty nodes to be added to the scanner for the next
# step.
for i, token in enumerate(stream):
predict_and_complete(column)
column = scan(i, column)
predict_and_complete(column, to_scan)

# Clear the node_cache and token_cache, which are only relevant for each
# step in the Earley pass.
node_cache.clear()
token_cache.clear()
column, to_scan = scan(i, column, to_scan)

if token == '\n':
text_line += 1
@@ -135,24 +251,24 @@ class Parser:
else:
text_column += 1

predict_and_complete(column)
predict_and_complete(column, to_scan)

# Parse ended. Now build a parse tree
solutions = [n.tree for n in column.to_reduce
if n.rule.origin==start_symbol and n.start is column0]
## Column is now the final column in the parse. If the parse was successful, the start
# symbol should have been completed in the last step of the Earley cycle, and will be in
# this column. Find the item for the start_symbol, which is the root of the SPPF tree.
solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0]

if not solutions:
expected_tokens = [t.expect for t in column.to_scan]
expected_tokens = [t.expect for t in to_scan]
raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)
elif len(solutions) > 1:
raise Exception('Earley should not generate more than one start symbol - bug')

elif len(solutions) == 1:
tree = solutions[0]
else:
tree = Tree('_ambig', solutions)

if self.resolve_ambiguity:
tree = self.resolve_ambiguity(tree)

return ApplyCallbacks(self.postprocess).transform(tree)

## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller.
# This means the caller can work directly with the SPPF tree.
if not self.resolve_ambiguity:
return solutions[0]

# ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities
# according to the rules.
return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go()
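A rough sketch of the delayed-match buffering described in the scanner comments above (names are illustrative, not the diff's code): a regexp match found while scanning position i is filed under the index where it ends, and is only advanced once the main loop reaches that index, so competing tokenisations can survive as ambiguities in the SPPF.

from collections import defaultdict

delayed_matches = defaultdict(list)

def remember_match(item, column, token, end_pos):
    # A token spanning stream[i:end_pos] is held until the parser builds the
    # Earley set for position end_pos, rather than being consumed immediately.
    delayed_matches[end_pos].append((item, column, token))

# With complete_lex=True the dynamic lexer also buffers shorter prefixes of a
# match under their own end positions, so several tokenisations stay live.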

lark/reconstruct.py  (+1, -1)

@@ -114,7 +114,7 @@ class Reconstructor:

def _reconstruct(self, tree):
# TODO: ambiguity?
parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=resolve_ambig.standard_resolve_ambig)
parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=True)
unreduced_tree = parser.parse(tree.children) # find a full derivation
assert unreduced_tree.data == tree.data
res = self.write_tokens.transform(unreduced_tree)


tests/test_parser.py  (+43, -10)

@@ -21,6 +21,8 @@ from lark.lark import Lark
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput
from lark.tree import Tree
from lark.visitors import Transformer
from lark.parsers.earley_forest import ForestToAmbiguousTreeVisitor
from lark.parsers.earley import ApplyCallbacks

__path__ = os.path.dirname(__file__)
def _read(n, *args):
@@ -236,10 +238,11 @@ def _make_full_earley_test(LEXER):
"""

parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
res = parser.parse('ab')

self.assertEqual( res.data, '_ambig')
self.assertEqual( len(res.children), 2)
root_symbol = parser.parse('ab')
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go()
print(ambig_tree.pretty())
self.assertEqual( ambig_tree.data, '_ambig')
self.assertEqual( len(ambig_tree.children), 2)

def test_ambiguity1(self):
grammar = """
@@ -251,9 +254,35 @@ def _make_full_earley_test(LEXER):

"""
l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER)
x = l.parse('cde')
assert x.data == '_ambig', x
assert len(x.children) == 2
root_symbol = l.parse('cde')
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, l.parser.parser.callbacks).go()
print(ambig_tree.pretty())
# tree = ApplyCallbacks(l.parser.parser.postprocess).transform(ambig_tree)

assert ambig_tree.data == '_ambig', ambig_tree
assert len(ambig_tree.children) == 2

@unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions")
def test_ambiguity2(self):
grammar = """
ANY: /[a-zA-Z0-9 ]+/
a.2: "A" b+
b.2: "B"
c: ANY

start: (a|c)*
"""
l = Lark(grammar, parser='earley', lexer=LEXER)
res = l.parse('ABX')
expected = Tree('start', [
Tree('a', [
Tree('b', [])
]),
Tree('c', [
'X'
])
])
self.assertEqual(res, expected)

def test_fruitflies_ambig(self):
grammar = """
@@ -272,7 +301,9 @@ def _make_full_earley_test(LEXER):
%ignore WS
"""
parser = Lark(grammar, ambiguity='explicit', lexer=LEXER)
res = parser.parse('fruit flies like bananas')
root_symbol = parser.parse('fruit flies like bananas')
tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go()
# tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree)

expected = Tree('_ambig', [
Tree('comparative', [
@@ -290,7 +321,7 @@ def _make_full_earley_test(LEXER):
# print res.pretty()
# print expected.pretty()

self.assertEqual(res, expected)
self.assertEqual(tree, expected)


@unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser")
@@ -303,7 +334,9 @@ def _make_full_earley_test(LEXER):
text = """cat"""

parser = Lark(grammar, start='start', ambiguity='explicit')
tree = parser.parse(text)
root_symbol = parser.parse(text)
ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol).go()
tree = ApplyCallbacks(parser.parser.parser.postprocess).transform(ambig_tree)
self.assertEqual(tree.data, '_ambig')

combinations = {tuple(str(s) for s in t.children) for t in tree.children}

