@@ -1,5 +1,6 @@ | |||
from .tree import Tree, Transformer, InlineTransformer | |||
from .common import ParseError, GrammarError | |||
from .lexer import UnexpectedInput, LexError | |||
from .lark import Lark | |||
from .utils import inline_args | |||
@@ -57,6 +57,7 @@ class LarkOptions(object): | |||
self.profile = o.pop('profile', False) | |||
self.ambiguity = o.pop('ambiguity', 'auto') | |||
self.propagate_positions = o.pop('propagate_positions', False) | |||
self.earley__predict_all = o.pop('earley__predict_all', False) | |||
assert self.parser in ('earley', 'lalr', None) | |||
@@ -40,6 +40,14 @@ class Token(Str): | |||
def __deepcopy__(self, memo): | |||
return Token(self.type, self.value, self.pos_in_stream, self.line, self.column) | |||
def __eq__(self, other):
    """Return whether ``other`` is equal to this token.

    Two ``Token`` objects whose ``type`` differs are never equal, even if
    their text is identical. Every other comparison (same-type tokens, or
    a non-``Token`` operand) is delegated to ``Str.__eq__``.
    """
    if isinstance(other, Token) and self.type != other.type:
        return False
    return Str.__eq__(self, other)

def __ne__(self, other):
    """Return the logical negation of ``__eq__``.

    Without this override ``!=`` resolves to the ``__ne__`` inherited from
    ``Str`` (assumed to be the built-in text type -- confirm against its
    definition), which compares text only and so would disagree with
    ``__eq__`` for same-text tokens of different types.
    """
    equal = self.__eq__(other)
    if equal is NotImplemented:
        # Let Python try the reflected operation on ``other``.
        return equal
    return not equal

# A class that overrides __eq__ without defining __hash__ becomes unhashable
# on Python 3; reuse the string hash so tokens keep working as dict keys and
# set members.
__hash__ = Str.__hash__
class Regex: | |||
def __init__(self, pattern, flags=()): | |||
self.pattern = pattern | |||
@@ -293,7 +293,6 @@ def _rfind(s, choices): | |||
def _fix_escaping(s): | |||
s = s.replace('\\"', '"').replace("'", "\\'") | |||
w = '' | |||
i = iter(s) | |||
for n in i: | |||
@@ -305,6 +304,7 @@ def _fix_escaping(s): | |||
elif n2 not in 'unftr': | |||
w += '\\' | |||
w += n2 | |||
w = w.replace('\\"', '"').replace("'", "\\'") | |||
to_eval = "u'''%s'''" % w | |||
try: | |||
@@ -435,9 +435,9 @@ class Grammar: | |||
for name, (tree, priority) in term_defs: # TODO transfer priority to rule? | |||
if name.startswith('_'): | |||
options = RuleOptions(filter_out=True, priority=priority) | |||
options = RuleOptions(filter_out=True, priority=-priority) | |||
else: | |||
options = RuleOptions(keep_all_tokens=True, create_token=name, priority=priority) | |||
options = RuleOptions(keep_all_tokens=True, create_token=name, priority=-priority) | |||
name = new_terminal_names[name] | |||
inner_name = name + '_inner' | |||
@@ -126,6 +126,7 @@ class XEarley: | |||
parser_conf.callback, | |||
resolve_ambiguity=get_ambiguity_resolver(options), | |||
ignore=ignore, | |||
predict_all=options.earley__predict_all | |||
) | |||
def _prepare_expansion(self, expansion): | |||
@@ -90,7 +90,7 @@ class NewsList(list): | |||
class Column: | |||
"An entry in the table, aka Earley Chart. Contains lists of items." | |||
def __init__(self, i, FIRST): | |||
def __init__(self, i, FIRST, predict_all=False): | |||
self.i = i | |||
self.to_reduce = NewsList() | |||
self.to_predict = NewsList() | |||
@@ -100,6 +100,7 @@ class Column: | |||
self.predicted = set() | |||
self.completed = {} | |||
self.predict_all = predict_all | |||
def add(self, items): | |||
"""Sort items into scan/predict/reduce newslists | |||
@@ -108,9 +109,9 @@ class Column: | |||
""" | |||
for item in items: | |||
item_key = item, item.tree # Elsewhere, tree is not part of the comparison | |||
if item.is_complete: | |||
# XXX Potential bug: What happens if there's ambiguity in an empty rule? | |||
item_key = item, item.tree # Elsewhere, tree is not part of the comparison | |||
if item.rule.expansion and item_key in self.completed: | |||
old_tree = self.completed[item_key].tree | |||
if old_tree == item.tree: | |||
@@ -137,9 +138,10 @@ class Column: | |||
if isinstance(item.expect, Terminal): | |||
self.to_scan.append(item) | |||
else: | |||
if item in self.predicted: | |||
k = item_key if self.predict_all else item | |||
if k in self.predicted: | |||
continue | |||
self.predicted.add(item) | |||
self.predicted.add(k) | |||
self.to_predict.append(item) | |||
self.item_count += 1 # Only count if actually added | |||
@@ -7,6 +7,10 @@ from ..common import ParseError, UnexpectedToken | |||
from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT | |||
class FinalReduce:
    """Wrapper marking a reduction result as the final result of the parse.

    The reduce step wraps its result in this class when it reduces the start
    symbol at end of input; the caller recognises that case with
    ``isinstance`` and unwraps ``.value``, instead of relying on the
    truthiness of the result itself.
    """

    def __init__(self, value):
        # The wrapped reduction result, returned to the caller unchanged.
        self.value = value
class Parser: | |||
def __init__(self, parser_conf): | |||
assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization" | |||
@@ -56,7 +60,7 @@ class _Parser: | |||
res = self.callbacks[rule](s) | |||
if end and len(state_stack) == 1 and rule.origin == self.start_symbol: | |||
return res | |||
return FinalReduce(res) | |||
_action, new_state = get_action(rule.origin) | |||
assert _action == ACTION_SHIFT | |||
@@ -85,9 +89,9 @@ class _Parser: | |||
_action, rule = get_action('$end') | |||
assert _action == 'reduce' | |||
res = reduce(*rule, end=True) | |||
if res: | |||
if isinstance(res, FinalReduce): | |||
assert state_stack == [self.init_state] and not value_stack, len(state_stack) | |||
return res | |||
return res.value | |||
@@ -9,56 +9,60 @@ from ..tree import Tree, Visitor_NoRecurse | |||
# Author: Erez Sh | |||
def _compare_rules(rule1, rule2): | |||
if rule1.origin != rule2.origin: | |||
if rule1.options and rule2.options: | |||
if rule1.options.priority is not None and rule2.options.priority is not None: | |||
assert rule1.options.priority != rule2.options.priority, "Priority is the same between both rules: %s == %s" % (rule1, rule2) | |||
return -compare(rule1.options.priority, rule2.options.priority) | |||
return 0 | |||
c = compare( len(rule1.expansion), len(rule2.expansion)) | |||
if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here | |||
c = -compare( len(rule1.expansion), len(rule2.expansion)) | |||
if rule1.origin.startswith('__'): # XXX hack! We should set priority in parser, not here | |||
c = -c | |||
return c | |||
def _compare_drv(tree1, tree2): | |||
if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)): | |||
def _sum_priority(tree): | |||
p = 0 | |||
for n in tree.iter_subtrees(): | |||
try: | |||
return -compare(tree1, tree2) | |||
except TypeError: | |||
return 0 | |||
p += n.rule.options.priority or 0 | |||
except AttributeError: | |||
pass | |||
return p | |||
def _compare_priority(tree1, tree2): | |||
tree1.iter_subtrees() | |||
def _compare_drv(tree1, tree2): | |||
try: | |||
rule1, rule2 = tree1.rule, tree2.rule | |||
except AttributeError: | |||
# Probably trees that don't take part in this parse (better way to distinguish?) | |||
return -compare(tree1, tree2) | |||
# Probably non-trees, or user trees that weren't created by the parse (better way to distinguish?) | |||
return compare(tree1, tree2) | |||
assert tree1.data != '_ambig' | |||
assert tree2.data != '_ambig' | |||
# XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse, | |||
# when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be | |||
# computationally inefficient. So we handle it here. | |||
if tree1.data == '_ambig': | |||
_standard_resolve_ambig(tree1) | |||
if tree2.data == '_ambig': | |||
_standard_resolve_ambig(tree2) | |||
p1 = _sum_priority(tree1) | |||
p2 = _sum_priority(tree2) | |||
c = (p1 or p2) and compare(p1, p2) | |||
if c: | |||
return c | |||
c = _compare_rules(tree1.rule, tree2.rule) | |||
if c: | |||
return c | |||
# rules are "equal", so compare trees | |||
for t1, t2 in zip(tree1.children, tree2.children): | |||
c = _compare_drv(t1, t2) | |||
if c: | |||
return c | |||
if len(tree1.children) == len(tree2.children): | |||
for t1, t2 in zip(tree1.children, tree2.children): | |||
c = _compare_drv(t1, t2) | |||
if c: | |||
return c | |||
return compare(len(tree1.children), len(tree2.children)) | |||
def _standard_resolve_ambig(tree): | |||
assert tree.data == '_ambig' | |||
best = min(tree.children, key=cmp_to_key(_compare_drv)) | |||
key_f = cmp_to_key(_compare_drv) | |||
best = max(tree.children, key=key_f) | |||
assert best.data == 'drv' | |||
tree.set('drv', best.children) | |||
tree.rule = best.rule # needed for applying callbacks | |||
@@ -80,23 +84,12 @@ def _antiscore_sum_drv(tree): | |||
if not isinstance(tree, Tree): | |||
return 0 | |||
# XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse, | |||
# when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be | |||
# computationally inefficient. So we handle it here. | |||
if tree.data == '_ambig': | |||
_antiscore_sum_resolve_ambig(tree) | |||
assert tree.data != '_ambig' | |||
try: | |||
priority = tree.rule.options.priority | |||
except AttributeError: | |||
# Probably trees that don't take part in this parse (better way to distinguish?) | |||
priority = None | |||
return (priority or 0) + sum(map(_antiscore_sum_drv, tree.children), 0) | |||
return _sum_priority(tree) | |||
def _antiscore_sum_resolve_ambig(tree): | |||
assert tree.data == '_ambig' | |||
best = min(tree.children, key=_antiscore_sum_drv) | |||
assert best.data == 'drv' | |||
tree.set('drv', best.children) | |||
@@ -28,11 +28,12 @@ from .grammar_analysis import GrammarAnalyzer | |||
from .earley import ApplyCallbacks, Item, Column | |||
class Parser: | |||
def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=()): | |||
def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False): | |||
self.analysis = GrammarAnalyzer(rules, start_symbol) | |||
self.start_symbol = start_symbol | |||
self.resolve_ambiguity = resolve_ambiguity | |||
self.ignore = list(ignore) | |||
self.predict_all = predict_all | |||
self.postprocess = {} | |||
@@ -107,9 +108,10 @@ class Parser: | |||
for j in range(1, len(s)): | |||
m = item.expect.match(s[:-j]) | |||
if m: | |||
delayed_matches[m.end()].append(item.advance(m.group(0))) | |||
t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
delayed_matches[i+m.end()].append(item.advance(t)) | |||
next_set = Column(i+1, self.FIRST) | |||
next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) | |||
next_set.add(delayed_matches[i+1]) | |||
del delayed_matches[i+1] # No longer needed, so unburden memory | |||
@@ -119,7 +121,7 @@ class Parser: | |||
return next_set | |||
# Main loop starts | |||
column0 = Column(0, self.FIRST) | |||
column0 = Column(0, self.FIRST, predict_all=self.predict_all) | |||
column0.add(predict(start_symbol, column0)) | |||
column = column0 | |||
@@ -67,17 +67,26 @@ class Tree(object): | |||
yield c | |||
def iter_subtrees(self): | |||
# TODO: Re-write as a more efficient version | |||
visited = set() | |||
q = [self] | |||
l = [] | |||
while q: | |||
subtree = q.pop() | |||
l.append( subtree ) | |||
if id(subtree) in visited: | |||
continue # already been here from another branch | |||
visited.add(id(subtree)) | |||
yield subtree | |||
q += [c for c in subtree.children if isinstance(c, Tree)] | |||
seen = set() | |||
for x in reversed(l): | |||
if id(x) not in seen: | |||
yield x | |||
seen.add(id(x)) | |||
def __deepcopy__(self, memo): | |||
return type(self)(self.data, deepcopy(self.children, memo)) | |||
@@ -100,7 +109,7 @@ class Transformer(object): | |||
if isinstance(c, Tree): | |||
try: | |||
items.append(self.transform(c)) | |||
except Erase: | |||
except Discard: | |||
pass | |||
try: | |||
f = self._get_func(tree.data) | |||
@@ -116,7 +125,7 @@ class Transformer(object): | |||
return TransformerChain(self, other) | |||
class Erase(Exception): | |||
class Discard(Exception): | |||
pass | |||
class TransformerChain(object): | |||
@@ -156,7 +165,7 @@ class Visitor_NoRecurse(Visitor): | |||
def visit(self, tree): | |||
subtrees = list(tree.iter_subtrees()) | |||
for subtree in reversed(subtrees): | |||
for subtree in (subtrees): | |||
getattr(self, subtree.data, self.__default__)(subtree) | |||
return tree | |||
@@ -174,13 +183,13 @@ class Transformer_NoRecurse(Transformer): | |||
else: | |||
return f(t) | |||
for subtree in reversed(subtrees): | |||
for subtree in (subtrees): | |||
children = [] | |||
for c in subtree.children: | |||
if isinstance(c, Tree): | |||
try: | |||
children.append(_t(c)) | |||
except Erase: | |||
except Discard: | |||
pass | |||
else: | |||
children.append(c) | |||
@@ -711,6 +711,19 @@ def _make_parser_test(LEXER, PARSER): | |||
""") | |||
x = g.parse('AB') | |||
@unittest.skipIf(LEXER == None, "Scanless can't handle regexps") | |||
def test_regex_quote(self): | |||
g = r""" | |||
start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING | |||
SINGLE_QUOTED_STRING : /'[^']*'/ | |||
DOUBLE_QUOTED_STRING : /"[^"]*"/ | |||
""" | |||
g = _Lark(g) | |||
self.assertEqual( g.parse('"hello"').children, ['"hello"']) | |||
self.assertEqual( g.parse("'hello'").children, ["'hello'"]) | |||
def test_lexer_token_limit(self): | |||
"Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation" | |||
tokens = {'A%d'%i:'"%d"'%i for i in range(300)} | |||