@@ -1,5 +1,6 @@
 from .tree import Tree, Transformer, InlineTransformer
 from .common import ParseError, GrammarError
+from .lexer import UnexpectedInput, LexError
 from .lark import Lark
 from .utils import inline_args
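
Re-exporting UnexpectedInput and LexError from the package root lets callers catch lexer errors without importing from lark.lexer directly. A minimal sketch, assuming a lexer-based parser so that a bad character fails in the lexer (grammar illustrative):

    from lark import Lark, UnexpectedInput

    parser = Lark('start: "a"', parser='lalr')
    try:
        parser.parse('b')      # no terminal matches 'b', so lexing fails
    except UnexpectedInput as e:
        print(e)
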
@@ -57,6 +57,7 @@ class LarkOptions(object):
         self.profile = o.pop('profile', False)
         self.ambiguity = o.pop('ambiguity', 'auto')
         self.propagate_positions = o.pop('propagate_positions', False)
+        self.earley__predict_all = o.pop('earley__predict_all', False)

         assert self.parser in ('earley', 'lalr', None)
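
The new earley__predict_all flag is popped from the options dict like its neighbors, so it can be passed straight to the Lark constructor; the hunks below thread it down to the Earley columns. A hedged usage sketch (the grammar is illustrative):

    from lark import Lark

    parser = Lark(r'''
        start: "a"+
    ''', parser='earley', earley__predict_all=True)
    tree = parser.parse('aaa')
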
@@ -40,6 +40,14 @@ class Token(Str):
     def __deepcopy__(self, memo):
         return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

+    def __eq__(self, other):
+        if isinstance(other, Token) and self.type != other.type:
+            return False
+
+        return Str.__eq__(self, other)
+
+    __hash__ = Str.__hash__
+
 class Regex:
     def __init__(self, pattern, flags=()):
         self.pattern = pattern
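
The new equality keeps ordinary string semantics against plain strings but distinguishes Tokens by type, while hashing stays purely string-based. A quick sketch, taking the constructor signature from __deepcopy__ above:

    t1 = Token('NAME', 'foo', 0, 1, 1)
    t2 = Token('STRING', 'foo', 0, 1, 1)
    assert t1 == 'foo'                # Token vs. str: plain string comparison
    assert t1 != t2                   # same text but different type
    assert hash(t1) == hash('foo')    # __hash__ = Str.__hash__, so dict/set keys still work by text
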
@@ -293,7 +293,6 @@ def _rfind(s, choices):

 def _fix_escaping(s):
-    s = s.replace('\\"', '"').replace("'", "\\'")
     w = ''
     i = iter(s)
     for n in i:
@@ -305,6 +304,7 @@ def _fix_escaping(s):
             elif n2 not in 'unftr':
                 w += '\\'
             w += n2
+    w = w.replace('\\"', '"').replace("'", "\\'")
     to_eval = "u'''%s'''" % w
     try:
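
The quote fix-up now runs on the fully processed accumulator w, immediately before it is wrapped in a u'''...''' literal and evaluated. A standalone sketch of those final steps (the value of w is hypothetical):

    w = 'don\'t say \\"hi\\"'                        # hypothetical loop output: don't say \"hi\"
    w = w.replace('\\"', '"').replace("'", "\\'")    # unescape ", escape ' for the wrapper
    value = eval("u'''%s'''" % w)                    # -> 'don\'t say "hi"'
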
@@ -435,9 +435,9 @@ class Grammar:
         for name, (tree, priority) in term_defs:    # TODO transfer priority to rule?
             if name.startswith('_'):
-                options = RuleOptions(filter_out=True, priority=priority)
+                options = RuleOptions(filter_out=True, priority=-priority)
             else:
-                options = RuleOptions(keep_all_tokens=True, create_token=name, priority=priority)
+                options = RuleOptions(keep_all_tokens=True, create_token=name, priority=-priority)

             name = new_terminal_names[name]
             inner_name = name + '_inner'
@@ -126,6 +126,7 @@ class XEarley:
                                  parser_conf.callback,
                                  resolve_ambiguity=get_ambiguity_resolver(options),
                                  ignore=ignore,
+                                 predict_all=options.earley__predict_all
                                  )

     def _prepare_expansion(self, expansion):
@@ -90,7 +90,7 @@ class NewsList(list):

 class Column:
     "An entry in the table, aka Earley Chart. Contains lists of items."
-    def __init__(self, i, FIRST):
+    def __init__(self, i, FIRST, predict_all=False):
         self.i = i
         self.to_reduce = NewsList()
         self.to_predict = NewsList()
@@ -100,6 +100,7 @@ class Column:
         self.predicted = set()
         self.completed = {}
+        self.predict_all = predict_all

     def add(self, items):
         """Sort items into scan/predict/reduce newslists
@@ -108,9 +109,9 @@ class Column:
         """
         for item in items:
+            item_key = item, item.tree   # Elsewhere, tree is not part of the comparison
             if item.is_complete:
                 # XXX Potential bug: What happens if there's ambiguity in an empty rule?
-                item_key = item, item.tree   # Elsewhere, tree is not part of the comparison
                 if item.rule.expansion and item_key in self.completed:
                     old_tree = self.completed[item_key].tree
                     if old_tree == item.tree:
@@ -137,9 +138,10 @@ class Column:
                 if isinstance(item.expect, Terminal):
                     self.to_scan.append(item)
                 else:
-                    if item in self.predicted:
+                    k = item_key if self.predict_all else item
+                    if k in self.predicted:
                         continue
-                    self.predicted.add(item)
+                    self.predicted.add(k)
                     self.to_predict.append(item)

             self.item_count += 1    # Only count if actually added
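
With predict_all enabled, the deduplication key for predicted items includes the partial tree, so predictions that differ only in their derivation are all retained in the column. A generic sketch of the keying rule (standalone illustration, not lark's classes):

    predicted = set()

    def try_add(item, item_key, predict_all):
        k = item_key if predict_all else item
        if k in predicted:
            return False       # duplicate under the active key
        predicted.add(k)
        return True            # caller appends the item to to_predict
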
@@ -7,6 +7,10 @@ from ..common import ParseError, UnexpectedToken
 from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT

+class FinalReduce:
+    def __init__(self, value):
+        self.value = value
+
 class Parser:
     def __init__(self, parser_conf):
         assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization"
@@ -56,7 +60,7 @@ class _Parser:
             res = self.callbacks[rule](s)

             if end and len(state_stack) == 1 and rule.origin == self.start_symbol:
-                return res
+                return FinalReduce(res)

             _action, new_state = get_action(rule.origin)
             assert _action == ACTION_SHIFT
@@ -85,9 +89,9 @@ class _Parser:
                 _action, rule = get_action('$end')
                 assert _action == 'reduce'
                 res = reduce(*rule, end=True)
-                if res:
+                if isinstance(res, FinalReduce):
                     assert state_stack == [self.init_state] and not value_stack, len(state_stack)
-                    return res
+                    return res.value
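
The FinalReduce wrapper separates "the start rule was reduced at end of input" from the value of that reduction; the old `if res:` check would misfire whenever the parse result itself was falsy. A self-contained illustration:

    class FinalReduce:
        def __init__(self, value):
            self.value = value

    res = FinalReduce([])                  # an empty list is a legitimate parse result
    assert isinstance(res, FinalReduce)    # still detected, even though res.value is falsy
    assert not res.value
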
@@ -9,56 +9,60 @@ from ..tree import Tree, Visitor_NoRecurse
 # Author: Erez Sh

 def _compare_rules(rule1, rule2):
-    if rule1.origin != rule2.origin:
-        if rule1.options and rule2.options:
-            if rule1.options.priority is not None and rule2.options.priority is not None:
-                assert rule1.options.priority != rule2.options.priority, "Priority is the same between both rules: %s == %s" % (rule1, rule2)
-                return -compare(rule1.options.priority, rule2.options.priority)
-
-        return 0
-
-    c = compare( len(rule1.expansion), len(rule2.expansion))
-    if rule1.origin.startswith('__'):   # XXX hack! We need to set priority in parser, not here
+    c = -compare( len(rule1.expansion), len(rule2.expansion))
+    if rule1.origin.startswith('__'):   # XXX hack! We should set priority in parser, not here
         c = -c
     return c

-def _compare_drv(tree1, tree2):
-    if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
+def _sum_priority(tree):
+    p = 0
+    for n in tree.iter_subtrees():
         try:
-            return -compare(tree1, tree2)
-        except TypeError:
-            return 0
+            p += n.rule.options.priority or 0
+        except AttributeError:
+            pass
+    return p

-def _compare_priority(tree1, tree2):
-    tree1.iter_subtrees()
-
+def _compare_drv(tree1, tree2):
     try:
         rule1, rule2 = tree1.rule, tree2.rule
     except AttributeError:
-        # Probably trees that don't take part in this parse (better way to distinguish?)
-        return -compare(tree1, tree2)
+        # Probably non-trees, or user trees that weren't created by the parse (better way to distinguish?)
+        return compare(tree1, tree2)

-    assert tree1.data != '_ambig'
-    assert tree2.data != '_ambig'
+    # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
+    # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
+    # computationally inefficient. So we handle it here.
+    if tree1.data == '_ambig':
+        _standard_resolve_ambig(tree1)
+    if tree2.data == '_ambig':
+        _standard_resolve_ambig(tree2)
+
+    p1 = _sum_priority(tree1)
+    p2 = _sum_priority(tree2)
+    c = (p1 or p2) and compare(p1, p2)
+    if c:
+        return c

     c = _compare_rules(tree1.rule, tree2.rule)
     if c:
         return c

     # rules are "equal", so compare trees
-    for t1, t2 in zip(tree1.children, tree2.children):
-        c = _compare_drv(t1, t2)
-        if c:
-            return c
+    if len(tree1.children) == len(tree2.children):
+        for t1, t2 in zip(tree1.children, tree2.children):
+            c = _compare_drv(t1, t2)
+            if c:
+                return c

     return compare(len(tree1.children), len(tree2.children))


 def _standard_resolve_ambig(tree):
     assert tree.data == '_ambig'
-    best = min(tree.children, key=cmp_to_key(_compare_drv))
+    key_f = cmp_to_key(_compare_drv)
+    best = max(tree.children, key=key_f)
     assert best.data == 'drv'
     tree.set('drv', best.children)
     tree.rule = best.rule   # needed for applying callbacks
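
The resolver now prefers derivations with higher summed priority and picks the best one with max() over a cmp_to_key-wrapped comparator. A standalone reminder of how cmp_to_key interacts with max/min (compare here is a stand-in for the helper lark imports from its utils):

    from functools import cmp_to_key

    def compare(a, b):                     # stand-in: negative, zero, or positive
        return -1 if a < b else (1 if a > b else 0)

    assert max([3, 1, 2], key=cmp_to_key(compare)) == 3
    assert min([3, 1, 2], key=cmp_to_key(compare)) == 1
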
@@ -80,23 +84,12 @@ def _antiscore_sum_drv(tree):
     if not isinstance(tree, Tree):
         return 0

-    # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
-    # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
-    # computationally inefficient. So we handle it here.
-    if tree.data == '_ambig':
-        _antiscore_sum_resolve_ambig(tree)
-    assert tree.data != '_ambig'
-
-    try:
-        priority = tree.rule.options.priority
-    except AttributeError:
-        # Probably trees that don't take part in this parse (better way to distinguish?)
-        priority = None
-
-    return (priority or 0) + sum(map(_antiscore_sum_drv, tree.children), 0)
+    return _sum_priority(tree)

 def _antiscore_sum_resolve_ambig(tree):
     assert tree.data == '_ambig'
     best = min(tree.children, key=_antiscore_sum_drv)
     assert best.data == 'drv'
     tree.set('drv', best.children)
@@ -28,11 +28,12 @@ from .grammar_analysis import GrammarAnalyzer
 from .earley import ApplyCallbacks, Item, Column

 class Parser:
-    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=()):
+    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False):
         self.analysis = GrammarAnalyzer(rules, start_symbol)
         self.start_symbol = start_symbol
         self.resolve_ambiguity = resolve_ambiguity
         self.ignore = list(ignore)
+        self.predict_all = predict_all

         self.postprocess = {}
@@ -107,9 +108,10 @@ class Parser:
                     for j in range(1, len(s)):
                         m = item.expect.match(s[:-j])
                         if m:
-                            delayed_matches[m.end()].append(item.advance(m.group(0)))
+                            t = Token(item.expect.name, m.group(0), i, text_line, text_column)
+                            delayed_matches[i+m.end()].append(item.advance(t))

-            next_set = Column(i+1, self.FIRST)
+            next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
             next_set.add(delayed_matches[i+1])
             del delayed_matches[i+1]    # No longer needed, so unburden memory
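
Partial matches now advance with a real Token carrying absolute position information instead of a bare string, and the match is filed under the absolute offset i+m.end() rather than a match-relative one. A sketch of the Token being constructed (signature as in the lexer hunk above; values illustrative):

    t = Token('STRING', '"he', 5, 1, 6)    # type, value, pos_in_stream, line, column
    assert t == '"he'                      # still usable wherever a str is expected
    assert t.pos_in_stream == 5
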
@@ -119,7 +121,7 @@ class Parser:
             return next_set

         # Main loop starts
-        column0 = Column(0, self.FIRST)
+        column0 = Column(0, self.FIRST, predict_all=self.predict_all)
         column0.add(predict(start_symbol, column0))

         column = column0
@@ -67,17 +67,26 @@ class Tree(object):
                 yield c

     def iter_subtrees(self):
+        # TODO: Re-write as a more efficient version
         visited = set()
         q = [self]

+        l = []
         while q:
             subtree = q.pop()
+            l.append( subtree )
             if id(subtree) in visited:
                 continue    # already been here from another branch
             visited.add(id(subtree))
-            yield subtree
             q += [c for c in subtree.children if isinstance(c, Tree)]

+        seen = set()
+        for x in reversed(l):
+            if id(x) not in seen:
+                yield x
+                seen.add(id(x))
+
     def __deepcopy__(self, memo):
         return type(self)(self.data, deepcopy(self.children, memo))
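
iter_subtrees now records the DFS order and yields it reversed, deduplicated by id, so every node is produced after its children; this is why the visitors below drop their reversed() calls. Tracing the new order on a small tree (assuming the usual Tree(data, children) constructor):

    t = Tree('a', [Tree('b', []), Tree('c', [Tree('d', [])])])
    assert [st.data for st in t.iter_subtrees()] == ['b', 'd', 'c', 'a']
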
@@ -100,7 +109,7 @@ class Transformer(object):
             if isinstance(c, Tree):
                 try:
                     items.append(self.transform(c))
-                except Erase:
+                except Discard:
                     pass
         try:
             f = self._get_func(tree.data)
@@ -116,7 +125,7 @@ class Transformer(object):
         return TransformerChain(self, other)

-class Erase(Exception):
+class Discard(Exception):
     pass

 class TransformerChain(object):
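
A transformer callback raises the renamed Discard to drop its subtree from the rebuilt children, as the except Discard handlers above and below show. Hypothetical usage:

    class StripComments(Transformer):
        def comment(self, items):
            raise Discard()    # this subtree is omitted from the parent's children
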
@@ -156,7 +165,7 @@ class Visitor_NoRecurse(Visitor):
     def visit(self, tree):
         subtrees = list(tree.iter_subtrees())

-        for subtree in reversed(subtrees):
+        for subtree in (subtrees):
             getattr(self, subtree.data, self.__default__)(subtree)
         return tree
@@ -174,13 +183,13 @@ class Transformer_NoRecurse(Transformer):
             else:
                 return f(t)

-        for subtree in reversed(subtrees):
+        for subtree in (subtrees):
             children = []
             for c in subtree.children:
                 if isinstance(c, Tree):
                     try:
                         children.append(_t(c))
-                    except Erase:
+                    except Discard:
                         pass
                 else:
                     children.append(c)
@@ -711,6 +711,19 @@ def _make_parser_test(LEXER, PARSER):
             """)
         x = g.parse('AB')

+    @unittest.skipIf(LEXER == None, "Scanless can't handle regexps")
+    def test_regex_quote(self):
+        g = r"""
+        start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
+        SINGLE_QUOTED_STRING : /'[^']*'/
+        DOUBLE_QUOTED_STRING : /"[^"]*"/
+        """
+        g = _Lark(g)
+        self.assertEqual( g.parse('"hello"').children, ['"hello"'])
+        self.assertEqual( g.parse("'hello'").children, ["'hello'"])
+
     def test_lexer_token_limit(self):
         "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
         tokens = {'A%d'%i:'"%d"'%i for i in range(300)}