| @@ -1,5 +1,6 @@ | |||
| from .tree import Tree, Transformer, InlineTransformer | |||
| from .common import ParseError, GrammarError | |||
| from .lexer import UnexpectedInput, LexError | |||
| from .lark import Lark | |||
| from .utils import inline_args | |||
| @@ -57,6 +57,7 @@ class LarkOptions(object): | |||
| self.profile = o.pop('profile', False) | |||
| self.ambiguity = o.pop('ambiguity', 'auto') | |||
| self.propagate_positions = o.pop('propagate_positions', False) | |||
| self.earley__predict_all = o.pop('earley__predict_all', False) | |||
| assert self.parser in ('earley', 'lalr', None) | |||
| @@ -40,6 +40,14 @@ class Token(Str): | |||
def __deepcopy__(self, memo):
    """Return a new Token built from this one's type, value, pos_in_stream, line and column.

    ``memo`` is accepted to satisfy the ``copy.deepcopy`` protocol but is
    not consulted.
    """
    fields = (self.type, self.value, self.pos_in_stream, self.line, self.column)
    return Token(*fields)
def __eq__(self, other):
    """Compare by text, but never equal to a Token of a different type.

    Two Tokens whose ``type`` differs are unequal even when their text
    matches.  Every other comparison is delegated to ``Str.__eq__``.
    """
    if isinstance(other, Token) and self.type != other.type:
        return False
    return Str.__eq__(self, other)

def __ne__(self, other):
    """Return the inverse of ``__eq__`` so that ``==`` and ``!=`` always agree.

    NOTE(review): assumes ``Str`` is the built-in text type, whose own
    ``__ne__`` compares text only.  Without this override, two same-text
    Tokens of different types would be neither ``==`` nor ``!=``.
    """
    equal = self.__eq__(other)
    if equal is NotImplemented:
        # Let Python try the reflected comparison on the other operand.
        return equal
    return not equal

# Defining __eq__ in a class body suppresses the inherited __hash__ on
# Python 3; restore the Str hash so Tokens remain hashable.
__hash__ = Str.__hash__
| class Regex: | |||
| def __init__(self, pattern, flags=()): | |||
| self.pattern = pattern | |||
| @@ -293,7 +293,6 @@ def _rfind(s, choices): | |||
| def _fix_escaping(s): | |||
| s = s.replace('\\"', '"').replace("'", "\\'") | |||
| w = '' | |||
| i = iter(s) | |||
| for n in i: | |||
| @@ -305,6 +304,7 @@ def _fix_escaping(s): | |||
| elif n2 not in 'unftr': | |||
| w += '\\' | |||
| w += n2 | |||
| w = w.replace('\\"', '"').replace("'", "\\'") | |||
| to_eval = "u'''%s'''" % w | |||
| try: | |||
| @@ -435,9 +435,9 @@ class Grammar: | |||
| for name, (tree, priority) in term_defs: # TODO transfer priority to rule? | |||
| if name.startswith('_'): | |||
| options = RuleOptions(filter_out=True, priority=priority) | |||
| options = RuleOptions(filter_out=True, priority=-priority) | |||
| else: | |||
| options = RuleOptions(keep_all_tokens=True, create_token=name, priority=priority) | |||
| options = RuleOptions(keep_all_tokens=True, create_token=name, priority=-priority) | |||
| name = new_terminal_names[name] | |||
| inner_name = name + '_inner' | |||
| @@ -126,6 +126,7 @@ class XEarley: | |||
| parser_conf.callback, | |||
| resolve_ambiguity=get_ambiguity_resolver(options), | |||
| ignore=ignore, | |||
| predict_all=options.earley__predict_all | |||
| ) | |||
| def _prepare_expansion(self, expansion): | |||
| @@ -90,7 +90,7 @@ class NewsList(list): | |||
| class Column: | |||
| "An entry in the table, aka Earley Chart. Contains lists of items." | |||
| def __init__(self, i, FIRST): | |||
| def __init__(self, i, FIRST, predict_all=False): | |||
| self.i = i | |||
| self.to_reduce = NewsList() | |||
| self.to_predict = NewsList() | |||
| @@ -100,6 +100,7 @@ class Column: | |||
| self.predicted = set() | |||
| self.completed = {} | |||
| self.predict_all = predict_all | |||
| def add(self, items): | |||
| """Sort items into scan/predict/reduce newslists | |||
| @@ -108,9 +109,9 @@ class Column: | |||
| """ | |||
| for item in items: | |||
| item_key = item, item.tree # Elsewhere, tree is not part of the comparison | |||
| if item.is_complete: | |||
| # XXX Potential bug: What happens if there's ambiguity in an empty rule? | |||
| item_key = item, item.tree # Elsewhere, tree is not part of the comparison | |||
| if item.rule.expansion and item_key in self.completed: | |||
| old_tree = self.completed[item_key].tree | |||
| if old_tree == item.tree: | |||
| @@ -137,9 +138,10 @@ class Column: | |||
| if isinstance(item.expect, Terminal): | |||
| self.to_scan.append(item) | |||
| else: | |||
| if item in self.predicted: | |||
| k = item_key if self.predict_all else item | |||
| if k in self.predicted: | |||
| continue | |||
| self.predicted.add(item) | |||
| self.predicted.add(k) | |||
| self.to_predict.append(item) | |||
| self.item_count += 1 # Only count if actually added | |||
| @@ -7,6 +7,10 @@ from ..common import ParseError, UnexpectedToken | |||
| from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT | |||
class FinalReduce:
    """Wrapper that marks a value as the result of the final reduction.

    The parser tests ``isinstance(res, FinalReduce)`` and then unwraps
    ``.value``, so the decision does not depend on the truthiness of the
    parse result itself.
    """

    def __init__(self, value):
        # The wrapped parse result, returned to the caller as-is.
        self.value = value
| class Parser: | |||
| def __init__(self, parser_conf): | |||
| assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization" | |||
| @@ -56,7 +60,7 @@ class _Parser: | |||
| res = self.callbacks[rule](s) | |||
| if end and len(state_stack) == 1 and rule.origin == self.start_symbol: | |||
| return res | |||
| return FinalReduce(res) | |||
| _action, new_state = get_action(rule.origin) | |||
| assert _action == ACTION_SHIFT | |||
| @@ -85,9 +89,9 @@ class _Parser: | |||
| _action, rule = get_action('$end') | |||
| assert _action == 'reduce' | |||
| res = reduce(*rule, end=True) | |||
| if res: | |||
| if isinstance(res, FinalReduce): | |||
| assert state_stack == [self.init_state] and not value_stack, len(state_stack) | |||
| return res | |||
| return res.value | |||
| @@ -9,56 +9,60 @@ from ..tree import Tree, Visitor_NoRecurse | |||
| # Author: Erez Sh | |||
| def _compare_rules(rule1, rule2): | |||
| if rule1.origin != rule2.origin: | |||
| if rule1.options and rule2.options: | |||
| if rule1.options.priority is not None and rule2.options.priority is not None: | |||
| assert rule1.options.priority != rule2.options.priority, "Priority is the same between both rules: %s == %s" % (rule1, rule2) | |||
| return -compare(rule1.options.priority, rule2.options.priority) | |||
| return 0 | |||
| c = compare( len(rule1.expansion), len(rule2.expansion)) | |||
| if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here | |||
| c = -compare( len(rule1.expansion), len(rule2.expansion)) | |||
| if rule1.origin.startswith('__'): # XXX hack! We should set priority in parser, not here | |||
| c = -c | |||
| return c | |||
| def _compare_drv(tree1, tree2): | |||
| if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)): | |||
| def _sum_priority(tree): | |||
| p = 0 | |||
| for n in tree.iter_subtrees(): | |||
| try: | |||
| return -compare(tree1, tree2) | |||
| except TypeError: | |||
| return 0 | |||
| p += n.rule.options.priority or 0 | |||
| except AttributeError: | |||
| pass | |||
| return p | |||
| def _compare_priority(tree1, tree2): | |||
| tree1.iter_subtrees() | |||
| def _compare_drv(tree1, tree2): | |||
| try: | |||
| rule1, rule2 = tree1.rule, tree2.rule | |||
| except AttributeError: | |||
| # Probably trees that don't take part in this parse (better way to distinguish?) | |||
| return -compare(tree1, tree2) | |||
| # Probably non-trees, or user trees that weren't created by the parse (better way to distinguish?) | |||
| return compare(tree1, tree2) | |||
| assert tree1.data != '_ambig' | |||
| assert tree2.data != '_ambig' | |||
| # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse, | |||
| # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be | |||
| # computationally inefficient. So we handle it here. | |||
| if tree1.data == '_ambig': | |||
| _standard_resolve_ambig(tree1) | |||
| if tree2.data == '_ambig': | |||
| _standard_resolve_ambig(tree2) | |||
| p1 = _sum_priority(tree1) | |||
| p2 = _sum_priority(tree2) | |||
| c = (p1 or p2) and compare(p1, p2) | |||
| if c: | |||
| return c | |||
| c = _compare_rules(tree1.rule, tree2.rule) | |||
| if c: | |||
| return c | |||
| # rules are "equal", so compare trees | |||
| for t1, t2 in zip(tree1.children, tree2.children): | |||
| c = _compare_drv(t1, t2) | |||
| if c: | |||
| return c | |||
| if len(tree1.children) == len(tree2.children): | |||
| for t1, t2 in zip(tree1.children, tree2.children): | |||
| c = _compare_drv(t1, t2) | |||
| if c: | |||
| return c | |||
| return compare(len(tree1.children), len(tree2.children)) | |||
| def _standard_resolve_ambig(tree): | |||
| assert tree.data == '_ambig' | |||
| best = min(tree.children, key=cmp_to_key(_compare_drv)) | |||
| key_f = cmp_to_key(_compare_drv) | |||
| best = max(tree.children, key=key_f) | |||
| assert best.data == 'drv' | |||
| tree.set('drv', best.children) | |||
| tree.rule = best.rule # needed for applying callbacks | |||
| @@ -80,23 +84,12 @@ def _antiscore_sum_drv(tree): | |||
| if not isinstance(tree, Tree): | |||
| return 0 | |||
| # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse, | |||
| # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be | |||
| # computationally inefficient. So we handle it here. | |||
| if tree.data == '_ambig': | |||
| _antiscore_sum_resolve_ambig(tree) | |||
| assert tree.data != '_ambig' | |||
| try: | |||
| priority = tree.rule.options.priority | |||
| except AttributeError: | |||
| # Probably trees that don't take part in this parse (better way to distinguish?) | |||
| priority = None | |||
| return (priority or 0) + sum(map(_antiscore_sum_drv, tree.children), 0) | |||
| return _sum_priority(tree) | |||
| def _antiscore_sum_resolve_ambig(tree): | |||
| assert tree.data == '_ambig' | |||
| best = min(tree.children, key=_antiscore_sum_drv) | |||
| assert best.data == 'drv' | |||
| tree.set('drv', best.children) | |||
| @@ -28,11 +28,12 @@ from .grammar_analysis import GrammarAnalyzer | |||
| from .earley import ApplyCallbacks, Item, Column | |||
| class Parser: | |||
| def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=()): | |||
| def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False): | |||
| self.analysis = GrammarAnalyzer(rules, start_symbol) | |||
| self.start_symbol = start_symbol | |||
| self.resolve_ambiguity = resolve_ambiguity | |||
| self.ignore = list(ignore) | |||
| self.predict_all = predict_all | |||
| self.postprocess = {} | |||
| @@ -107,9 +108,10 @@ class Parser: | |||
| for j in range(1, len(s)): | |||
| m = item.expect.match(s[:-j]) | |||
| if m: | |||
| delayed_matches[m.end()].append(item.advance(m.group(0))) | |||
| t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
| delayed_matches[i+m.end()].append(item.advance(t)) | |||
| next_set = Column(i+1, self.FIRST) | |||
| next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) | |||
| next_set.add(delayed_matches[i+1]) | |||
| del delayed_matches[i+1] # No longer needed, so unburden memory | |||
| @@ -119,7 +121,7 @@ class Parser: | |||
| return next_set | |||
| # Main loop starts | |||
| column0 = Column(0, self.FIRST) | |||
| column0 = Column(0, self.FIRST, predict_all=self.predict_all) | |||
| column0.add(predict(start_symbol, column0)) | |||
| column = column0 | |||
| @@ -67,17 +67,26 @@ class Tree(object): | |||
| yield c | |||
| def iter_subtrees(self): | |||
| # TODO: Re-write as a more efficient version | |||
| visited = set() | |||
| q = [self] | |||
| l = [] | |||
| while q: | |||
| subtree = q.pop() | |||
| l.append( subtree ) | |||
| if id(subtree) in visited: | |||
| continue # already been here from another branch | |||
| visited.add(id(subtree)) | |||
| yield subtree | |||
| q += [c for c in subtree.children if isinstance(c, Tree)] | |||
| seen = set() | |||
| for x in reversed(l): | |||
| if id(x) not in seen: | |||
| yield x | |||
| seen.add(id(x)) | |||
| def __deepcopy__(self, memo): | |||
| return type(self)(self.data, deepcopy(self.children, memo)) | |||
| @@ -100,7 +109,7 @@ class Transformer(object): | |||
| if isinstance(c, Tree): | |||
| try: | |||
| items.append(self.transform(c)) | |||
| except Erase: | |||
| except Discard: | |||
| pass | |||
| try: | |||
| f = self._get_func(tree.data) | |||
| @@ -116,7 +125,7 @@ class Transformer(object): | |||
| return TransformerChain(self, other) | |||
| class Erase(Exception): | |||
| class Discard(Exception): | |||
| pass | |||
| class TransformerChain(object): | |||
| @@ -156,7 +165,7 @@ class Visitor_NoRecurse(Visitor): | |||
| def visit(self, tree): | |||
| subtrees = list(tree.iter_subtrees()) | |||
| for subtree in reversed(subtrees): | |||
| for subtree in (subtrees): | |||
| getattr(self, subtree.data, self.__default__)(subtree) | |||
| return tree | |||
| @@ -174,13 +183,13 @@ class Transformer_NoRecurse(Transformer): | |||
| else: | |||
| return f(t) | |||
| for subtree in reversed(subtrees): | |||
| for subtree in (subtrees): | |||
| children = [] | |||
| for c in subtree.children: | |||
| if isinstance(c, Tree): | |||
| try: | |||
| children.append(_t(c)) | |||
| except Erase: | |||
| except Discard: | |||
| pass | |||
| else: | |||
| children.append(c) | |||
| @@ -711,6 +711,19 @@ def _make_parser_test(LEXER, PARSER): | |||
| """) | |||
| x = g.parse('AB') | |||
| @unittest.skipIf(LEXER == None, "Scanless can't handle regexps") | |||
| def test_regex_quote(self): | |||
| g = r""" | |||
| start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING | |||
| SINGLE_QUOTED_STRING : /'[^']*'/ | |||
| DOUBLE_QUOTED_STRING : /"[^"]*"/ | |||
| """ | |||
| g = _Lark(g) | |||
| self.assertEqual( g.parse('"hello"').children, ['"hello"']) | |||
| self.assertEqual( g.parse("'hello'").children, ["'hello'"]) | |||
| def test_lexer_token_limit(self): | |||
| "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation" | |||
| tokens = {'A%d'%i:'"%d"'%i for i in range(300)} | |||