From cb18cf5e7785dc2dbbcedefb252f18976301d5f3 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 10 Dec 2017 17:54:12 +0200 Subject: [PATCH 1/7] BUGFIX: iter_trees() wasn't consistent with a recursive order (Issue #47) --- lark/tree.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/lark/tree.py b/lark/tree.py index 90583db..24d24ba 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -67,17 +67,26 @@ class Tree(object): yield c def iter_subtrees(self): + # TODO: Re-write as a more efficient version + visited = set() q = [self] + l = [] while q: subtree = q.pop() + l.append( subtree ) if id(subtree) in visited: continue # already been here from another branch visited.add(id(subtree)) - yield subtree q += [c for c in subtree.children if isinstance(c, Tree)] + seen = set() + for x in reversed(l): + if id(x) not in seen: + yield x + seen.add(id(x)) + def __deepcopy__(self, memo): return type(self)(self.data, deepcopy(self.children, memo)) @@ -147,7 +156,7 @@ class Visitor_NoRecurse(Visitor): def visit(self, tree): subtrees = list(tree.iter_subtrees()) - for subtree in reversed(subtrees): + for subtree in (subtrees): getattr(self, subtree.data, self.__default__)(subtree) return tree @@ -165,7 +174,7 @@ class Transformer_NoRecurse(Transformer): else: return f(t) - for subtree in reversed(subtrees): + for subtree in (subtrees): subtree.children = [_t(c) if isinstance(c, Tree) else c for c in subtree.children] return _t(tree) From ddae93f92f38e0b513d7e1c43930287c3eed4547 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 10 Dec 2017 19:32:41 +0200 Subject: [PATCH 2/7] BUGFIX: Ambiguity resolution now sums priority (Issue #46) --- lark/load_grammar.py | 4 +-- lark/parsers/resolve_ambig.py | 62 +++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 31 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 0414230..092c118 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -435,9 +435,9 @@ class Grammar: for name, (tree, priority) in term_defs: # TODO transfer priority to rule? if name.startswith('_'): - options = RuleOptions(filter_out=True, priority=priority) + options = RuleOptions(filter_out=True, priority=-priority) else: - options = RuleOptions(keep_all_tokens=True, create_token=name, priority=priority) + options = RuleOptions(keep_all_tokens=True, create_token=name, priority=-priority) name = new_terminal_names[name] inner_name = name + '_inner' diff --git a/lark/parsers/resolve_ambig.py b/lark/parsers/resolve_ambig.py index f1a4431..c965433 100644 --- a/lark/parsers/resolve_ambig.py +++ b/lark/parsers/resolve_ambig.py @@ -9,56 +9,60 @@ from ..tree import Tree, Visitor_NoRecurse # Author: Erez Sh def _compare_rules(rule1, rule2): - if rule1.origin != rule2.origin: - if rule1.options and rule2.options: - if rule1.options.priority is not None and rule2.options.priority is not None: - assert rule1.options.priority != rule2.options.priority, "Priority is the same between both rules: %s == %s" % (rule1, rule2) - return -compare(rule1.options.priority, rule2.options.priority) - - return 0 - - c = compare( len(rule1.expansion), len(rule2.expansion)) - if rule1.origin.startswith('__'): # XXX hack! We need to set priority in parser, not here + c = -compare( len(rule1.expansion), len(rule2.expansion)) + if rule1.origin.startswith('__'): # XXX hack! We should set priority in parser, not here c = -c return c -def _compare_drv(tree1, tree2): - if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)): + +def _sum_priority(tree): + p = 0 + + for n in tree.iter_subtrees(): try: - return -compare(tree1, tree2) - except TypeError: - return 0 + p += n.rule.options.priority or 0 + except AttributeError: + pass + + return p + +def _compare_priority(tree1, tree2): + tree1.iter_subtrees() +def _compare_drv(tree1, tree2): try: rule1, rule2 = tree1.rule, tree2.rule except AttributeError: - # Probably trees that don't take part in this parse (better way to distinguish?) - return -compare(tree1, tree2) + # Probably non-trees, or user trees that weren't created by the parse (better way to distinguish?) + return compare(tree1, tree2) - # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse, - # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be - # computationally inefficient. So we handle it here. - if tree1.data == '_ambig': - _standard_resolve_ambig(tree1) - if tree2.data == '_ambig': - _standard_resolve_ambig(tree2) + assert tree1.data != '_ambig' + assert tree2.data != '_ambig' + + p1 = _sum_priority(tree1) + p2 = _sum_priority(tree2) + c = (p1 or p2) and compare(p1, p2) + if c: + return c c = _compare_rules(tree1.rule, tree2.rule) if c: return c # rules are "equal", so compare trees - for t1, t2 in zip(tree1.children, tree2.children): - c = _compare_drv(t1, t2) - if c: - return c + if len(tree1.children) == len(tree2.children): + for t1, t2 in zip(tree1.children, tree2.children): + c = _compare_drv(t1, t2) + if c: + return c return compare(len(tree1.children), len(tree2.children)) def _standard_resolve_ambig(tree): assert tree.data == '_ambig' - best = min(tree.children, key=cmp_to_key(_compare_drv)) + key_f = cmp_to_key(_compare_drv) + best = max(tree.children, key=key_f) assert best.data == 'drv' tree.set('drv', best.children) tree.rule = best.rule # needed for applying callbacks From 852607b978584ecdec68ac115dd8554cdb0a2305 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 11 Dec 2017 00:29:27 +0200 Subject: [PATCH 3/7] BUGFIX: Tokens of different type were equal, causing disambiguation errors (Issue #21) --- lark/lexer.py | 8 ++++++++ lark/parsers/resolve_ambig.py | 15 ++------------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index c24a5b3..4fcdc95 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -40,6 +40,14 @@ class Token(Str): def __deepcopy__(self, memo): return Token(self.type, self.value, self.pos_in_stream, self.line, self.column) + def __eq__(self, other): + if isinstance(other, Token) and self.type != other.type: + return False + + return Str.__eq__(self, other) + + __hash__ = Str.__hash__ + class Regex: def __init__(self, pattern, flags=()): self.pattern = pattern diff --git a/lark/parsers/resolve_ambig.py b/lark/parsers/resolve_ambig.py index c965433..f60a3f0 100644 --- a/lark/parsers/resolve_ambig.py +++ b/lark/parsers/resolve_ambig.py @@ -84,23 +84,12 @@ def _antiscore_sum_drv(tree): if not isinstance(tree, Tree): return 0 - # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse, - # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be - # computationally inefficient. So we handle it here. - if tree.data == '_ambig': - _antiscore_sum_resolve_ambig(tree) + assert tree.data != '_ambig' - try: - priority = tree.rule.options.priority - except AttributeError: - # Probably trees that don't take part in this parse (better way to distinguish?) - priority = None - - return (priority or 0) + sum(map(_antiscore_sum_drv, tree.children), 0) + return _sum_priority(tree) def _antiscore_sum_resolve_ambig(tree): assert tree.data == '_ambig' - best = min(tree.children, key=_antiscore_sum_drv) assert best.data == 'drv' tree.set('drv', best.children) From 53a56d5dcb68e5bcea80b2b26259432908572823 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 11 Dec 2017 17:55:00 +0200 Subject: [PATCH 4/7] Added LexError to lark's __init__ imports --- lark/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lark/__init__.py b/lark/__init__.py index cb92d75..ebd8d8d 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -1,5 +1,6 @@ from .tree import Tree, Transformer, InlineTransformer from .common import ParseError, GrammarError +from .lexer import UnexpectedInput, LexError from .lark import Lark from .utils import inline_args From 5748920df4ab6d2cb3eaf430752efc38ecfeaf13 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 12 Dec 2017 14:17:48 +0200 Subject: [PATCH 5/7] BUGFIX in xearley + Feature: earley__predict_all --- lark/lark.py | 1 + lark/parser_frontends.py | 1 + lark/parsers/earley.py | 10 ++++++---- lark/parsers/xearley.py | 10 ++++++---- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index b8c8efe..d8ee186 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -57,6 +57,7 @@ class LarkOptions(object): self.profile = o.pop('profile', False) self.ambiguity = o.pop('ambiguity', 'auto') self.propagate_positions = o.pop('propagate_positions', False) + self.earley__predict_all = o.pop('earley__predict_all', False) assert self.parser in ('earley', 'lalr', None) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 18264ce..718a0f9 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -126,6 +126,7 @@ class XEarley: parser_conf.callback, resolve_ambiguity=get_ambiguity_resolver(options), ignore=ignore, + predict_all=options.earley__predict_all ) def _prepare_expansion(self, expansion): diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 1926afa..55893f5 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -90,7 +90,7 @@ class NewsList(list): class Column: "An entry in the table, aka Earley Chart. Contains lists of items." - def __init__(self, i, FIRST): + def __init__(self, i, FIRST, predict_all=False): self.i = i self.to_reduce = NewsList() self.to_predict = NewsList() @@ -100,6 +100,7 @@ class Column: self.predicted = set() self.completed = {} + self.predict_all = predict_all def add(self, items): """Sort items into scan/predict/reduce newslists @@ -108,9 +109,9 @@ class Column: """ for item in items: + item_key = item, item.tree # Elsewhere, tree is not part of the comparison if item.is_complete: # XXX Potential bug: What happens if there's ambiguity in an empty rule? - item_key = item, item.tree # Elsewhere, tree is not part of the comparison if item.rule.expansion and item_key in self.completed: old_tree = self.completed[item_key].tree if old_tree == item.tree: @@ -137,9 +138,10 @@ class Column: if isinstance(item.expect, Terminal): self.to_scan.append(item) else: - if item in self.predicted: + k = item_key if self.predict_all else item + if k in self.predicted: continue - self.predicted.add(item) + self.predicted.add(k) self.to_predict.append(item) self.item_count += 1 # Only count if actually added diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 59ecb84..9b26190 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -28,11 +28,12 @@ from .grammar_analysis import GrammarAnalyzer from .earley import ApplyCallbacks, Item, Column class Parser: - def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=()): + def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False): self.analysis = GrammarAnalyzer(rules, start_symbol) self.start_symbol = start_symbol self.resolve_ambiguity = resolve_ambiguity self.ignore = list(ignore) + self.predict_all = predict_all self.postprocess = {} @@ -107,9 +108,10 @@ class Parser: for j in range(1, len(s)): m = item.expect.match(s[:-j]) if m: - delayed_matches[m.end()].append(item.advance(m.group(0))) + t = Token(item.expect.name, m.group(0), i, text_line, text_column) + delayed_matches[i+m.end()].append(item.advance(t)) - next_set = Column(i+1, self.FIRST) + next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) next_set.add(delayed_matches[i+1]) del delayed_matches[i+1] # No longer needed, so unburden memory @@ -119,7 +121,7 @@ class Parser: return next_set # Main loop starts - column0 = Column(0, self.FIRST) + column0 = Column(0, self.FIRST, predict_all=self.predict_all) column0.add(predict(start_symbol, column0)) column = column0 From fbeb0e6e59b8b0a1892ff9b13269f3e6686c2ab7 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 14 Dec 2017 12:48:43 +0200 Subject: [PATCH 6/7] BUGFIX: tree-less transformer may hang for empty values (Issue #49) --- lark/parsers/lalr_parser.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 07b3a0c..5b6f336 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -7,6 +7,10 @@ from ..common import ParseError, UnexpectedToken from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT +class FinalReduce: + def __init__(self, value): + self.value = value + class Parser: def __init__(self, parser_conf): assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization" @@ -56,7 +60,7 @@ class _Parser: res = self.callbacks[rule](s) if end and len(state_stack) == 1 and rule.origin == self.start_symbol: - return res + return FinalReduce(res) _action, new_state = get_action(rule.origin) assert _action == ACTION_SHIFT @@ -85,9 +89,9 @@ class _Parser: _action, rule = get_action('$end') assert _action == 'reduce' res = reduce(*rule, end=True) - if res: + if isinstance(res, FinalReduce): assert state_stack == [self.init_state] and not value_stack, len(state_stack) - return res + return res.value From 209ac5ab4e893ef59a3bb4bdb5e01c612fa7ff64 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 14 Dec 2017 17:20:03 +0200 Subject: [PATCH 7/7] BUGFIX: Mishandling of quotes (Issue #50) --- lark/load_grammar.py | 2 +- tests/test_parser.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 092c118..72e2e22 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -293,7 +293,6 @@ def _rfind(s, choices): def _fix_escaping(s): - s = s.replace('\\"', '"').replace("'", "\\'") w = '' i = iter(s) for n in i: @@ -305,6 +304,7 @@ def _fix_escaping(s): elif n2 not in 'unftr': w += '\\' w += n2 + w = w.replace('\\"', '"').replace("'", "\\'") to_eval = "u'''%s'''" % w try: diff --git a/tests/test_parser.py b/tests/test_parser.py index 9ef7ab5..d93e33b 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -711,6 +711,19 @@ def _make_parser_test(LEXER, PARSER): """) x = g.parse('AB') + @unittest.skipIf(LEXER == None, "Scanless can't handle regexps") + def test_regex_quote(self): + g = r""" + start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING + SINGLE_QUOTED_STRING : /'[^']*'/ + DOUBLE_QUOTED_STRING : /"[^"]*"/ + """ + + g = _Lark(g) + self.assertEqual( g.parse('"hello"').children, ['"hello"']) + self.assertEqual( g.parse("'hello'").children, ["'hello'"]) + + def test_lexer_token_limit(self): "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation" tokens = {'A%d'%i:'"%d"'%i for i in range(300)}