Merge branch 'master' of https://github.com/erezsh/lark

6 年之前 · 9426010b70
--- a/lark/__init__.py
+++ b/lark/__init__.py
@@ -1,5 +1,6 @@
 from .tree import Tree, Transformer, InlineTransformer
 from .common import ParseError, GrammarError
 from .lexer import UnexpectedInput, LexError
 from .lark import Lark
 from .utils import inline_args

--- a/lark/lark.py
+++ b/lark/lark.py
@@ -57,6 +57,7 @@ class LarkOptions(object):
        self.profile = o.pop('profile', False)
        self.ambiguity = o.pop('ambiguity', 'auto')
        self.propagate_positions = o.pop('propagate_positions', False)
        self.earley__predict_all = o.pop('earley__predict_all', False)

        assert self.parser in ('earley', 'lalr', None)

--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -40,6 +40,14 @@ class Token(Str):
    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        """Compare token type first, then fall back to string equality.

        Returns False when ``other`` is a Token whose ``type`` differs from
        this one. In every other case (same type, or ``other`` is not a
        Token) the result is whatever ``Str.__eq__`` returns.
        """
        type_clash = isinstance(other, Token) and self.type != other.type
        return False if type_clash else Str.__eq__(self, other)

    __hash__ = Str.__hash__

 class Regex:
    def __init__(self, pattern, flags=()):
        self.pattern = pattern
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -293,7 +293,6 @@ def _rfind(s, choices):


 def _fix_escaping(s):
    s = s.replace('\\"', '"').replace("'", "\\'")
    w = ''
    i = iter(s)
    for n in i:
@@ -305,6 +304,7 @@ def _fix_escaping(s):
            elif n2 not in 'unftr':
                w += '\\'
            w += n2
    w = w.replace('\\"', '"').replace("'", "\\'")

    to_eval = "u'''%s'''" % w
    try:
@@ -435,9 +435,9 @@ class Grammar:

        for name, (tree, priority) in term_defs:   # TODO transfer priority to rule?
            if name.startswith('_'):
                options = RuleOptions(filter_out=True, priority=priority)
                options = RuleOptions(filter_out=True, priority=-priority)
            else:
                options = RuleOptions(keep_all_tokens=True, create_token=name, priority=priority)
                options = RuleOptions(keep_all_tokens=True, create_token=name, priority=-priority)

            name = new_terminal_names[name]
            inner_name = name + '_inner'
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -126,6 +126,7 @@ class XEarley:
                                    parser_conf.callback,
                                    resolve_ambiguity=get_ambiguity_resolver(options),
                                    ignore=ignore,
                                    predict_all=options.earley__predict_all
                                    )

    def _prepare_expansion(self, expansion):
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -90,7 +90,7 @@ class NewsList(list):

 class Column:
    "An entry in the table, aka Earley Chart. Contains lists of items."
    def __init__(self, i, FIRST):
    def __init__(self, i, FIRST, predict_all=False):
        self.i = i
        self.to_reduce = NewsList()
        self.to_predict = NewsList()
@@ -100,6 +100,7 @@ class Column:

        self.predicted = set()
        self.completed = {}
        self.predict_all = predict_all

    def add(self, items):
        """Sort items into scan/predict/reduce newslists
@@ -108,9 +109,9 @@ class Column:
        """
        for item in items:

            item_key = item, item.tree  # Elsewhere, tree is not part of the comparison
            if item.is_complete:
                # XXX Potential bug: What happens if there's ambiguity in an empty rule?
                item_key = item, item.tree  # Elsewhere, tree is not part of the comparison
                if item.rule.expansion and item_key in self.completed:
                    old_tree = self.completed[item_key].tree
                    if old_tree == item.tree:
@@ -137,9 +138,10 @@ class Column:
                if isinstance(item.expect, Terminal):
                    self.to_scan.append(item)
                else:
                    if item in self.predicted:
                    k = item_key if self.predict_all else item
                    if k in self.predicted:
                        continue
                    self.predicted.add(item)
                    self.predicted.add(k)
                    self.to_predict.append(item)

            self.item_count += 1    # Only count if actually added
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -7,6 +7,10 @@ from ..common import ParseError, UnexpectedToken

 from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT

 class FinalReduce:
    """Wrapper that tags a value as the result of the parser's last reduction.

    The reduce step returns an instance of this class when it reduces the
    start symbol at end of input; the caller detects it with ``isinstance``
    and unwraps ``.value``.
    NOTE(review): presumably introduced so that a falsy parse result is
    still recognised as a finished parse -- confirm against the history.
    """
    def __init__(self, value):
        # The wrapped result of the final reduction, stored unchanged.
        self.value = value

 class Parser:
    def __init__(self, parser_conf):
        assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization"
@@ -56,7 +60,7 @@ class _Parser:
            res = self.callbacks[rule](s)

            if end and len(state_stack) == 1 and rule.origin == self.start_symbol:
                return res
                return FinalReduce(res)

            _action, new_state = get_action(rule.origin)
            assert _action == ACTION_SHIFT
@@ -85,9 +89,9 @@ class _Parser:
            _action, rule = get_action('$end')
            assert _action == 'reduce'
            res = reduce(*rule, end=True)
            if res:
            if isinstance(res, FinalReduce):
                assert state_stack == [self.init_state] and not value_stack, len(state_stack)
                return res
                return res.value



--- a/lark/parsers/resolve_ambig.py
+++ b/lark/parsers/resolve_ambig.py
@@ -9,56 +9,60 @@ from ..tree import Tree, Visitor_NoRecurse
 # Author: Erez Sh

 def _compare_rules(rule1, rule2):
    if rule1.origin != rule2.origin:
        if rule1.options and rule2.options:
            if rule1.options.priority is not None and rule2.options.priority is not None:
                assert rule1.options.priority != rule2.options.priority, "Priority is the same between both rules: %s == %s" % (rule1, rule2)
                return -compare(rule1.options.priority, rule2.options.priority)

        return 0

    c = compare( len(rule1.expansion), len(rule2.expansion))
    if rule1.origin.startswith('__'):   # XXX hack! We need to set priority in parser, not here
    c = -compare( len(rule1.expansion), len(rule2.expansion))
    if rule1.origin.startswith('__'):   # XXX hack! We should set priority in parser, not here
        c = -c
    return c

 def _compare_drv(tree1, tree2):
    if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):

 def _sum_priority(tree):
    p = 0

    for n in tree.iter_subtrees():
        try:
            return -compare(tree1, tree2)
        except TypeError:
            return 0
            p += n.rule.options.priority or 0
        except AttributeError:
            pass

    return p

 def _compare_priority(tree1, tree2):
    # NOTE(review): this looks like an unfinished stub. It calls
    # tree1.iter_subtrees() and discards the result, never touches tree2,
    # and implicitly returns None. No caller is visible in this view --
    # confirm whether it is used before relying on it or removing it.
    tree1.iter_subtrees()

 def _compare_drv(tree1, tree2):
    try:
        rule1, rule2 = tree1.rule, tree2.rule
    except AttributeError:
        # Probably trees that don't take part in this parse (better way to distinguish?)
        return -compare(tree1, tree2)
        # Probably non-trees, or user trees that weren't created by the parse (better way to distinguish?)
        return compare(tree1, tree2)

    assert tree1.data != '_ambig'
    assert tree2.data != '_ambig'

    # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
    #     when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
    #     computationally inefficient. So we handle it here.
    if tree1.data == '_ambig':
        _standard_resolve_ambig(tree1)
    if tree2.data == '_ambig':
        _standard_resolve_ambig(tree2)
    p1 = _sum_priority(tree1)
    p2 = _sum_priority(tree2)
    c = (p1 or p2) and compare(p1, p2)
    if c:
        return c

    c = _compare_rules(tree1.rule, tree2.rule)
    if c:
        return c

    # rules are "equal", so compare trees
    for t1, t2 in zip(tree1.children, tree2.children):
        c = _compare_drv(t1, t2)
        if c:
            return c
    if len(tree1.children) == len(tree2.children):
        for t1, t2 in zip(tree1.children, tree2.children):
            c = _compare_drv(t1, t2)
            if c:
                return c

    return compare(len(tree1.children), len(tree2.children))


 def _standard_resolve_ambig(tree):
    assert tree.data == '_ambig'
    best = min(tree.children, key=cmp_to_key(_compare_drv))
    key_f = cmp_to_key(_compare_drv)
    best = max(tree.children, key=key_f)
    assert best.data == 'drv'
    tree.set('drv', best.children)
    tree.rule = best.rule   # needed for applying callbacks
@@ -80,23 +84,12 @@ def _antiscore_sum_drv(tree):
    if not isinstance(tree, Tree):
        return 0

    # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
    #     when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
    #     computationally inefficient. So we handle it here.
    if tree.data == '_ambig':
        _antiscore_sum_resolve_ambig(tree)
    assert tree.data != '_ambig'

    try:
        priority = tree.rule.options.priority
    except AttributeError:
        # Probably trees that don't take part in this parse (better way to distinguish?)
        priority = None

    return (priority or 0) + sum(map(_antiscore_sum_drv, tree.children), 0)
    return _sum_priority(tree)

 def _antiscore_sum_resolve_ambig(tree):
    assert tree.data == '_ambig'

    best = min(tree.children, key=_antiscore_sum_drv)
    assert best.data == 'drv'
    tree.set('drv', best.children)
--- a/lark/parsers/xearley.py
+++ b/lark/parsers/xearley.py
@@ -28,11 +28,12 @@ from .grammar_analysis import GrammarAnalyzer
 from .earley import ApplyCallbacks, Item, Column

 class Parser:
    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=()):
    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False):
        self.analysis = GrammarAnalyzer(rules, start_symbol)
        self.start_symbol = start_symbol
        self.resolve_ambiguity = resolve_ambiguity
        self.ignore = list(ignore)
        self.predict_all = predict_all


        self.postprocess = {}
@@ -107,9 +108,10 @@ class Parser:
                    for j in range(1, len(s)):
                        m = item.expect.match(s[:-j])
                        if m:
                            delayed_matches[m.end()].append(item.advance(m.group(0)))
                            t = Token(item.expect.name, m.group(0), i, text_line, text_column)
                            delayed_matches[i+m.end()].append(item.advance(t))

            next_set = Column(i+1, self.FIRST)
            next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
            next_set.add(delayed_matches[i+1])
            del delayed_matches[i+1]    # No longer needed, so unburden memory

@@ -119,7 +121,7 @@ class Parser:
            return next_set

        # Main loop starts
        column0 = Column(0, self.FIRST)
        column0 = Column(0, self.FIRST, predict_all=self.predict_all)
        column0.add(predict(start_symbol, column0))

        column = column0
--- a/lark/tree.py
+++ b/lark/tree.py
@@ -67,17 +67,26 @@ class Tree(object):
                    yield c

    def iter_subtrees(self):
        # TODO: Re-write as a more efficient version

        visited = set()
        q = [self]

        l = []
        while q:
            subtree = q.pop()
            l.append( subtree )
            if id(subtree) in visited:
                continue    # already been here from another branch
            visited.add(id(subtree))
            yield subtree
            q += [c for c in subtree.children if isinstance(c, Tree)]

        seen = set()
        for x in reversed(l):
            if id(x) not in seen:
                yield x
                seen.add(id(x))


    def __deepcopy__(self, memo):
        return type(self)(self.data, deepcopy(self.children, memo))
@@ -100,7 +109,7 @@ class Transformer(object):
            if isinstance(c, Tree):
                try:
                    items.append(self.transform(c))
                except Erase:
                except Discard:
                    pass
        try:
            f = self._get_func(tree.data)
@@ -116,7 +125,7 @@ class Transformer(object):
        return TransformerChain(self, other)


 class Erase(Exception):
 class Discard(Exception):
    pass

 class TransformerChain(object):
@@ -156,7 +165,7 @@ class Visitor_NoRecurse(Visitor):
    def visit(self, tree):
        subtrees = list(tree.iter_subtrees())

        for subtree in reversed(subtrees):
        for subtree in (subtrees):
            getattr(self, subtree.data, self.__default__)(subtree)
        return tree

@@ -174,13 +183,13 @@ class Transformer_NoRecurse(Transformer):
            else:
                return f(t)

        for subtree in reversed(subtrees):
        for subtree in (subtrees):
            children = []
            for c in subtree.children:
                if isinstance(c, Tree):
                    try:
                        children.append(_t(c))
                    except Erase:
                    except Discard:
                        pass
                else:
                    children.append(c)
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -711,6 +711,19 @@ def _make_parser_test(LEXER, PARSER):
                            """)
            x = g.parse('AB')

        @unittest.skipIf(LEXER == None, "Scanless can't handle regexps")
        def test_regex_quote(self):
            # Regexp terminals that contain quote characters must match
            # single- and double-quoted input, yielding the text as the
            # only child of the start rule.
            grammar = r"""
            start: SINGLE_QUOTED_STRING | DOUBLE_QUOTED_STRING
            SINGLE_QUOTED_STRING  : /'[^']*'/
            DOUBLE_QUOTED_STRING  : /"[^"]*"/
            """

            parser = _Lark(grammar)
            for text in ('"hello"', "'hello'"):
                self.assertEqual(parser.parse(text).children, [text])


        def test_lexer_token_limit(self):
            "Python has a stupid limit of 100 groups in a regular expression. Test that we handle this limitation"
            tokens = {'A%d'%i:'"%d"'%i for i in range(300)}