
Merge branch 'master' into true_lalr2

tag: gm/2021-09-23T00Z/github.com--lark-parser-lark/0.8.0
Erez Shinan, 5 years ago
commit d10182c253
16 changed files with 176 additions and 54 deletions:

  1. .gitignore                     (+1, -0)
  2. README.md                      (+2, -2)
  3. lark/__init__.py               (+1, -1)
  4. lark/lexer.py                  (+8, -4)
  5. lark/load_grammar.py           (+19, -11)
  6. lark/parser_frontends.py       (+5, -2)
  7. lark/parsers/earley.py         (+6, -1)
  8. lark/parsers/earley_forest.py  (+1, -1)
  9. lark/parsers/xearley.py        (+2, -2)
  10. lark/tools/serialize.py       (+39, -0)
  11. lark/tree.py                  (+25, -24)
  12. lark/utils.py                 (+2, -2)
  13. readthedocs.yml               (+10, -0)
  14. tests/__main__.py             (+1, -0)
  15. tests/test_parser.py          (+37, -4)
  16. tests/test_trees.py           (+17, -0)

.gitignore  (+1, -0)

@@ -4,6 +4,7 @@
 /lark_parser.egg-info/**
 tags
 .vscode
+.idea
 .ropeproject
 .cache
 /dist


README.md  (+2, -2)

@@ -72,7 +72,7 @@ Lark is great at handling ambiguity. Let's parse the phrase "fruit flies like ba

 ![fruitflies.png](examples/fruitflies.png)

-See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
+See more [examples here](https://github.com/lark-parser/lark/tree/master/examples)

@@ -95,7 +95,7 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples)
 - Extensive test suite [![codecov](https://codecov.io/gh/erezsh/lark/branch/master/graph/badge.svg)](https://codecov.io/gh/erezsh/lark)
 - And much more!

-See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/Features)
+See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features/)


 ### Comparison to other libraries


lark/__init__.py  (+1, -1)

@@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une
 from .lexer import Token
 from .lark import Lark

-__version__ = "0.7.2"
+__version__ = "0.7.4"

lark/lexer.py  (+8, -4)

@@ -8,7 +8,6 @@ from .exceptions import UnexpectedCharacters, LexError
 ###{standalone

 class Pattern(Serialize):
-    __serialize_fields__ = 'value', 'flags'

     def __init__(self, value, flags=()):
         self.value = value
@@ -41,6 +40,8 @@ class Pattern(Serialize):


 class PatternStr(Pattern):
+    __serialize_fields__ = 'value', 'flags'
+
     type = "str"
     def to_regexp(self):
@@ -52,6 +53,8 @@ class PatternStr(Pattern):
         max_width = min_width

 class PatternRE(Pattern):
+    __serialize_fields__ = 'value', 'flags', '_width'
+
     type = "re"

     def to_regexp(self):
@@ -98,7 +101,7 @@ class Token(Str):

         self.type = type_
         self.pos_in_stream = pos_in_stream
-        self.value = Str(value)
+        self.value = value
         self.line = line
         self.column = column
         self.end_line = end_line
@@ -265,13 +268,14 @@ def build_mres(terminals, match_whole=False):
     return _build_mres(terminals, len(terminals), match_whole)

 def _regexp_has_newline(r):
-    """Expressions that may indicate newlines in a regexp:
+    r"""Expressions that may indicate newlines in a regexp:
     - newlines (\n)
     - escaped newline (\\n)
     - anything but ([^...])
     - any-char (.) when the flag (?s) exists
+    - spaces (\s)
     """
-    return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r)
+    return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)

 class Lexer(object):
     """Lexer interface

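The added `\\s` check is the substantive fix here: a terminal such as /d\s/ can consume a newline, so the lexer must recompute line counts for it (exercised by test_lexer_detect_newline_tokens in tests/test_parser.py below). A standalone sketch of the updated predicate, with the logic copied verbatim from the hunk:

    def _regexp_has_newline(r):
        r"""True if the regexp may match a newline: \n, \\n, \s, [^...], or . under (?s)."""
        return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)

    assert _regexp_has_newline(r'd\s')      # \s can match '\n' -- the newly detected case
    assert _regexp_has_newline('[^a-z]')    # a negated class may match '\n'
    assert not _regexp_has_newline('go')    # a plain literal cannot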

lark/load_grammar.py  (+19, -11)

@@ -12,7 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR_TraditionalLexer
 from .common import LexerConf, ParserConf
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
-from .utils import classify, suppress, dedup_list
+from .utils import classify, suppress, dedup_list, Str
 from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken

 from .tree import Tree, SlottedTree as ST
@@ -351,7 +351,10 @@ def _fix_escaping(s):
     for n in i:
         w += n
         if n == '\\':
-            n2 = next(i)
+            try:
+                n2 = next(i)
+            except StopIteration:
+                raise ValueError("Literal ended unexpectedly (bad escaping): `%r`" % s)
             if n2 == '\\':
                 w += '\\\\'
             elif n2 not in 'uxnftr':
@@ -451,9 +454,9 @@ class PrepareSymbols(Transformer_InPlace):
         if isinstance(v, Tree):
             return v
         elif v.type == 'RULE':
-            return NonTerminal(v.value)
+            return NonTerminal(Str(v.value))
         elif v.type == 'TERMINAL':
-            return Terminal(v.value, filter_out=v.startswith('_'))
+            return Terminal(Str(v.value), filter_out=v.startswith('_'))
         assert False

 def _choice_of_rules(rules):
@@ -511,12 +514,12 @@ class Grammar:

         simplify_rule = SimplifyRule_Visitor()
         compiled_rules = []
-        for i, rule_content in enumerate(rules):
+        for rule_content in rules:
             name, tree, options = rule_content
             simplify_rule.visit(tree)
             expansions = rule_tree_to_text.transform(tree)

-            for expansion, alias in expansions:
+            for i, (expansion, alias) in enumerate(expansions):
                 if alias and name.startswith('_'):
                     raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
@@ -538,7 +541,7 @@ class Grammar:
         for dups in duplicates.values():
             if len(dups) > 1:
                 if dups[0].expansion:
-                    raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates))
+                    raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" % ''.join('\n * %s' % i for i in dups))

                 # Empty rule; assert all other attributes are equal
                 assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups)
@@ -605,7 +608,9 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases):
            _, tree, _ = imported_rules[symbol]
        except KeyError:
            raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace))
-       return tree.scan_values(lambda x: x.type in ('RULE', 'TERMINAL'))
+
+       return _find_used_symbols(tree)
+

    def get_namespace_name(name):
        try:
@@ -682,6 +687,11 @@ class PrepareGrammar(Transformer_InPlace):
         return name


+def _find_used_symbols(tree):
+    assert tree.data == 'expansions'
+    return {t for x in tree.find_data('expansion')
+            for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
+
 class GrammarLoader:
     def __init__(self):
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
@@ -843,9 +853,7 @@ class GrammarLoader:
            rule_names.add(name)

        for name, expansions, _o in rules:
-           used_symbols = {t for x in expansions.find_data('expansion')
-                           for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
-           for sym in used_symbols:
+           for sym in _find_used_symbols(expansions):
                if sym.type == 'TERMINAL':
                    if sym not in terminal_names:
                        raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))

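The reworded "Rules defined twice" error deserves a note: optionals ([] and ?) are expanded into multiple alternatives behind the scenes, so a rule can collide with itself without any visible duplication in the source grammar. A hedged illustration (this exact grammar is mine, not from the commit; it may trigger the error because `"x"?` expands into an "x" branch and an empty branch):

    from lark import Lark

    try:
        Lark('start: "x"? | "x"')   # the expanded "x" branch collides with the explicit one
    except Exception as e:
        print(e)   # Rules defined twice: ... (Might happen due to colliding expansion of optionals: [] or ?)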

lark/parser_frontends.py  (+5, -2)

@@ -118,7 +118,7 @@ class LALR_ContextualLexer(LALR_WithLexer):

 class LALR_CustomLexer(LALR_WithLexer):
     def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
-        self.lexer = lexer_cls(self.lexer_conf)
+        self.lexer = lexer_cls(lexer_conf)
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
         WithLexer.__init__(self, lexer_conf, parser_conf, options)
@@ -139,7 +139,8 @@ class Earley(WithLexer):
         self.init_traditional_lexer()

         resolve_ambiguity = options.ambiguity == 'resolve'
-        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity)
+        debug = options.debug if options else False
+        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug)

     def match(self, term, token):
         return term.name == token.type
@@ -152,10 +153,12 @@ class XEarley(_ParserFrontend):

         self._prepare_match(lexer_conf)
         resolve_ambiguity = options.ambiguity == 'resolve'
+        debug = options.debug if options else False
         self.parser = xearley.Parser(parser_conf,
                                      self.match,
                                      ignore=lexer_conf.ignore,
                                      resolve_ambiguity=resolve_ambiguity,
+                                     debug=debug,
                                      **kw
                                      )

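The one-character fix in LALR_CustomLexer (lexer_conf instead of self.lexer_conf, which WithLexer.__init__ has not set yet at that point) is what makes passing a lexer class usable at all. A usage sketch, mirroring the pass-through CustomLexer added to tests/test_parser.py below (the class name here is hypothetical):

    from lark import Lark
    from lark.lexer import Lexer, TraditionalLexer

    class PassThroughLexer(Lexer):    # same shape as the test suite's CustomLexer
        def __init__(self, lexer_conf):
            self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore,
                                          user_callbacks=lexer_conf.callbacks)
        def lex(self, *args, **kwargs):
            return self.lexer.lex(*args, **kwargs)

    parser = Lark('start: "a"+', parser='lalr', lexer=PassThroughLexer)
    print(parser.parse('aaa'))   # Tree(start, []) -- anonymous string tokens are filtered out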


lark/parsers/earley.py  (+6, -1)

@@ -20,10 +20,11 @@ from .earley_common import Item, TransitiveItem
 from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor

 class Parser:
-    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True):
+    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, debug=False):
         analysis = GrammarAnalyzer(parser_conf)
         self.parser_conf = parser_conf
         self.resolve_ambiguity = resolve_ambiguity
+        self.debug = debug

         self.FIRST = analysis.FIRST
         self.NULLABLE = analysis.NULLABLE
@@ -296,6 +297,10 @@ class Parser:
         # symbol should have been completed in the last step of the Earley cycle, and will be in
         # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
         solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
+        if self.debug:
+            from .earley_forest import ForestToPyDotVisitor
+            debug_walker = ForestToPyDotVisitor()
+            debug_walker.visit(solutions[0], "sppf.png")

         if not solutions:
             expected_tokens = [t.expect for t in to_scan]

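With this wiring plus the frontend changes above, debug=True on the Lark constructor now reaches the Earley engine, which renders the first solution's SPPF to "sppf.png" via the pydot-based forest visitor. A hedged usage sketch (assumes the optional pydot dependency is installed):

    from lark import Lark

    parser = Lark('start: "a"+', parser='earley', debug=True)
    parser.parse('aaa')   # side effect: writes sppf.png to the working directory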

lark/parsers/earley_forest.py  (+1, -1)

@@ -122,7 +122,7 @@ class PackedNode(ForestNode):
         ambiguously. Hence, we use the sort order to identify
         the order in which ambiguous children should be considered.
         """
-        return self.is_empty, -self.priority, -self.rule.order
+        return self.is_empty, -self.priority, self.rule.order

     def __iter__(self):
         return iter([self.left, self.right])

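The sign flip means packed nodes now sort by ascending rule order after the emptiness and priority comparisons, so earlier-defined alternatives are considered first, as the docstring describes. A tiny illustration of how the tuple key sorts:

    # (is_empty, -priority, rule.order) -- lexicographically smaller tuples sort first:
    keys = [(False, 0, 2), (False, 0, 0), (True, 0, 1)]
    print(sorted(keys))   # [(False, 0, 0), (False, 0, 2), (True, 0, 1)]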

lark/parsers/xearley.py  (+2, -2)

@@ -24,8 +24,8 @@ from .earley_forest import SymbolNode


 class Parser(BaseParser):
-    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False):
-        BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity)
+    def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False, debug=False):
+        BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity, debug)
         self.ignore = [Terminal(t) for t in ignore]
         self.complete_lex = complete_lex



lark/tools/serialize.py  (+39, -0, new file)

@@ -0,0 +1,39 @@
import codecs
import sys
import json

from lark import Lark
from lark.grammar import RuleOptions, Rule
from lark.lexer import TerminalDef

import argparse

argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize') #description='''Lark Serialization Tool -- Stores Lark's internal state & LALR analysis as a convenient JSON file''')

argparser.add_argument('grammar_file', type=argparse.FileType('r'), help='A valid .lark file')
argparser.add_argument('-o', '--out', type=argparse.FileType('w'), default=sys.stdout, help='json file path to create (default=stdout)')
argparser.add_argument('-s', '--start', default='start', help='start symbol (default="start")', nargs='+')
argparser.add_argument('-l', '--lexer', default='standard', choices=['standard', 'contextual'], help='lexer type (default="standard")')


def serialize(infile, outfile, lexer, start):
    lark_inst = Lark(infile, parser="lalr", lexer=lexer, start=start)    # TODO contextual

    data, memo = lark_inst.memo_serialize([TerminalDef, Rule])
    outfile.write('{\n')
    outfile.write('  "data": %s,\n' % json.dumps(data))
    outfile.write('  "memo": %s\n' % json.dumps(memo))
    outfile.write('}\n')


def main():
    if len(sys.argv) == 1 or '-h' in sys.argv or '--help' in sys.argv:
        print("Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file")
        print("")
        argparser.print_help()
    else:
        args = argparser.parse_args()
        serialize(args.grammar_file, args.out, args.lexer, args.start)

if __name__ == '__main__':
    main()

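A hedged usage sketch for the new tool (the grammar filename is illustrative). The output is a JSON object with exactly the two keys serialize() writes:

    # $ python -m lark.tools.serialize my_grammar.lark -o parser.json
    import json

    with open('parser.json') as f:
        payload = json.load(f)
    print(sorted(payload))   # ['data', 'memo']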
lark/tree.py  (+25, -24)

@@ -56,30 +56,6 @@ class Tree(object):

     def __hash__(self):
         return hash((self.data, tuple(self.children)))
-###}
-
-    def expand_kids_by_index(self, *indices):
-        "Expand (inline) children at the given indices"
-        for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
-            kid = self.children[i]
-            self.children[i:i+1] = kid.children
-
-    def find_pred(self, pred):
-        "Find all nodes where pred(tree) == True"
-        return filter(pred, self.iter_subtrees())
-
-    def find_data(self, data):
-        "Find all nodes where tree.data == data"
-        return self.find_pred(lambda t: t.data == data)
-
-    def scan_values(self, pred):
-        for c in self.children:
-            if isinstance(c, Tree):
-                for t in c.scan_values(pred):
-                    yield t
-            else:
-                if pred(c):
-                    yield c

     def iter_subtrees(self):
         # TODO: Re-write as a more efficient version
@@ -102,6 +78,31 @@ class Tree(object):
                 yield x
                 seen.add(id(x))

+    def find_pred(self, pred):
+        "Find all nodes where pred(tree) == True"
+        return filter(pred, self.iter_subtrees())
+
+    def find_data(self, data):
+        "Find all nodes where tree.data == data"
+        return self.find_pred(lambda t: t.data == data)
+
+###}
+
+    def expand_kids_by_index(self, *indices):
+        "Expand (inline) children at the given indices"
+        for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices
+            kid = self.children[i]
+            self.children[i:i+1] = kid.children
+
+    def scan_values(self, pred):
+        for c in self.children:
+            if isinstance(c, Tree):
+                for t in c.scan_values(pred):
+                    yield t
+            else:
+                if pred(c):
+                    yield c
+
     def iter_subtrees_topdown(self):
         stack = [self]
         while stack:

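No behaviour changes here: find_pred and find_data move inside the ###{standalone block (presumably so generated standalone parsers ship them), while expand_kids_by_index and scan_values move out of it. A quick sketch of the relocated helpers:

    from lark import Tree, Token

    t = Tree('start', [Tree('a', [Token('WORD', 'x')]), Tree('b', [Token('WORD', 'y')])])
    assert [s.data for s in t.find_data('a')] == ['a']        # now inside the standalone block
    assert list(t.scan_values(lambda v: v == 'y')) == ['y']   # now outside it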

lark/utils.py  (+2, -2)

@@ -160,7 +160,7 @@ def smart_decorator(f, create_decorator):

     elif isinstance(f, partial):
         # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445
-        return create_decorator(f.__func__, True)
+        return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True))

     else:
         return create_decorator(f.__func__.__call__, True)
@@ -172,7 +172,7 @@ import sre_parse
 import sre_constants
 def get_regexp_width(regexp):
     try:
-        return sre_parse.parse(regexp).getwidth()
+        return [int(x) for x in sre_parse.parse(regexp).getwidth()]
     except sre_constants.error:
         raise ValueError(regexp)

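The old partial branch assumed f.__func__, which functools.partial objects don't have. The new code wraps the partial's underlying function and discards the bound self that smart_decorator's callers prepend (this is what enables the test_partial case in tests/test_trees.py below). A minimal sketch of the mechanism; create_decorator here is a stand-in, not the real factory:

    from functools import partial, wraps

    def create_decorator(f, with_self):          # stand-in for the real factory
        def g(*args, **kw):
            return f(*args, **kw)
        return g

    p = partial(lambda s: s + '!')
    wrapped = wraps(p.func)(create_decorator(lambda *args, **kw: p(*args[1:], **kw), True))
    print(wrapped(object(), 'test'))   # 'test!' -- args[0] (the would-be self) is dropped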

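getwidth() returns a (min, max) pair; converting it to a list of plain ints presumably keeps the value round-trippable through the new JSON serialization. For example:

    import sre_parse

    print([int(x) for x in sre_parse.parse(r'a\d?').getwidth()])   # [1, 2]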

readthedocs.yml  (+10, -0, new file)

@@ -0,0 +1,10 @@
version: 2

mkdocs:
  configuration: mkdocs.yml
  fail_on_warning: false

formats: all

python:
  version: 3.5

tests/__main__.py  (+1, -0)

@@ -21,6 +21,7 @@ from .test_parser import (
         TestCykStandard,
         TestLalrContextual,
         TestEarleyDynamic,
+        TestLalrCustom,

         # TestFullEarleyStandard,
         TestFullEarleyDynamic,


tests/test_parser.py  (+37, -4)

@@ -22,7 +22,7 @@ from lark.exceptions import GrammarError, ParseError, UnexpectedToken, Unexpecte
 from lark.tree import Tree
 from lark.visitors import Transformer, Transformer_InPlace, v_args
 from lark.grammar import Rule
-from lark.lexer import TerminalDef
+from lark.lexer import TerminalDef, Lexer, TraditionalLexer

 __path__ = os.path.dirname(__file__)
 def _read(n, *args):
@@ -431,12 +431,22 @@ def _make_full_earley_test(LEXER):
     _TestFullEarley.__name__ = _NAME
     globals()[_NAME] = _TestFullEarley

+class CustomLexer(Lexer):
+    """
+    Purpose of this custom lexer is to test the integration,
+    so it uses the traditionalparser as implementation without custom lexing behaviour.
+    """
+    def __init__(self, lexer_conf):
+        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
+    def lex(self, *args, **kwargs):
+        return self.lexer.lex(*args, **kwargs)
+
 def _make_parser_test(LEXER, PARSER):
+    lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER
     def _Lark(grammar, **kwargs):
-        return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
+        return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
     def _Lark_open(gfilename, **kwargs):
-        return Lark.open(gfilename, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs)
+        return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs)
 class _TestParser(unittest.TestCase):
     def test_basic1(self):
         g = _Lark("""start: a+ b a* "b" a*
@@ -1532,7 +1542,7 @@ def _make_parser_test(LEXER, PARSER):
         parser = _Lark(grammar)


-    @unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)")
+    @unittest.skipIf(PARSER!='lalr' or LEXER=='custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)")
     def test_serialize(self):
         grammar = """
         start: _ANY b "C"
@@ -1558,6 +1568,28 @@ def _make_parser_test(LEXER, PARSER):
         self.assertEqual(parser.parse('xa', 'a'), Tree('a', []))
         self.assertEqual(parser.parse('xb', 'b'), Tree('b', []))

+    def test_lexer_detect_newline_tokens(self):
+        # Detect newlines in regular tokens
+        g = _Lark(r"""start: "go" tail*
+        !tail : SA "@" | SB "@" | SC "@" | SD "@"
+        SA : "a" /\n/
+        SB : /b./s
+        SC : "c" /[^a-z]/
+        SD : "d" /\s/
+        """)
+        a,b,c,d = [x.children[1] for x in g.parse('goa\n@b\n@c\n@d\n@').children]
+        self.assertEqual(a.line, 2)
+        self.assertEqual(b.line, 3)
+        self.assertEqual(c.line, 4)
+        self.assertEqual(d.line, 5)
+
+        # Detect newlines in ignored tokens
+        for re in ['/\\n/', '/[^a-z]/', '/\\s/']:
+            g = _Lark('''!start: "a" "a"
+                         %ignore {}'''.format(re))
+            a, b = g.parse('a\na').children
+            self.assertEqual(a.line, 1)
+            self.assertEqual(b.line, 2)
+

 _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
@@ -1572,6 +1604,7 @@ _TO_TEST = [
     ('dynamic_complete', 'earley'),
     ('standard', 'lalr'),
     ('contextual', 'lalr'),
+    ('custom', 'lalr'),
     # (None, 'earley'),
 ]



tests/test_trees.py  (+17, -0)

@@ -4,6 +4,7 @@ import unittest
 from unittest import TestCase
 import copy
 import pickle
+import functools

 from lark.tree import Tree
 from lark.visitors import Transformer, Interpreter, visit_children_decor, v_args, Discard
@@ -146,6 +147,22 @@ class TestTrees(TestCase):
         res = T().transform(t)
         self.assertEqual(res, 2.9)

+    def test_partial(self):
+
+        tree = Tree("start", [Tree("a", ["test1"]), Tree("b", ["test2"])])
+
+        def test(prefix, s, postfix):
+            return prefix + s.upper() + postfix
+
+        @v_args(inline=True)
+        class T(Transformer):
+            a = functools.partial(test, "@", postfix="!")
+            b = functools.partial(lambda s: s + "!")
+
+        res = T().transform(tree)
+        assert res.children == ["@TEST1!", "test2!"]
+
+
     def test_discard(self):
         class MyTransformer(Transformer):
             def a(self, args):

