| @@ -14,8 +14,9 @@ class Grammar: | |||
| class GrammarBuilder: | |||
| global_keep_all_tokens: bool | |||
| import_paths: List[Union[str, Callable]] | |||
| used_files: Dict[str, str] | |||
| def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None) -> None: ... | |||
| def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None, used_files: Dict[str, str]=None) -> None: ... | |||
| def load_grammar(self, grammar_text: str, grammar_name: str = ..., mangle: Callable[[str], str] = None) -> None: ... | |||
| @@ -8,7 +8,7 @@ from io import open | |||
| import tempfile | |||
| from warnings import warn | |||
| from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger, ABC, abstractmethod | |||
| from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger, ABC, abstractmethod, verify_used_files | |||
| from .load_grammar import load_grammar, FromPackageLoader, Grammar | |||
| from .tree import Tree | |||
| from .common import LexerConf, ParserConf | |||
| @@ -277,14 +277,15 @@ class Lark(Serialize): | |||
| options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) | |||
| from . import __version__ | |||
| s = grammar + options_str + __version__ + str(sys.version_info[:2]) | |||
| cache_md5 = hashlib.md5(s.encode()).hexdigest() | |||
| cache_md5 = hashlib.md5(s.encode('utf8')).hexdigest() | |||
| if isinstance(self.options.cache, STRING_TYPE): | |||
| cache_fn = self.options.cache | |||
| else: | |||
| if self.options.cache is not True: | |||
| raise ConfigurationError("cache argument must be bool or str") | |||
| cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % (cache_md5, *sys.version_info[:2]) | |||
| # Python2.7 doesn't support * syntax in tuples | |||
| cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % ((cache_md5,) + sys.version_info[:2]) | |||
| if FS.exists(cache_fn): | |||
| logger.debug('Loading grammar from cache: %s', cache_fn) | |||
| @@ -293,16 +294,23 @@ class Lark(Serialize): | |||
| del options[name] | |||
| with FS.open(cache_fn, 'rb') as f: | |||
| file_md5 = f.readline().rstrip(b'\n') | |||
| if file_md5 == cache_md5.encode(): | |||
| try: | |||
| self._load(f, **options) | |||
| except Exception: | |||
| raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn) | |||
| return | |||
| if file_md5 == cache_md5.encode('utf8'): | |||
| if (not self.options.safe_cache) or verify_used_files(pickle.load(f)): | |||
| old_options = self.options | |||
| try: | |||
| self._load(f, **options) | |||
| except Exception: # We should probably narrow down which errors we catch here. | |||
| logger.exception("Failed to load Lark from cache: %r. We will try to carry on." % cache_fn) | |||
| # In theory, the Lark instance might have been messed up by the call to `_load`. | |||
| # In practice the only relevant thing that might have been overridden should be `options` | |||
| self.options = old_options | |||
| else: | |||
| return | |||
| # Parse the grammar file and compose the grammars | |||
| self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) | |||
| self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) | |||
| else: | |||
| assert isinstance(grammar, Grammar) | |||
| self.grammar = grammar | |||
| @@ -387,7 +395,9 @@ class Lark(Serialize): | |||
| if cache_fn: | |||
| logger.debug('Saving grammar to cache: %s', cache_fn) | |||
| with FS.open(cache_fn, 'wb') as f: | |||
| f.write(b'%s\n' % cache_md5.encode()) | |||
| f.write(b'%s\n' % cache_md5.encode('utf8')) | |||
| if self.options.safe_cache: | |||
| pickle.dump(used_files, f) | |||
| self.save(f) | |||
| if __doc__: | |||
| @@ -1,5 +1,5 @@ | |||
| """Parses and creates Grammar objects""" | |||
| import hashlib | |||
| import os.path | |||
| import sys | |||
| from copy import copy, deepcopy | |||
| @@ -931,9 +931,10 @@ def _mangle_exp(exp, mangle): | |||
| class GrammarBuilder: | |||
| def __init__(self, global_keep_all_tokens=False, import_paths=None): | |||
| def __init__(self, global_keep_all_tokens=False, import_paths=None, used_files=None): | |||
| self.global_keep_all_tokens = global_keep_all_tokens | |||
| self.import_paths = import_paths or [] | |||
| self.used_files = used_files or {} | |||
| self._definitions = {} | |||
| self._ignore_names = [] | |||
| @@ -1150,10 +1151,14 @@ class GrammarBuilder: | |||
| joined_path = os.path.join(source, grammar_path) | |||
| with open(joined_path, encoding='utf8') as f: | |||
| text = f.read() | |||
| h = hashlib.md5(text.encode('utf8')).hexdigest() | |||
| if self.used_files.get(joined_path, h) != h: | |||
| raise RuntimeError("Grammar file was changed during importing") | |||
| self.used_files[joined_path] = h | |||
| except IOError: | |||
| continue | |||
| else: | |||
| gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths) | |||
| gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths, self.used_files) | |||
| gb.load_grammar(text, joined_path, mangle) | |||
| gb._remove_unused(map(mangle, aliases)) | |||
| for name in gb._definitions: | |||
| @@ -1213,4 +1218,4 @@ class GrammarBuilder: | |||
| def load_grammar(grammar, source, import_paths, global_keep_all_tokens): | |||
| builder = GrammarBuilder(global_keep_all_tokens, import_paths) | |||
| builder.load_grammar(grammar, source) | |||
| return builder.build() | |||
| return builder.build(), builder.used_files | |||
| @@ -1,3 +1,4 @@ | |||
| import hashlib | |||
| import unicodedata | |||
| import os | |||
| from functools import reduce | |||
| @@ -6,6 +7,7 @@ from collections import deque | |||
| ###{standalone | |||
| import sys, re | |||
| import logging | |||
| from io import open | |||
| logger = logging.getLogger("lark") | |||
| logger.addHandler(logging.StreamHandler()) | |||
| # Set to highest level, since we have some warnings amongst the code | |||
| @@ -296,6 +298,16 @@ class FS: | |||
| else: | |||
| return open(name, mode, **kwargs) | |||
| def verify_used_files(file_hashes): | |||
| for path, old in file_hashes.items(): | |||
| with open(path, encoding='utf8') as f: | |||
| text = f.read() | |||
| current = hashlib.md5(text.encode()).hexdigest() | |||
| if old != current: | |||
| logger.info("File %r changed, rebuilding Parser" % path) | |||
| return False | |||
| return True | |||
| def isascii(s): | |||
| """ str.isascii only exists in python3.7+ """ | |||
| @@ -58,71 +58,84 @@ def append_zero(t): | |||
| class TestCache(TestCase): | |||
| g = '''start: "a"''' | |||
| def setUp(self): | |||
| pass | |||
| self.fs = lark_module.FS | |||
| self.mock_fs = MockFS() | |||
| lark_module.FS = self.mock_fs | |||
| def tearDown(self): | |||
| self.mock_fs.files = {} | |||
| lark_module.FS = self.fs | |||
| def test_simple(self): | |||
| g = '''start: "a"''' | |||
| fn = "bla" | |||
| fs = lark_module.FS | |||
| mock_fs = MockFS() | |||
| try: | |||
| lark_module.FS = mock_fs | |||
| Lark(g, parser='lalr', cache=fn) | |||
| assert fn in mock_fs.files | |||
| parser = Lark(g, parser='lalr', cache=fn) | |||
| assert parser.parse('a') == Tree('start', []) | |||
| mock_fs.files = {} | |||
| assert len(mock_fs.files) == 0 | |||
| Lark(g, parser='lalr', cache=True) | |||
| assert len(mock_fs.files) == 1 | |||
| parser = Lark(g, parser='lalr', cache=True) | |||
| assert parser.parse('a') == Tree('start', []) | |||
| parser = Lark(g + ' "b"', parser='lalr', cache=True) | |||
| assert len(mock_fs.files) == 2 | |||
| assert parser.parse('ab') == Tree('start', []) | |||
| parser = Lark(g, parser='lalr', cache=True) | |||
| assert parser.parse('a') == Tree('start', []) | |||
| # Test with custom lexer | |||
| mock_fs.files = {} | |||
| parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True) | |||
| parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True) | |||
| assert len(mock_fs.files) == 1 | |||
| assert parser.parse('a') == Tree('start', []) | |||
| # Test options persistence | |||
| mock_fs.files = {} | |||
| Lark(g, parser="lalr", debug=True, cache=True) | |||
| parser = Lark(g, parser="lalr", debug=True, cache=True) | |||
| assert parser.options.options['debug'] | |||
| # Test inline transformer (tree-less) & lexer_callbacks | |||
| mock_fs.files = {} | |||
| g = """ | |||
| start: add+ | |||
| add: NUM "+" NUM | |||
| NUM: /\d+/ | |||
| %ignore " " | |||
| """ | |||
| text = "1+2 3+4" | |||
| expected = Tree('start', [30, 70]) | |||
| parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero}) | |||
| res0 = parser.parse(text) | |||
| parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero}) | |||
| assert len(mock_fs.files) == 1 | |||
| res1 = parser.parse(text) | |||
| res2 = TestT().transform(Lark(g, parser="lalr", cache=True, lexer_callbacks={'NUM': append_zero}).parse(text)) | |||
| assert res0 == res1 == res2 == expected | |||
| finally: | |||
| lark_module.FS = fs | |||
| Lark(self.g, parser='lalr', cache=fn) | |||
| assert fn in self.mock_fs.files | |||
| parser = Lark(self.g, parser='lalr', cache=fn) | |||
| assert parser.parse('a') == Tree('start', []) | |||
| def test_automatic_naming(self): | |||
| assert len(self.mock_fs.files) == 0 | |||
| Lark(self.g, parser='lalr', cache=True) | |||
| assert len(self.mock_fs.files) == 1 | |||
| parser = Lark(self.g, parser='lalr', cache=True) | |||
| assert parser.parse('a') == Tree('start', []) | |||
| parser = Lark(self.g + ' "b"', parser='lalr', cache=True) | |||
| assert len(self.mock_fs.files) == 2 | |||
| assert parser.parse('ab') == Tree('start', []) | |||
| parser = Lark(self.g, parser='lalr', cache=True) | |||
| assert parser.parse('a') == Tree('start', []) | |||
| def test_custom_lexer(self): | |||
| parser = Lark(self.g, parser='lalr', lexer=CustomLexer, cache=True) | |||
| parser = Lark(self.g, parser='lalr', lexer=CustomLexer, cache=True) | |||
| assert len(self.mock_fs.files) == 1 | |||
| assert parser.parse('a') == Tree('start', []) | |||
| def test_options(self): | |||
| # Test options persistence | |||
| Lark(self.g, parser="lalr", debug=True, cache=True) | |||
| parser = Lark(self.g, parser="lalr", debug=True, cache=True) | |||
| assert parser.options.options['debug'] | |||
| def test_inline(self): | |||
| # Test inline transformer (tree-less) & lexer_callbacks | |||
| g = """ | |||
| start: add+ | |||
| add: NUM "+" NUM | |||
| NUM: /\d+/ | |||
| %ignore " " | |||
| """ | |||
| text = "1+2 3+4" | |||
| expected = Tree('start', [30, 70]) | |||
| parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero}) | |||
| res0 = parser.parse(text) | |||
| parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero}) | |||
| assert len(self.mock_fs.files) == 1 | |||
| res1 = parser.parse(text) | |||
| res2 = TestT().transform(Lark(g, parser="lalr", cache=True, lexer_callbacks={'NUM': append_zero}).parse(text)) | |||
| assert res0 == res1 == res2 == expected | |||
| def test_imports(self): | |||
| g = """ | |||
| %import .grammars.ab (startab, expr) | |||
| """ | |||
| parser = Lark(g, parser='lalr', start='startab', cache=True) | |||
| assert len(self.mock_fs.files) == 1 | |||
| parser = Lark(g, parser='lalr', start='startab', cache=True) | |||
| assert len(self.mock_fs.files) == 1 | |||
| res = parser.parse("ab") | |||
| self.assertEqual(res, Tree('startab', [Tree('expr', ['a', 'b'])])) | |||
| if __name__ == '__main__': | |||