| @@ -14,8 +14,9 @@ class Grammar: | |||
| class GrammarBuilder: | |||
| global_keep_all_tokens: bool | |||
| import_paths: List[Union[str, Callable]] | |||
| used_files: Dict[str, str] | |||
| def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None) -> None: ... | |||
| def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None, used_files: Dict[str, str]=None) -> None: ... | |||
| def load_grammar(self, grammar_text: str, grammar_name: str = ..., mangle: Callable[[str], str] = None) -> None: ... | |||
| @@ -8,7 +8,7 @@ from io import open | |||
| import tempfile | |||
| from warnings import warn | |||
| from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger, ABC, abstractmethod | |||
| from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger, ABC, abstractmethod, verify_used_files | |||
| from .load_grammar import load_grammar, FromPackageLoader, Grammar | |||
| from .tree import Tree | |||
| from .common import LexerConf, ParserConf | |||
| @@ -277,14 +277,15 @@ class Lark(Serialize): | |||
| options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) | |||
| from . import __version__ | |||
| s = grammar + options_str + __version__ + str(sys.version_info[:2]) | |||
| cache_md5 = hashlib.md5(s.encode()).hexdigest() | |||
| cache_md5 = hashlib.md5(s.encode('utf8')).hexdigest() | |||
| if isinstance(self.options.cache, STRING_TYPE): | |||
| cache_fn = self.options.cache | |||
| else: | |||
| if self.options.cache is not True: | |||
| raise ConfigurationError("cache argument must be bool or str") | |||
| cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % (cache_md5, *sys.version_info[:2]) | |||
| # Python2.7 doesn't support * syntax in tuples | |||
| cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % ((cache_md5,) + sys.version_info[:2]) | |||
| if FS.exists(cache_fn): | |||
| logger.debug('Loading grammar from cache: %s', cache_fn) | |||
| @@ -293,16 +294,23 @@ class Lark(Serialize): | |||
| del options[name] | |||
| with FS.open(cache_fn, 'rb') as f: | |||
| file_md5 = f.readline().rstrip(b'\n') | |||
| if file_md5 == cache_md5.encode(): | |||
| try: | |||
| self._load(f, **options) | |||
| except Exception: | |||
| raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn) | |||
| return | |||
| if file_md5 == cache_md5.encode('utf8'): | |||
| if (not self.options.safe_cache) or verify_used_files(pickle.load(f)): | |||
| old_options = self.options | |||
| try: | |||
| self._load(f, **options) | |||
| except Exception: # We should probably narrow down which errors we catch here. | |||
| logger.exception("Failed to load Lark from cache: %r. We will try to carry on." % cache_fn) | |||
| # In theory, the Lark instance might have been messed up by the call to `_load`. | |||
| # In practice the only relevant thing that might have been overridden should be `options` | |||
| self.options = old_options | |||
| else: | |||
| return | |||
| # Parse the grammar file and compose the grammars | |||
| self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) | |||
| self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens) | |||
| else: | |||
| assert isinstance(grammar, Grammar) | |||
| self.grammar = grammar | |||
| @@ -387,7 +395,9 @@ class Lark(Serialize): | |||
| if cache_fn: | |||
| logger.debug('Saving grammar to cache: %s', cache_fn) | |||
| with FS.open(cache_fn, 'wb') as f: | |||
| f.write(b'%s\n' % cache_md5.encode()) | |||
| f.write(b'%s\n' % cache_md5.encode('utf8')) | |||
| if self.options.safe_cache: | |||
| pickle.dump(used_files, f) | |||
| self.save(f) | |||
| if __doc__: | |||
| @@ -1,5 +1,5 @@ | |||
| """Parses and creates Grammar objects""" | |||
| import hashlib | |||
| import os.path | |||
| import sys | |||
| from copy import copy, deepcopy | |||
| @@ -931,9 +931,10 @@ def _mangle_exp(exp, mangle): | |||
| class GrammarBuilder: | |||
| def __init__(self, global_keep_all_tokens=False, import_paths=None): | |||
| def __init__(self, global_keep_all_tokens=False, import_paths=None, used_files=None): | |||
| self.global_keep_all_tokens = global_keep_all_tokens | |||
| self.import_paths = import_paths or [] | |||
| self.used_files = used_files or {} | |||
| self._definitions = {} | |||
| self._ignore_names = [] | |||
| @@ -1150,10 +1151,14 @@ class GrammarBuilder: | |||
| joined_path = os.path.join(source, grammar_path) | |||
| with open(joined_path, encoding='utf8') as f: | |||
| text = f.read() | |||
| h = hashlib.md5(text.encode('utf8')).hexdigest() | |||
| if self.used_files.get(joined_path, h) != h: | |||
| raise RuntimeError("Grammar file was changed during importing") | |||
| self.used_files[joined_path] = h | |||
| except IOError: | |||
| continue | |||
| else: | |||
| gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths) | |||
| gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths, self.used_files) | |||
| gb.load_grammar(text, joined_path, mangle) | |||
| gb._remove_unused(map(mangle, aliases)) | |||
| for name in gb._definitions: | |||
| @@ -1213,4 +1218,4 @@ class GrammarBuilder: | |||
| def load_grammar(grammar, source, import_paths, global_keep_all_tokens): | |||
| builder = GrammarBuilder(global_keep_all_tokens, import_paths) | |||
| builder.load_grammar(grammar, source) | |||
| return builder.build() | |||
| return builder.build(), builder.used_files | |||
| @@ -1,3 +1,4 @@ | |||
| import hashlib | |||
| import unicodedata | |||
| import os | |||
| from functools import reduce | |||
| @@ -6,6 +7,7 @@ from collections import deque | |||
| ###{standalone | |||
| import sys, re | |||
| import logging | |||
| from io import open | |||
| logger = logging.getLogger("lark") | |||
| logger.addHandler(logging.StreamHandler()) | |||
| # Set to highest level, since we have some warnings amongst the code | |||
| @@ -296,6 +298,16 @@ class FS: | |||
| else: | |||
| return open(name, mode, **kwargs) | |||
| def verify_used_files(file_hashes): | |||
| for path, old in file_hashes.items(): | |||
| with open(path, encoding='utf8') as f: | |||
| text = f.read() | |||
| current = hashlib.md5(text.encode()).hexdigest() | |||
| if old != current: | |||
| logger.info("File %r changed, rebuilding Parser" % path) | |||
| return False | |||
| return True | |||
| def isascii(s): | |||
| """ str.isascii only exists in python3.7+ """ | |||
| @@ -58,71 +58,84 @@ def append_zero(t): | |||
| class TestCache(TestCase): | |||
| g = '''start: "a"''' | |||
| def setUp(self): | |||
| pass | |||
| self.fs = lark_module.FS | |||
| self.mock_fs = MockFS() | |||
| lark_module.FS = self.mock_fs | |||
| def tearDown(self): | |||
| self.mock_fs.files = {} | |||
| lark_module.FS = self.fs | |||
| def test_simple(self): | |||
| g = '''start: "a"''' | |||
| fn = "bla" | |||
| fs = lark_module.FS | |||
| mock_fs = MockFS() | |||
| try: | |||
| lark_module.FS = mock_fs | |||
| Lark(g, parser='lalr', cache=fn) | |||
| assert fn in mock_fs.files | |||
| parser = Lark(g, parser='lalr', cache=fn) | |||
| assert parser.parse('a') == Tree('start', []) | |||
| mock_fs.files = {} | |||
| assert len(mock_fs.files) == 0 | |||
| Lark(g, parser='lalr', cache=True) | |||
| assert len(mock_fs.files) == 1 | |||
| parser = Lark(g, parser='lalr', cache=True) | |||
| assert parser.parse('a') == Tree('start', []) | |||
| parser = Lark(g + ' "b"', parser='lalr', cache=True) | |||
| assert len(mock_fs.files) == 2 | |||
| assert parser.parse('ab') == Tree('start', []) | |||
| parser = Lark(g, parser='lalr', cache=True) | |||
| assert parser.parse('a') == Tree('start', []) | |||
| # Test with custom lexer | |||
| mock_fs.files = {} | |||
| parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True) | |||
| parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True) | |||
| assert len(mock_fs.files) == 1 | |||
| assert parser.parse('a') == Tree('start', []) | |||
| # Test options persistence | |||
| mock_fs.files = {} | |||
| Lark(g, parser="lalr", debug=True, cache=True) | |||
| parser = Lark(g, parser="lalr", debug=True, cache=True) | |||
| assert parser.options.options['debug'] | |||
| # Test inline transformer (tree-less) & lexer_callbacks | |||
| mock_fs.files = {} | |||
| g = """ | |||
| start: add+ | |||
| add: NUM "+" NUM | |||
| NUM: /\d+/ | |||
| %ignore " " | |||
| """ | |||
| text = "1+2 3+4" | |||
| expected = Tree('start', [30, 70]) | |||
| parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero}) | |||
| res0 = parser.parse(text) | |||
| parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero}) | |||
| assert len(mock_fs.files) == 1 | |||
| res1 = parser.parse(text) | |||
| res2 = TestT().transform(Lark(g, parser="lalr", cache=True, lexer_callbacks={'NUM': append_zero}).parse(text)) | |||
| assert res0 == res1 == res2 == expected | |||
| finally: | |||
| lark_module.FS = fs | |||
| Lark(self.g, parser='lalr', cache=fn) | |||
| assert fn in self.mock_fs.files | |||
| parser = Lark(self.g, parser='lalr', cache=fn) | |||
| assert parser.parse('a') == Tree('start', []) | |||
| def test_automatic_naming(self): | |||
| assert len(self.mock_fs.files) == 0 | |||
| Lark(self.g, parser='lalr', cache=True) | |||
| assert len(self.mock_fs.files) == 1 | |||
| parser = Lark(self.g, parser='lalr', cache=True) | |||
| assert parser.parse('a') == Tree('start', []) | |||
| parser = Lark(self.g + ' "b"', parser='lalr', cache=True) | |||
| assert len(self.mock_fs.files) == 2 | |||
| assert parser.parse('ab') == Tree('start', []) | |||
| parser = Lark(self.g, parser='lalr', cache=True) | |||
| assert parser.parse('a') == Tree('start', []) | |||
| def test_custom_lexer(self): | |||
| parser = Lark(self.g, parser='lalr', lexer=CustomLexer, cache=True) | |||
| parser = Lark(self.g, parser='lalr', lexer=CustomLexer, cache=True) | |||
| assert len(self.mock_fs.files) == 1 | |||
| assert parser.parse('a') == Tree('start', []) | |||
| def test_options(self): | |||
| # Test options persistence | |||
| Lark(self.g, parser="lalr", debug=True, cache=True) | |||
| parser = Lark(self.g, parser="lalr", debug=True, cache=True) | |||
| assert parser.options.options['debug'] | |||
| def test_inline(self): | |||
| # Test inline transformer (tree-less) & lexer_callbacks | |||
| g = """ | |||
| start: add+ | |||
| add: NUM "+" NUM | |||
| NUM: /\d+/ | |||
| %ignore " " | |||
| """ | |||
| text = "1+2 3+4" | |||
| expected = Tree('start', [30, 70]) | |||
| parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero}) | |||
| res0 = parser.parse(text) | |||
| parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero}) | |||
| assert len(self.mock_fs.files) == 1 | |||
| res1 = parser.parse(text) | |||
| res2 = TestT().transform(Lark(g, parser="lalr", cache=True, lexer_callbacks={'NUM': append_zero}).parse(text)) | |||
| assert res0 == res1 == res2 == expected | |||
| def test_imports(self): | |||
| g = """ | |||
| %import .grammars.ab (startab, expr) | |||
| """ | |||
| parser = Lark(g, parser='lalr', start='startab', cache=True) | |||
| assert len(self.mock_fs.files) == 1 | |||
| parser = Lark(g, parser='lalr', start='startab', cache=True) | |||
| assert len(self.mock_fs.files) == 1 | |||
| res = parser.parse("ab") | |||
| self.assertEqual(res, Tree('startab', [Tree('expr', ['a', 'b'])])) | |||
| if __name__ == '__main__': | |||