@@ -14,8 +14,9 @@ class Grammar:
 
 class GrammarBuilder:
     global_keep_all_tokens: bool
     import_paths: List[Union[str, Callable]]
+    used_files: Dict[str, str]
 
-    def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None) -> None: ...
+    def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None, used_files: Dict[str, str]=None) -> None: ...
 
     def load_grammar(self, grammar_text: str, grammar_name: str = ..., mangle: Callable[[str], str] = None) -> None: ...
@@ -9,7 +9,7 @@ import tempfile
 from warnings import warn
 
 from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger, ABC, abstractmethod
-from .load_grammar import load_grammar, FromPackageLoader, Grammar
+from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files
 from .tree import Tree
 from .common import LexerConf, ParserConf
@@ -23,6 +23,11 @@ try:
     import regex
 except ImportError:
     regex = None
 
+try:
+    import atomicwrites
+except ImportError:
+    atomicwrites = None
+
 ###{standalone
@@ -100,7 +105,6 @@ class LarkOptions(Serialize):
             A List of either paths or loader functions to specify from where grammars are imported
     source_path
             Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading
-
     **=== End Options ===**
     """
     if __doc__:
@@ -262,15 +266,16 @@ class Lark(Serialize):
                unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
                options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
                from . import __version__
-               s = grammar + options_str + __version__
-               cache_md5 = hashlib.md5(s.encode()).hexdigest()
+               s = grammar + options_str + __version__ + str(sys.version_info[:2])
+               cache_md5 = hashlib.md5(s.encode('utf8')).hexdigest()
 
                if isinstance(self.options.cache, STRING_TYPE):
                    cache_fn = self.options.cache
                else:
                    if self.options.cache is not True:
                        raise ConfigurationError("cache argument must be bool or str")
-                   cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % cache_md5
+                   # Python 2.7 doesn't support * syntax in tuples
+                   cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % ((cache_md5,) + sys.version_info[:2])
 
                if FS.exists(cache_fn):
                    logger.debug('Loading grammar from cache: %s', cache_fn)
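
Note on the two changes above: the cache key now also folds in the running Python major/minor version, and that same version pair is baked into the default cache filename, so caches written by different interpreters never collide and pickles are never loaded across Python versions. A rough sketch of the resulting key and path (the function, variable names and example inputs are illustrative, not the actual locals):

    import hashlib, sys, tempfile

    def cache_key(grammar_text, options_str, lark_version):
        # Everything that can change the compiled parser goes into the hash,
        # including the interpreter version that will produce the pickle.
        s = grammar_text + options_str + lark_version + str(sys.version_info[:2])
        return hashlib.md5(s.encode('utf8')).hexdigest()

    md5 = cache_key('start: "a"', 'parserlalr', '0.11.0')   # illustrative inputs
    cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % ((md5,) + sys.version_info[:2])
    # e.g. /tmp/.lark_cache_<md5>_3_9.tmp on CPython 3.9
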
@@ -279,16 +284,22 @@ class Lark(Serialize):
                        del options[name]
                    with FS.open(cache_fn, 'rb') as f:
                        file_md5 = f.readline().rstrip(b'\n')
-                       if file_md5 == cache_md5.encode():
+                       if file_md5 == cache_md5.encode('utf8') and verify_used_files(pickle.load(f)):
+                           old_options = self.options
                            try:
                                self._load(f, **options)
-                           except Exception:
-                               raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
-                           return
+                           except Exception: # We should probably narrow down which errors we catch here.
+                               logger.exception("Failed to load Lark from cache: %r. We will try to carry on." % cache_fn)
+
+                               # In theory, the Lark instance might have been messed up by the call to `_load`.
+                               # In practice the only relevant thing that might have been overridden should be `options`
+                               self.options = old_options
+                           else:
+                               return
 
            # Parse the grammar file and compose the grammars
-           self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
+           self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
        else:
            assert isinstance(grammar, Grammar)
            self.grammar = grammar
@@ -373,7 +384,8 @@ class Lark(Serialize):
 
        if cache_fn:
            logger.debug('Saving grammar to cache: %s', cache_fn)
            with FS.open(cache_fn, 'wb') as f:
-               f.write(b'%s\n' % cache_md5.encode())
+               f.write(b'%s\n' % cache_md5.encode('utf8'))
+               pickle.dump(used_files, f)
                self.save(f)
 
    if __doc__:
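
Together with the loading hunk further up, the cache file now has a three-part layout: one line holding the md5 of the cache key, a pickled dict mapping every grammar file used during the build to the md5 of its text, and finally the serialized parser. A hedged sketch of the reader side (the real logic sits inline in Lark.__init__ and uses verify_used_files from load_grammar; the helper name here is illustrative):

    import pickle

    def try_open_cached_parser(f, cache_md5, verify_used_files):
        # f is the cache file opened in 'rb' mode, mirroring the writer above.
        file_md5 = f.readline().rstrip(b'\n')
        if file_md5 != cache_md5.encode('utf8'):
            return None   # grammar, options, lark version or Python version changed
        if not verify_used_files(pickle.load(f)):
            return None   # an imported grammar file changed on disk
        return f          # now positioned at the serialized parser
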
@@ -1,7 +1,8 @@
 """Parses and creates Grammar objects"""
-
+import hashlib
 import os.path
 import sys
+from collections import namedtuple
 from copy import copy, deepcopy
 from io import open
 import pkgutil
@@ -673,19 +674,7 @@ class Grammar:
 
         return terminals, compiled_rules, self.ignore
 
 
-class PackageResource(object):
-    """
-    Represents a path inside a Package. Used by `FromPackageLoader`
-    """
-    def __init__(self, pkg_name, path):
-        self.pkg_name = pkg_name
-        self.path = path
-
-    def __str__(self):
-        return "<%s: %s>" % (self.pkg_name, self.path)
-
-    def __repr__(self):
-        return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.path)
+PackageResource = namedtuple('PackageResource', 'pkg_name path')
 
 
 class FromPackageLoader(object):
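
Replacing the hand-written class with a namedtuple keeps the old attribute access while making PackageResource instances hashable, comparable and cheaply picklable, which matters now that they serve as keys in used_files and travel through the cache file; it also allows the tuple unpacking used by verify_used_files below. A quick sketch of the behaviors relied on (the values are illustrative):

    import pickle
    from lark.load_grammar import PackageResource

    res = PackageResource('my_pkg', 'grammars/ab.lark')
    assert res.pkg_name == 'my_pkg' and res.path == 'grammars/ab.lark'
    pkg_name, path = res                           # unpacks like a tuple, as in pkgutil.get_data(*res)
    hashes = {res: 'some md5'}                     # hashable, so usable as a dict key
    assert pickle.loads(pickle.dumps(res)) == res  # round-trips through the cache pickle
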
@@ -931,9 +920,10 @@ def _mangle_exp(exp, mangle):
 
 class GrammarBuilder:
-    def __init__(self, global_keep_all_tokens=False, import_paths=None):
+    def __init__(self, global_keep_all_tokens=False, import_paths=None, used_files=None):
         self.global_keep_all_tokens = global_keep_all_tokens
         self.import_paths = import_paths or []
+        self.used_files = used_files or {}
 
         self._definitions = {}
         self._ignore_names = []
@@ -1153,7 +1143,12 @@ class GrammarBuilder:
            except IOError:
                continue
            else:
-               gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths)
+               h = hashlib.md5(text.encode('utf8')).hexdigest()
+               if self.used_files.get(joined_path, h) != h:
+                   raise RuntimeError("Grammar file was changed during importing")
+               self.used_files[joined_path] = h
+
+               gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths, self.used_files)
                gb.load_grammar(text, joined_path, mangle)
                gb._remove_unused(map(mangle, aliases))
                for name in gb._definitions:
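
The hash bookkeeping added to do_import() above records one md5 per imported grammar file in used_files, shares that dict with the nested GrammarBuilder, and refuses to continue if the same file is seen with two different hashes within a single load. A minimal standalone sketch of that check (record_used_file is a hypothetical helper; in the diff the logic is inline):

    import hashlib

    def record_used_file(used_files, path, text):
        h = hashlib.md5(text.encode('utf8')).hexdigest()
        if used_files.get(path, h) != h:
            # The same grammar file was read twice with different contents
            # during one import pass, so the resulting grammar would be inconsistent.
            raise RuntimeError("Grammar file was changed during importing")
        used_files[path] = h
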
@@ -1210,7 +1205,26 @@ class GrammarBuilder:
 
         # resolve_term_references(term_defs)
 
         return Grammar(rule_defs, term_defs, self._ignore_names)
 
 
+def verify_used_files(file_hashes):
+    for path, old in file_hashes.items():
+        text = None
+        if isinstance(path, str) and os.path.exists(path):
+            with open(path, encoding='utf8') as f:
+                text = f.read()
+        elif isinstance(path, PackageResource):
+            with suppress(IOError):
+                text = pkgutil.get_data(*path).decode('utf-8')
+        if text is None: # We don't know how to load the path. Ignore it.
+            continue
+
+        current = hashlib.md5(text.encode()).hexdigest()
+        if old != current:
+            logger.info("File %r changed, rebuilding Parser" % path)
+            return False
+    return True
+
+
 def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
     builder = GrammarBuilder(global_keep_all_tokens, import_paths)
     builder.load_grammar(grammar, source)
-    return builder.build()
+    return builder.build(), builder.used_files
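
Since load_grammar() now returns a pair, callers unpack the Grammar together with the file/hash map; that map is what Lark pickles into the cache and what verify_used_files() re-checks on the next run. A hedged usage sketch (the path and values are placeholders):

    from lark.load_grammar import load_grammar

    with open('my_grammar.lark', encoding='utf8') as f:   # placeholder path
        grammar, used_files = load_grammar(f.read(), 'my_grammar.lark',
                                           import_paths=[], global_keep_all_tokens=False)
    # used_files maps each imported grammar file (a str path or a PackageResource)
    # to the md5 of the text that was read,
    # e.g. {'grammars/ab.lark': '9e107d9d372bb6826bd81d3542a419d6'}
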
@@ -1,3 +1,4 @@
+import hashlib
 import unicodedata
 import os
 from functools import reduce
@@ -6,6 +7,7 @@ from collections import deque
 ###{standalone
 import sys, re
 import logging
+from io import open
 logger = logging.getLogger("lark")
 logger.addHandler(logging.StreamHandler())
 # Set to highest level, since we have some warnings amongst the code
@@ -281,9 +283,21 @@ def combine_alternatives(lists):
     return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init)
 
 
+try:
+    import atomicwrites
+except ImportError:
+    atomicwrites = None
+
+
 class FS:
-    open = open
     exists = os.path.exists
 
+    @staticmethod
+    def open(name, mode="r", **kwargs):
+        if atomicwrites and "w" in mode:
+            return atomicwrites.atomic_write(name, mode=mode, overwrite=True, **kwargs)
+        else:
+            return open(name, mode, **kwargs)
+
 
 def isascii(s):
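
FS.open keeps its read behavior but, when the optional atomicwrites package is available, routes writes through an atomic writer, so an interrupted save cannot leave a truncated cache file behind; without the dependency it falls back to the builtin open. A small usage sketch (the path is illustrative):

    from lark.utils import FS

    with FS.open('/tmp/example.lark_cache.tmp', 'wb') as f:
        f.write(b'header\n')
    # With atomicwrites installed, the bytes go to a temporary file that is renamed
    # over the target only after a successful close; otherwise this behaves exactly
    # like the builtin open(name, 'wb').
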
@@ -16,7 +16,8 @@ setup(
 
     extras_require = {
         "regex": ["regex"],
-        "nearley": ["js2py"]
+        "nearley": ["js2py"],
+        "atomicwrites": ["atomicwrites"],
     },
 
     package_data = {'': ['*.md', '*.lark'], 'lark-stubs': ['*.pyi']},
@@ -58,71 +58,84 @@ def append_zero(t):
 
 class TestCache(TestCase):
+    g = '''start: "a"'''
+
     def setUp(self):
-        pass
+        self.fs = lark_module.FS
+        self.mock_fs = MockFS()
+        lark_module.FS = self.mock_fs
+
+    def tearDown(self):
+        self.mock_fs.files = {}
+        lark_module.FS = self.fs
 
     def test_simple(self):
-        g = '''start: "a"'''
-
         fn = "bla"
 
-        fs = lark_module.FS
-        mock_fs = MockFS()
-        try:
-            lark_module.FS = mock_fs
-            Lark(g, parser='lalr', cache=fn)
-            assert fn in mock_fs.files
-            parser = Lark(g, parser='lalr', cache=fn)
-            assert parser.parse('a') == Tree('start', [])
-
-            mock_fs.files = {}
-            assert len(mock_fs.files) == 0
-            Lark(g, parser='lalr', cache=True)
-            assert len(mock_fs.files) == 1
-            parser = Lark(g, parser='lalr', cache=True)
-            assert parser.parse('a') == Tree('start', [])
-
-            parser = Lark(g + ' "b"', parser='lalr', cache=True)
-            assert len(mock_fs.files) == 2
-            assert parser.parse('ab') == Tree('start', [])
-
-            parser = Lark(g, parser='lalr', cache=True)
-            assert parser.parse('a') == Tree('start', [])
-
-            # Test with custom lexer
-            mock_fs.files = {}
-            parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
-            parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
-            assert len(mock_fs.files) == 1
-            assert parser.parse('a') == Tree('start', [])
-
-            # Test options persistence
-            mock_fs.files = {}
-            Lark(g, parser="lalr", debug=True, cache=True)
-            parser = Lark(g, parser="lalr", debug=True, cache=True)
-            assert parser.options.options['debug']
-
-            # Test inline transformer (tree-less) & lexer_callbacks
-            mock_fs.files = {}
-            g = """
-                start: add+
-                add: NUM "+" NUM
-                NUM: /\d+/
-                %ignore " "
-                """
-            text = "1+2 3+4"
-            expected = Tree('start', [30, 70])
-
-            parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
-            res0 = parser.parse(text)
-            parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
-            assert len(mock_fs.files) == 1
-            res1 = parser.parse(text)
-            res2 = TestT().transform(Lark(g, parser="lalr", cache=True, lexer_callbacks={'NUM': append_zero}).parse(text))
-            assert res0 == res1 == res2 == expected
-
-        finally:
-            lark_module.FS = fs
+        Lark(self.g, parser='lalr', cache=fn)
+        assert fn in self.mock_fs.files
+        parser = Lark(self.g, parser='lalr', cache=fn)
+        assert parser.parse('a') == Tree('start', [])
+
+    def test_automatic_naming(self):
+        assert len(self.mock_fs.files) == 0
+        Lark(self.g, parser='lalr', cache=True)
+        assert len(self.mock_fs.files) == 1
+        parser = Lark(self.g, parser='lalr', cache=True)
+        assert parser.parse('a') == Tree('start', [])
+
+        parser = Lark(self.g + ' "b"', parser='lalr', cache=True)
+        assert len(self.mock_fs.files) == 2
+        assert parser.parse('ab') == Tree('start', [])
+
+        parser = Lark(self.g, parser='lalr', cache=True)
+        assert parser.parse('a') == Tree('start', [])
+
+    def test_custom_lexer(self):
+        parser = Lark(self.g, parser='lalr', lexer=CustomLexer, cache=True)
+        parser = Lark(self.g, parser='lalr', lexer=CustomLexer, cache=True)
+        assert len(self.mock_fs.files) == 1
+        assert parser.parse('a') == Tree('start', [])
+
+    def test_options(self):
+        # Test options persistence
+        Lark(self.g, parser="lalr", debug=True, cache=True)
+        parser = Lark(self.g, parser="lalr", debug=True, cache=True)
+        assert parser.options.options['debug']
+
+    def test_inline(self):
+        # Test inline transformer (tree-less) & lexer_callbacks
+        g = """
+            start: add+
+            add: NUM "+" NUM
+            NUM: /\d+/
+            %ignore " "
+            """
+        text = "1+2 3+4"
+        expected = Tree('start', [30, 70])
+
+        parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
+        res0 = parser.parse(text)
+        parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
+        assert len(self.mock_fs.files) == 1
+        res1 = parser.parse(text)
+        res2 = TestT().transform(Lark(g, parser="lalr", cache=True, lexer_callbacks={'NUM': append_zero}).parse(text))
+        assert res0 == res1 == res2 == expected
+
+    def test_imports(self):
+        g = """
+            %import .grammars.ab (startab, expr)
+            """
+        parser = Lark(g, parser='lalr', start='startab', cache=True)
+        assert len(self.mock_fs.files) == 1
+        parser = Lark(g, parser='lalr', start='startab', cache=True)
+        assert len(self.mock_fs.files) == 1
+        res = parser.parse("ab")
+        self.assertEqual(res, Tree('startab', [Tree('expr', ['a', 'b'])]))
 
 if __name__ == '__main__':
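
MockFS and MockFile are defined elsewhere in the test module and are not part of this diff; the tests above only assume an in-memory stand-in for lark_module.FS whose written cache files end up in a .files dict. Roughly along these lines (a hypothetical sketch, the real helpers may differ):

    from io import BytesIO

    class MockFile(BytesIO):
        def close(self):            # keep contents readable after the 'with' block
            pass
        def __enter__(self):
            return self
        def __exit__(self, *args):
            pass

    class MockFS:
        def __init__(self):
            self.files = {}
        def open(self, name, mode='r', **kwargs):
            if name not in self.files:
                self.files[name] = MockFile()
            f = self.files[name]
            f.seek(0)
            return f
        def exists(self, name):
            return name in self.files
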