浏览代码

Added support for verifying imported files

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.3
MegaIng1 3 年前
父节点
当前提交
0f4ca60d83
共有 5 个文件被更改,包括 117 次插入76 次删除
  1. +2
    -1
      lark-stubs/load_grammar.pyi
  2. +21
    -11
      lark/lark.py
  3. +9
    -4
      lark/load_grammar.py
  4. +12
    -0
      lark/utils.py
  5. +73
    -60
      tests/test_cache.py

+ 2
- 1
lark-stubs/load_grammar.pyi 查看文件

@@ -14,8 +14,9 @@ class Grammar:
class GrammarBuilder:
global_keep_all_tokens: bool
import_paths: List[Union[str, Callable]]
used_files: Dict[str, str]

def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None) -> None: ...
def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None, used_files: Dict[str, str]=None) -> None: ...

def load_grammar(self, grammar_text: str, grammar_name: str = ..., mangle: Callable[[str], str] = None) -> None: ...



+ 21
- 11
lark/lark.py 查看文件

@@ -8,7 +8,7 @@ from io import open
import tempfile
from warnings import warn

from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger, ABC, abstractmethod
from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger, ABC, abstractmethod, verify_used_files
from .load_grammar import load_grammar, FromPackageLoader, Grammar
from .tree import Tree
from .common import LexerConf, ParserConf
@@ -277,14 +277,15 @@ class Lark(Serialize):
options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
from . import __version__
s = grammar + options_str + __version__ + str(sys.version_info[:2])
cache_md5 = hashlib.md5(s.encode()).hexdigest()
cache_md5 = hashlib.md5(s.encode('utf8')).hexdigest()

if isinstance(self.options.cache, STRING_TYPE):
cache_fn = self.options.cache
else:
if self.options.cache is not True:
raise ConfigurationError("cache argument must be bool or str")
cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % (cache_md5, *sys.version_info[:2])
# Python2.7 doesn't support * syntax in tuples
cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % ((cache_md5,) + sys.version_info[:2])

if FS.exists(cache_fn):
logger.debug('Loading grammar from cache: %s', cache_fn)
@@ -293,16 +294,23 @@ class Lark(Serialize):
del options[name]
with FS.open(cache_fn, 'rb') as f:
file_md5 = f.readline().rstrip(b'\n')
if file_md5 == cache_md5.encode():
try:
self._load(f, **options)
except Exception:
raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
return
if file_md5 == cache_md5.encode('utf8'):
if (not self.options.safe_cache) or verify_used_files(pickle.load(f)):
old_options = self.options
try:
self._load(f, **options)
except Exception: # We should probably narrow down which errors we catch here.
logger.exception("Failed to load Lark from cache: %r. We will try to carry on." % cache_fn)
# In theory, the Lark instance might have been messed up by the call to `_load`.
# In practice the only relevant thing that might have been overridden should be `options`
self.options = old_options
else:
return


# Parse the grammar file and compose the grammars
self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
else:
assert isinstance(grammar, Grammar)
self.grammar = grammar
@@ -387,7 +395,9 @@ class Lark(Serialize):
if cache_fn:
logger.debug('Saving grammar to cache: %s', cache_fn)
with FS.open(cache_fn, 'wb') as f:
f.write(b'%s\n' % cache_md5.encode())
f.write(b'%s\n' % cache_md5.encode('utf8'))
if self.options.safe_cache:
pickle.dump(used_files, f)
self.save(f)

if __doc__:


+ 9
- 4
lark/load_grammar.py 查看文件

@@ -1,5 +1,5 @@
"""Parses and creates Grammar objects"""
import hashlib
import os.path
import sys
from copy import copy, deepcopy
@@ -931,9 +931,10 @@ def _mangle_exp(exp, mangle):


class GrammarBuilder:
def __init__(self, global_keep_all_tokens=False, import_paths=None, used_files=None):
    """Initialize a GrammarBuilder.

    Parameters:
        global_keep_all_tokens: if True, keep all tokens in every rule
            built by this builder.
        import_paths: list of paths (or loader callables) searched when
            resolving %import statements. Defaults to an empty list.
        used_files: mapping of imported-grammar file path -> md5 hexdigest
            of its contents at load time, shared with child builders so a
            cached parser can later detect changed imports. Defaults to a
            fresh empty dict. New optional parameter; existing callers are
            unaffected.
    """
    self.global_keep_all_tokens = global_keep_all_tokens
    self.import_paths = import_paths or []
    self.used_files = used_files or {}

    # Internal accumulators filled by load_grammar().
    self._definitions = {}
    self._ignore_names = []
@@ -1150,10 +1151,14 @@ class GrammarBuilder:
joined_path = os.path.join(source, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
h = hashlib.md5(text.encode('utf8')).hexdigest()
if self.used_files.get(joined_path, h) != h:
raise RuntimeError("Grammar file was changed during importing")
self.used_files[joined_path] = h
except IOError:
continue
else:
gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths)
gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths, self.used_files)
gb.load_grammar(text, joined_path, mangle)
gb._remove_unused(map(mangle, aliases))
for name in gb._definitions:
@@ -1213,4 +1218,4 @@ class GrammarBuilder:
def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
    """Parse a grammar text and return ``(Grammar, used_files)``.

    ``used_files`` maps each imported grammar file path to the md5
    hexdigest of its contents, so the caller (Lark's cache logic) can
    verify the imports have not changed before reusing a cached parser.

    Note: a stale pre-change ``return builder.build()`` line made the
    tuple return unreachable; it is removed here, since the caller
    unpacks two values.
    """
    builder = GrammarBuilder(global_keep_all_tokens, import_paths)
    builder.load_grammar(grammar, source)
    return builder.build(), builder.used_files

+ 12
- 0
lark/utils.py 查看文件

@@ -1,3 +1,4 @@
import hashlib
import unicodedata
import os
from functools import reduce
@@ -6,6 +7,7 @@ from collections import deque
###{standalone
import sys, re
import logging
from io import open
logger = logging.getLogger("lark")
logger.addHandler(logging.StreamHandler())
# Set to highest level, since we have some warnings amongst the code
@@ -296,6 +298,16 @@ class FS:
else:
return open(name, mode, **kwargs)

def verify_used_files(file_hashes):
    """Return True if every recorded grammar file is unchanged on disk.

    ``file_hashes`` maps a file path to the md5 hexdigest its contents
    had when the cached parser was written (see GrammarBuilder).

    A file that is missing or unreadable counts as "changed" — we return
    False so the parser is rebuilt, instead of letting the IOError
    propagate and crash cache loading.
    """
    for path, old_md5 in file_hashes.items():
        try:
            with open(path, encoding='utf8') as f:
                text = f.read()
        except IOError:
            logger.info("File %r missing or unreadable, rebuilding Parser", path)
            return False
        # encode('utf8') explicitly, matching the hashing done when the
        # cache was written (and Python 2's default codec differs).
        if hashlib.md5(text.encode('utf8')).hexdigest() != old_md5:
            logger.info("File %r changed, rebuilding Parser", path)
            return False
    return True


def isascii(s):
""" str.isascii only exists in python3.7+ """


+ 73
- 60
tests/test_cache.py 查看文件

@@ -58,71 +58,84 @@ def append_zero(t):


class TestCache(TestCase):
g = '''start: "a"'''

def setUp(self):
pass
self.fs = lark_module.FS
self.mock_fs = MockFS()
lark_module.FS = self.mock_fs
def tearDown(self):
self.mock_fs.files = {}
lark_module.FS = self.fs

def test_simple(self):
g = '''start: "a"'''

fn = "bla"

fs = lark_module.FS
mock_fs = MockFS()
try:
lark_module.FS = mock_fs
Lark(g, parser='lalr', cache=fn)
assert fn in mock_fs.files
parser = Lark(g, parser='lalr', cache=fn)
assert parser.parse('a') == Tree('start', [])

mock_fs.files = {}
assert len(mock_fs.files) == 0
Lark(g, parser='lalr', cache=True)
assert len(mock_fs.files) == 1
parser = Lark(g, parser='lalr', cache=True)
assert parser.parse('a') == Tree('start', [])

parser = Lark(g + ' "b"', parser='lalr', cache=True)
assert len(mock_fs.files) == 2
assert parser.parse('ab') == Tree('start', [])

parser = Lark(g, parser='lalr', cache=True)
assert parser.parse('a') == Tree('start', [])

# Test with custom lexer
mock_fs.files = {}
parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
assert len(mock_fs.files) == 1
assert parser.parse('a') == Tree('start', [])

# Test options persistence
mock_fs.files = {}
Lark(g, parser="lalr", debug=True, cache=True)
parser = Lark(g, parser="lalr", debug=True, cache=True)
assert parser.options.options['debug']

# Test inline transformer (tree-less) & lexer_callbacks
mock_fs.files = {}
g = """
start: add+
add: NUM "+" NUM
NUM: /\d+/
%ignore " "
"""
text = "1+2 3+4"
expected = Tree('start', [30, 70])

parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
res0 = parser.parse(text)
parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
assert len(mock_fs.files) == 1
res1 = parser.parse(text)
res2 = TestT().transform(Lark(g, parser="lalr", cache=True, lexer_callbacks={'NUM': append_zero}).parse(text))
assert res0 == res1 == res2 == expected

finally:
lark_module.FS = fs
Lark(self.g, parser='lalr', cache=fn)
assert fn in self.mock_fs.files
parser = Lark(self.g, parser='lalr', cache=fn)
assert parser.parse('a') == Tree('start', [])
def test_automatic_naming(self):
assert len(self.mock_fs.files) == 0
Lark(self.g, parser='lalr', cache=True)
assert len(self.mock_fs.files) == 1
parser = Lark(self.g, parser='lalr', cache=True)
assert parser.parse('a') == Tree('start', [])

parser = Lark(self.g + ' "b"', parser='lalr', cache=True)
assert len(self.mock_fs.files) == 2
assert parser.parse('ab') == Tree('start', [])

parser = Lark(self.g, parser='lalr', cache=True)
assert parser.parse('a') == Tree('start', [])
def test_custom_lexer(self):

parser = Lark(self.g, parser='lalr', lexer=CustomLexer, cache=True)
parser = Lark(self.g, parser='lalr', lexer=CustomLexer, cache=True)
assert len(self.mock_fs.files) == 1
assert parser.parse('a') == Tree('start', [])

def test_options(self):
# Test options persistence
Lark(self.g, parser="lalr", debug=True, cache=True)
parser = Lark(self.g, parser="lalr", debug=True, cache=True)
assert parser.options.options['debug']
def test_inline(self):
# Test inline transformer (tree-less) & lexer_callbacks
g = """
start: add+
add: NUM "+" NUM
NUM: /\d+/
%ignore " "
"""
text = "1+2 3+4"
expected = Tree('start', [30, 70])

parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
res0 = parser.parse(text)
parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
assert len(self.mock_fs.files) == 1
res1 = parser.parse(text)
res2 = TestT().transform(Lark(g, parser="lalr", cache=True, lexer_callbacks={'NUM': append_zero}).parse(text))
assert res0 == res1 == res2 == expected
def test_imports(self):
g = """
%import .grammars.ab (startab, expr)
"""
parser = Lark(g, parser='lalr', start='startab', cache=True)
assert len(self.mock_fs.files) == 1
parser = Lark(g, parser='lalr', start='startab', cache=True)
assert len(self.mock_fs.files) == 1
res = parser.parse("ab")
self.assertEqual(res, Tree('startab', [Tree('expr', ['a', 'b'])]))


if __name__ == '__main__':


正在加载...
取消
保存