Pārlūkot izejas kodu

Merge branch 'better-cache' of https://github.com/MegaIng/lark into MegaIng-better-cache

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.3
Erez Sh pirms 3 gadiem
vecāks
revīzija
b556c39815
6 mainītis faili ar 146 papildinājumiem un 91 dzēšanām
  1. +2
    -1
      lark-stubs/load_grammar.pyi
  2. +23
    -11
      lark/lark.py
  3. +31
    -17
      lark/load_grammar.py
  4. +15
    -1
      lark/utils.py
  5. +2
    -1
      setup.py
  6. +73
    -60
      tests/test_cache.py

+ 2
- 1
lark-stubs/load_grammar.pyi Parādīt failu

@@ -14,8 +14,9 @@ class Grammar:
class GrammarBuilder:
global_keep_all_tokens: bool
import_paths: List[Union[str, Callable]]
used_files: Dict[str, str]

def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None) -> None: ...
def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None, used_files: Dict[str, str]=None) -> None: ...

def load_grammar(self, grammar_text: str, grammar_name: str = ..., mangle: Callable[[str], str] = None) -> None: ...



+ 23
- 11
lark/lark.py Parādīt failu

@@ -9,7 +9,7 @@ import tempfile
from warnings import warn

from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger, ABC, abstractmethod
from .load_grammar import load_grammar, FromPackageLoader, Grammar
from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files
from .tree import Tree
from .common import LexerConf, ParserConf

@@ -23,6 +23,11 @@ try:
import regex
except ImportError:
regex = None
try:
import atomicwrites
except ImportError:
atomicwrites = None


###{standalone

@@ -100,7 +105,6 @@ class LarkOptions(Serialize):
A List of either paths or loader functions to specify from where grammars are imported
source_path
Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading

**=== End Options ===**
"""
if __doc__:
@@ -262,15 +266,16 @@ class Lark(Serialize):
unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
from . import __version__
s = grammar + options_str + __version__
cache_md5 = hashlib.md5(s.encode()).hexdigest()
s = grammar + options_str + __version__ + str(sys.version_info[:2])
cache_md5 = hashlib.md5(s.encode('utf8')).hexdigest()

if isinstance(self.options.cache, STRING_TYPE):
cache_fn = self.options.cache
else:
if self.options.cache is not True:
raise ConfigurationError("cache argument must be bool or str")
cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % cache_md5
# Python2.7 doesn't support * syntax in tuples
cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % ((cache_md5,) + sys.version_info[:2])

if FS.exists(cache_fn):
logger.debug('Loading grammar from cache: %s', cache_fn)
@@ -279,16 +284,22 @@ class Lark(Serialize):
del options[name]
with FS.open(cache_fn, 'rb') as f:
file_md5 = f.readline().rstrip(b'\n')
if file_md5 == cache_md5.encode():
if file_md5 == cache_md5.encode('utf8') and verify_used_files(pickle.load(f)):
old_options = self.options
try:
self._load(f, **options)
except Exception:
raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
return
except Exception: # We should probably narrow done which errors we catch here.
logger.exception("Failed to load Lark from cache: %r. We will try to carry on." % cache_fn)
# In theory, the Lark instance might have been messed up by the call to `_load`.
# In practice the only relevant thing that might have been overriden should be `options`
self.options = old_options
else:
return


# Parse the grammar file and compose the grammars
self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
else:
assert isinstance(grammar, Grammar)
self.grammar = grammar
@@ -373,7 +384,8 @@ class Lark(Serialize):
if cache_fn:
logger.debug('Saving grammar to cache: %s', cache_fn)
with FS.open(cache_fn, 'wb') as f:
f.write(b'%s\n' % cache_md5.encode())
f.write(b'%s\n' % cache_md5.encode('utf8'))
pickle.dump(used_files, f)
self.save(f)

if __doc__:


+ 31
- 17
lark/load_grammar.py Parādīt failu

@@ -1,7 +1,8 @@
"""Parses and creates Grammar objects"""
import hashlib
import os.path
import sys
from collections import namedtuple
from copy import copy, deepcopy
from io import open
import pkgutil
@@ -673,19 +674,7 @@ class Grammar:
return terminals, compiled_rules, self.ignore


# Represents a path inside a package; used by `FromPackageLoader`.
# A namedtuple (rather than a hand-written class) so instances compare and
# hash by value, which lets them serve as stable dict keys in `used_files`
# and survive a pickle round-trip through the grammar cache.
PackageResource = namedtuple('PackageResource', 'pkg_name path')


class FromPackageLoader(object):
@@ -931,9 +920,10 @@ def _mangle_exp(exp, mangle):


class GrammarBuilder:
def __init__(self, global_keep_all_tokens=False, import_paths=None):
def __init__(self, global_keep_all_tokens=False, import_paths=None, used_files=None):
self.global_keep_all_tokens = global_keep_all_tokens
self.import_paths = import_paths or []
self.used_files = used_files or {}

self._definitions = {}
self._ignore_names = []
@@ -1153,7 +1143,12 @@ class GrammarBuilder:
except IOError:
continue
else:
gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths)
h = hashlib.md5(text.encode('utf8')).hexdigest()
if self.used_files.get(joined_path, h) != h:
raise RuntimeError("Grammar file was changed during importing")
self.used_files[joined_path] = h
gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths, self.used_files)
gb.load_grammar(text, joined_path, mangle)
gb._remove_unused(map(mangle, aliases))
for name in gb._definitions:
@@ -1210,7 +1205,26 @@ class GrammarBuilder:
# resolve_term_references(term_defs)
return Grammar(rule_defs, term_defs, self._ignore_names)


def verify_used_files(file_hashes):
    """Check that none of the grammar sources recorded in a cache entry changed.

    ``file_hashes`` maps a grammar source -- either a filesystem path string
    or a ``PackageResource`` -- to the md5 hex digest of its contents at the
    time the cache was written.

    Returns True when every still-loadable source has an unchanged digest,
    False as soon as one differs (meaning the cached parser must be rebuilt).
    Sources that can no longer be read are skipped rather than invalidating
    the cache.
    """
    for path, old_digest in file_hashes.items():
        text = None
        if isinstance(path, str) and os.path.exists(path):
            with open(path, encoding='utf8') as f:
                text = f.read()
        elif isinstance(path, PackageResource):
            # NOTE(review): relies on `suppress` (contextlib) being imported
            # at module level -- confirm the import exists in this module.
            with suppress(IOError):
                text = pkgutil.get_data(*path).decode('utf-8')
        if text is None:  # We don't know how to load the path. ignore it.
            continue
        # Explicit encoding for consistency with the digests written at
        # import time (which hash text.encode('utf8')); a bare encode()
        # defaults to ASCII on Python 2 and would break on non-ASCII grammars.
        current = hashlib.md5(text.encode('utf8')).hexdigest()
        if old_digest != current:
            # Lazy %-style args: formatting only happens if the record is emitted.
            logger.info("File %r changed, rebuilding Parser", path)
            return False
    return True

def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
    """Parse grammar text and build a Grammar object.

    Returns a tuple ``(grammar, used_files)``: ``used_files`` maps every
    grammar source consulted while loading (the root grammar's imports,
    transitively) to the md5 digest of its contents, so callers (the cache
    logic in lark.py) can later detect changes via ``verify_used_files``.
    """
    builder = GrammarBuilder(global_keep_all_tokens, import_paths)
    builder.load_grammar(grammar, source)
    return builder.build(), builder.used_files

+ 15
- 1
lark/utils.py Parādīt failu

@@ -1,3 +1,4 @@
import hashlib
import unicodedata
import os
from functools import reduce
@@ -6,6 +7,7 @@ from collections import deque
###{standalone
import sys, re
import logging
from io import open
logger = logging.getLogger("lark")
logger.addHandler(logging.StreamHandler())
# Set to highest level, since we have some warnings amongst the code
@@ -281,9 +283,21 @@ def combine_alternatives(lists):
return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init)


try:
import atomicwrites
except ImportError:
atomicwrites = None

class FS:
open = open
exists = os.path.exists
@staticmethod
def open(name, mode="r", **kwargs):
if atomicwrites and "w" in mode:
return atomicwrites.atomic_write(name, mode=mode, override=True, **kwargs)
else:
return open(name, mode, **kwargs)



def isascii(s):


+ 2
- 1
setup.py Parādīt failu

@@ -16,7 +16,8 @@ setup(

extras_require = {
"regex": ["regex"],
"nearley": ["js2py"]
"nearley": ["js2py"],
"atomicwrites": ["atomicwrites"],
},

package_data = {'': ['*.md', '*.lark'], 'lark-stubs': ['*.pyi']},


+ 73
- 60
tests/test_cache.py Parādīt failu

@@ -58,71 +58,84 @@ def append_zero(t):


class TestCache(TestCase):
g = '''start: "a"'''

def setUp(self):
pass
self.fs = lark_module.FS
self.mock_fs = MockFS()
lark_module.FS = self.mock_fs
def tearDown(self):
self.mock_fs.files = {}
lark_module.FS = self.fs

def test_simple(self):
g = '''start: "a"'''

fn = "bla"

fs = lark_module.FS
mock_fs = MockFS()
try:
lark_module.FS = mock_fs
Lark(g, parser='lalr', cache=fn)
assert fn in mock_fs.files
parser = Lark(g, parser='lalr', cache=fn)
assert parser.parse('a') == Tree('start', [])

mock_fs.files = {}
assert len(mock_fs.files) == 0
Lark(g, parser='lalr', cache=True)
assert len(mock_fs.files) == 1
parser = Lark(g, parser='lalr', cache=True)
assert parser.parse('a') == Tree('start', [])

parser = Lark(g + ' "b"', parser='lalr', cache=True)
assert len(mock_fs.files) == 2
assert parser.parse('ab') == Tree('start', [])

parser = Lark(g, parser='lalr', cache=True)
assert parser.parse('a') == Tree('start', [])

# Test with custom lexer
mock_fs.files = {}
parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
assert len(mock_fs.files) == 1
assert parser.parse('a') == Tree('start', [])

# Test options persistence
mock_fs.files = {}
Lark(g, parser="lalr", debug=True, cache=True)
parser = Lark(g, parser="lalr", debug=True, cache=True)
assert parser.options.options['debug']

# Test inline transformer (tree-less) & lexer_callbacks
mock_fs.files = {}
g = """
start: add+
add: NUM "+" NUM
NUM: /\d+/
%ignore " "
"""
text = "1+2 3+4"
expected = Tree('start', [30, 70])

parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
res0 = parser.parse(text)
parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
assert len(mock_fs.files) == 1
res1 = parser.parse(text)
res2 = TestT().transform(Lark(g, parser="lalr", cache=True, lexer_callbacks={'NUM': append_zero}).parse(text))
assert res0 == res1 == res2 == expected

finally:
lark_module.FS = fs
Lark(self.g, parser='lalr', cache=fn)
assert fn in self.mock_fs.files
parser = Lark(self.g, parser='lalr', cache=fn)
assert parser.parse('a') == Tree('start', [])
def test_automatic_naming(self):
assert len(self.mock_fs.files) == 0
Lark(self.g, parser='lalr', cache=True)
assert len(self.mock_fs.files) == 1
parser = Lark(self.g, parser='lalr', cache=True)
assert parser.parse('a') == Tree('start', [])

parser = Lark(self.g + ' "b"', parser='lalr', cache=True)
assert len(self.mock_fs.files) == 2
assert parser.parse('ab') == Tree('start', [])

parser = Lark(self.g, parser='lalr', cache=True)
assert parser.parse('a') == Tree('start', [])
def test_custom_lexer(self):

parser = Lark(self.g, parser='lalr', lexer=CustomLexer, cache=True)
parser = Lark(self.g, parser='lalr', lexer=CustomLexer, cache=True)
assert len(self.mock_fs.files) == 1
assert parser.parse('a') == Tree('start', [])

def test_options(self):
# Test options persistence
Lark(self.g, parser="lalr", debug=True, cache=True)
parser = Lark(self.g, parser="lalr", debug=True, cache=True)
assert parser.options.options['debug']
def test_inline(self):
# Test inline transformer (tree-less) & lexer_callbacks
g = """
start: add+
add: NUM "+" NUM
NUM: /\d+/
%ignore " "
"""
text = "1+2 3+4"
expected = Tree('start', [30, 70])

parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
res0 = parser.parse(text)
parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
assert len(self.mock_fs.files) == 1
res1 = parser.parse(text)
res2 = TestT().transform(Lark(g, parser="lalr", cache=True, lexer_callbacks={'NUM': append_zero}).parse(text))
assert res0 == res1 == res2 == expected
def test_imports(self):
g = """
%import .grammars.ab (startab, expr)
"""
parser = Lark(g, parser='lalr', start='startab', cache=True)
assert len(self.mock_fs.files) == 1
parser = Lark(g, parser='lalr', start='startab', cache=True)
assert len(self.mock_fs.files) == 1
res = parser.parse("ab")
self.assertEqual(res, Tree('startab', [Tree('expr', ['a', 'b'])]))


if __name__ == '__main__':


Notiek ielāde…
Atcelt
Saglabāt