
Merge branch 'better-cache' of https://github.com/MegaIng/lark into MegaIng-better-cache

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.11.3
Erez Sh · 3 years ago
commit b556c39815

6 changed files with 146 additions and 91 deletions:

  1. lark-stubs/load_grammar.pyi   +2   -1
  2. lark/lark.py                  +23  -11
  3. lark/load_grammar.py          +31  -17
  4. lark/utils.py                 +15  -1
  5. setup.py                      +2   -1
  6. tests/test_cache.py           +73  -60

lark-stubs/load_grammar.pyi  (+2 -1)

@@ -14,8 +14,9 @@ class Grammar:
 class GrammarBuilder:
     global_keep_all_tokens: bool
     import_paths: List[Union[str, Callable]]
+    used_files: Dict[str, str]

-    def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None) -> None: ...
+    def __init__(self, global_keep_all_tokens: bool = False, import_paths: List[Union[str, Callable]] = None, used_files: Dict[str, str]=None) -> None: ...

     def load_grammar(self, grammar_text: str, grammar_name: str = ..., mangle: Callable[[str], str] = None) -> None: ...

lark/lark.py  (+23 -11)

@@ -9,7 +9,7 @@ import tempfile
 from warnings import warn

 from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger, ABC, abstractmethod
-from .load_grammar import load_grammar, FromPackageLoader, Grammar
+from .load_grammar import load_grammar, FromPackageLoader, Grammar, verify_used_files
 from .tree import Tree
 from .common import LexerConf, ParserConf

@@ -23,6 +23,11 @@ try:
     import regex
 except ImportError:
     regex = None
+try:
+    import atomicwrites
+except ImportError:
+    atomicwrites = None
+

 ###{standalone

@@ -100,7 +105,6 @@ class LarkOptions(Serialize):
         A List of either paths or loader functions to specify from where grammars are imported
     source_path
         Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading
-
     **=== End Options ===**
     """
     if __doc__:

@@ -262,15 +266,16 @@ class Lark(Serialize):
             unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals')
             options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable)
             from . import __version__
-            s = grammar + options_str + __version__
-            cache_md5 = hashlib.md5(s.encode()).hexdigest()
+            s = grammar + options_str + __version__ + str(sys.version_info[:2])
+            cache_md5 = hashlib.md5(s.encode('utf8')).hexdigest()

             if isinstance(self.options.cache, STRING_TYPE):
                 cache_fn = self.options.cache
             else:
                 if self.options.cache is not True:
                     raise ConfigurationError("cache argument must be bool or str")
-                cache_fn = tempfile.gettempdir() + '/.lark_cache_%s.tmp' % cache_md5
+                # Python 2.7 doesn't support * syntax in tuples
+                cache_fn = tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % ((cache_md5,) + sys.version_info[:2])

             if FS.exists(cache_fn):
                 logger.debug('Loading grammar from cache: %s', cache_fn)

@@ -279,16 +284,22 @@ class Lark(Serialize):
                     del options[name]
                 with FS.open(cache_fn, 'rb') as f:
                     file_md5 = f.readline().rstrip(b'\n')
-                    if file_md5 == cache_md5.encode():
+                    if file_md5 == cache_md5.encode('utf8') and verify_used_files(pickle.load(f)):
+                        old_options = self.options
                         try:
                             self._load(f, **options)
-                        except Exception:
-                            raise RuntimeError("Failed to load Lark from cache: %r. Try to delete the file and run again." % cache_fn)
-                        return
+                        except Exception:  # We should probably narrow down which errors we catch here.
+                            logger.exception("Failed to load Lark from cache: %r. We will try to carry on." % cache_fn)
+                            # In theory, the Lark instance might have been messed up by the call to `_load`.
+                            # In practice the only relevant thing that might have been overridden should be `options`
+                            self.options = old_options
+                        else:
+                            return

             # Parse the grammar file and compose the grammars
-            self.grammar = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
+            self.grammar, used_files = load_grammar(grammar, self.source_path, self.options.import_paths, self.options.keep_all_tokens)
         else:
             assert isinstance(grammar, Grammar)
             self.grammar = grammar

@@ -373,7 +384,8 @@ class Lark(Serialize):
         if cache_fn:
             logger.debug('Saving grammar to cache: %s', cache_fn)
             with FS.open(cache_fn, 'wb') as f:
-                f.write(b'%s\n' % cache_md5.encode())
+                f.write(b'%s\n' % cache_md5.encode('utf8'))
+                pickle.dump(used_files, f)
                 self.save(f)

 if __doc__:

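The cache-key change above is worth spelling out: the md5 now mixes in the interpreter's major/minor version, and the auto-generated temp-file name embeds it again, so parsers pickled by Python 3.8 and 3.9 no longer collide on one file. A minimal standalone sketch of the naming scheme (the inputs here are placeholders; the real code hashes the Lark option dict and lark.__version__):

import hashlib
import sys
import tempfile

def lark_cache_path(grammar, options_str, lark_version):
    # Hash everything that affects the generated parser, including the
    # interpreter version, since pickles aren't portable across versions.
    s = grammar + options_str + lark_version + str(sys.version_info[:2])
    cache_md5 = hashlib.md5(s.encode('utf8')).hexdigest()
    # Python 2.7 can't splat a tuple inside another tuple literal,
    # hence the explicit tuple concatenation.
    return tempfile.gettempdir() + '/.lark_cache_%s_%s_%s.tmp' % (
        (cache_md5,) + sys.version_info[:2])

# e.g. /tmp/.lark_cache_<md5>_3_9.tmp on CPython 3.9
print(lark_cache_path('start: "a"', 'parserlalr', '0.11.3'))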

lark/load_grammar.py  (+31 -17)

@@ -1,7 +1,8 @@
 """Parses and creates Grammar objects"""
+import hashlib
 import os.path
 import sys
+from collections import namedtuple
 from copy import copy, deepcopy
 from io import open
 import pkgutil

@@ -673,19 +674,7 @@ class Grammar:
         return terminals, compiled_rules, self.ignore


-class PackageResource(object):
-    """
-    Represents a path inside a Package. Used by `FromPackageLoader`
-    """
-    def __init__(self, pkg_name, path):
-        self.pkg_name = pkg_name
-        self.path = path
-
-    def __str__(self):
-        return "<%s: %s>" % (self.pkg_name, self.path)
-
-    def __repr__(self):
-        return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.path)
+PackageResource = namedtuple('PackageResource', 'pkg_name path')


 class FromPackageLoader(object):

@@ -931,9 +920,10 @@ def _mangle_exp(exp, mangle):

 class GrammarBuilder:
-    def __init__(self, global_keep_all_tokens=False, import_paths=None):
+    def __init__(self, global_keep_all_tokens=False, import_paths=None, used_files=None):
         self.global_keep_all_tokens = global_keep_all_tokens
         self.import_paths = import_paths or []
+        self.used_files = used_files or {}

         self._definitions = {}
         self._ignore_names = []

@@ -1153,7 +1143,12 @@ class GrammarBuilder:
             except IOError:
                 continue
             else:
-                gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths)
+                h = hashlib.md5(text.encode('utf8')).hexdigest()
+                if self.used_files.get(joined_path, h) != h:
+                    raise RuntimeError("Grammar file was changed during importing")
+                self.used_files[joined_path] = h
+
+                gb = GrammarBuilder(self.global_keep_all_tokens, self.import_paths, self.used_files)
                 gb.load_grammar(text, joined_path, mangle)
                 gb._remove_unused(map(mangle, aliases))
                 for name in gb._definitions:

@@ -1210,7 +1205,26 @@ class GrammarBuilder:
         # resolve_term_references(term_defs)
         return Grammar(rule_defs, term_defs, self._ignore_names)


+def verify_used_files(file_hashes):
+    for path, old in file_hashes.items():
+        text = None
+        if isinstance(path, str) and os.path.exists(path):
+            with open(path, encoding='utf8') as f:
+                text = f.read()
+        elif isinstance(path, PackageResource):
+            with suppress(IOError):
+                text = pkgutil.get_data(*path).decode('utf-8')
+        if text is None:  # We don't know how to load the path. ignore it.
+            continue
+        current = hashlib.md5(text.encode()).hexdigest()
+        if old != current:
+            logger.info("File %r changed, rebuilding Parser" % path)
+            return False
+    return True
+
+
 def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
     builder = GrammarBuilder(global_keep_all_tokens, import_paths)
     builder.load_grammar(grammar, source)
-    return builder.build()
+    return builder.build(), builder.used_files

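With this change, load_grammar returns a pair: the Grammar plus a dict mapping every %import-ed file (a path string or a PackageResource) to the md5 of its text at build time. That dict is what gets pickled into the cache file and later replayed through verify_used_files. A rough sketch of the round trip, with a made-up path and grammar text:

import hashlib
from lark.load_grammar import verify_used_files

# Hypothetical snapshot as it would be pickled into the cache: each
# imported grammar file keyed to the md5 hex digest of its contents.
used_files = {
    '/home/me/project/grammars/ab.lark':
        hashlib.md5('startab: "a" "b"'.encode('utf8')).hexdigest(),
}

# verify_used_files re-reads each path it knows how to load, re-hashes
# it, and returns False on the first mismatch -- the signal to rebuild.
# Paths that can't be loaded anymore are skipped rather than treated as
# stale, so this prints nothing for the made-up path above.
if not verify_used_files(used_files):
    print('an imported grammar changed; the cached parser is stale')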
lark/utils.py  (+15 -1)

@@ -1,3 +1,4 @@
+import hashlib
 import unicodedata
 import os
 from functools import reduce

@@ -6,6 +7,7 @@ from collections import deque
 ###{standalone
 import sys, re
 import logging
+from io import open
 logger = logging.getLogger("lark")
 logger.addHandler(logging.StreamHandler())
 # Set to highest level, since we have some warnings amongst the code

@@ -281,9 +283,21 @@ def combine_alternatives(lists):
     return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init)


+try:
+    import atomicwrites
+except ImportError:
+    atomicwrites = None
+
 class FS:
-    open = open
     exists = os.path.exists

+    @staticmethod
+    def open(name, mode="r", **kwargs):
+        if atomicwrites and "w" in mode:
+            return atomicwrites.atomic_write(name, mode=mode, override=True, **kwargs)
+        else:
+            return open(name, mode, **kwargs)
+

 def isascii(s):

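FS.open now routes writes through atomicwrites when it is installed, so a crash halfway through saving the cache leaves the previous file intact instead of a truncated pickle. The same optional-dependency pattern, sketched standalone; note that the atomicwrites API spells its keyword overwrite, so the override=True in the committed diff may be a typo that would get forwarded to open() and rejected:

try:
    import atomicwrites  # optional; enables crash-safe writes
except ImportError:
    atomicwrites = None

def safe_write(path, data):
    if atomicwrites:
        # atomic_write stages the output in a temp file next to `path`
        # and renames it into place on close, so a crash mid-write
        # leaves any existing file untouched.
        with atomicwrites.atomic_write(path, mode='w', overwrite=True) as f:
            f.write(data)
    else:
        # Plain open(): a crash here can leave a partial file behind.
        with open(path, 'w') as f:
            f.write(data)

safe_write('demo_cache.tmp', 'hello')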

setup.py  (+2 -1)

@@ -16,7 +16,8 @@ setup(

     extras_require = {
         "regex": ["regex"],
-        "nearley": ["js2py"]
+        "nearley": ["js2py"],
+        "atomicwrites": ["atomicwrites"],
     },

     package_data = {'': ['*.md', '*.lark'], 'lark-stubs': ['*.pyi']},

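The new extra keeps the dependency opt-in; users who want crash-safe cache writes request it at install time (assuming the PyPI name of the 0.11.x era, lark-parser):

pip install "lark-parser[atomicwrites]"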

tests/test_cache.py  (+73 -60)

@@ -58,71 +58,84 @@ def append_zero(t):

 class TestCache(TestCase):
+    g = '''start: "a"'''
+
     def setUp(self):
-        pass
+        self.fs = lark_module.FS
+        self.mock_fs = MockFS()
+        lark_module.FS = self.mock_fs
+
+    def tearDown(self):
+        self.mock_fs.files = {}
+        lark_module.FS = self.fs

     def test_simple(self):
-        g = '''start: "a"'''
-
         fn = "bla"

-        fs = lark_module.FS
-        mock_fs = MockFS()
-        try:
-            lark_module.FS = mock_fs
-            Lark(g, parser='lalr', cache=fn)
-            assert fn in mock_fs.files
-            parser = Lark(g, parser='lalr', cache=fn)
-            assert parser.parse('a') == Tree('start', [])
-
-            mock_fs.files = {}
-            assert len(mock_fs.files) == 0
-            Lark(g, parser='lalr', cache=True)
-            assert len(mock_fs.files) == 1
-            parser = Lark(g, parser='lalr', cache=True)
-            assert parser.parse('a') == Tree('start', [])
-
-            parser = Lark(g + ' "b"', parser='lalr', cache=True)
-            assert len(mock_fs.files) == 2
-            assert parser.parse('ab') == Tree('start', [])
-
-            parser = Lark(g, parser='lalr', cache=True)
-            assert parser.parse('a') == Tree('start', [])
-
-            # Test with custom lexer
-            mock_fs.files = {}
-            parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
-            parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
-            assert len(mock_fs.files) == 1
-            assert parser.parse('a') == Tree('start', [])
-
-            # Test options persistence
-            mock_fs.files = {}
-            Lark(g, parser="lalr", debug=True, cache=True)
-            parser = Lark(g, parser="lalr", debug=True, cache=True)
-            assert parser.options.options['debug']
-
-            # Test inline transformer (tree-less) & lexer_callbacks
-            mock_fs.files = {}
-            g = """
-            start: add+
-            add: NUM "+" NUM
-            NUM: /\d+/
-            %ignore " "
-            """
-            text = "1+2 3+4"
-            expected = Tree('start', [30, 70])
-
-            parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
-            res0 = parser.parse(text)
-            parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
-            assert len(mock_fs.files) == 1
-            res1 = parser.parse(text)
-            res2 = TestT().transform(Lark(g, parser="lalr", cache=True, lexer_callbacks={'NUM': append_zero}).parse(text))
-            assert res0 == res1 == res2 == expected
-
-        finally:
-            lark_module.FS = fs
+        Lark(self.g, parser='lalr', cache=fn)
+        assert fn in self.mock_fs.files
+        parser = Lark(self.g, parser='lalr', cache=fn)
+        assert parser.parse('a') == Tree('start', [])
+
+    def test_automatic_naming(self):
+        assert len(self.mock_fs.files) == 0
+        Lark(self.g, parser='lalr', cache=True)
+        assert len(self.mock_fs.files) == 1
+        parser = Lark(self.g, parser='lalr', cache=True)
+        assert parser.parse('a') == Tree('start', [])
+
+        parser = Lark(self.g + ' "b"', parser='lalr', cache=True)
+        assert len(self.mock_fs.files) == 2
+        assert parser.parse('ab') == Tree('start', [])
+
+        parser = Lark(self.g, parser='lalr', cache=True)
+        assert parser.parse('a') == Tree('start', [])
+
+    def test_custom_lexer(self):
+        parser = Lark(self.g, parser='lalr', lexer=CustomLexer, cache=True)
+        parser = Lark(self.g, parser='lalr', lexer=CustomLexer, cache=True)
+        assert len(self.mock_fs.files) == 1
+        assert parser.parse('a') == Tree('start', [])
+
+    def test_options(self):
+        # Test options persistence
+        Lark(self.g, parser="lalr", debug=True, cache=True)
+        parser = Lark(self.g, parser="lalr", debug=True, cache=True)
+        assert parser.options.options['debug']
+
+    def test_inline(self):
+        # Test inline transformer (tree-less) & lexer_callbacks
+        g = """
+        start: add+
+        add: NUM "+" NUM
+        NUM: /\d+/
+        %ignore " "
+        """
+        text = "1+2 3+4"
+        expected = Tree('start', [30, 70])
+
+        parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
+        res0 = parser.parse(text)
+        parser = Lark(g, parser='lalr', transformer=TestT(), cache=True, lexer_callbacks={'NUM': append_zero})
+        assert len(self.mock_fs.files) == 1
+        res1 = parser.parse(text)
+        res2 = TestT().transform(Lark(g, parser="lalr", cache=True, lexer_callbacks={'NUM': append_zero}).parse(text))
+        assert res0 == res1 == res2 == expected
+
+    def test_imports(self):
+        g = """
+        %import .grammars.ab (startab, expr)
+        """
+        parser = Lark(g, parser='lalr', start='startab', cache=True)
+        assert len(self.mock_fs.files) == 1
+        parser = Lark(g, parser='lalr', start='startab', cache=True)
+        assert len(self.mock_fs.files) == 1
+        res = parser.parse("ab")
+        self.assertEqual(res, Tree('startab', [Tree('expr', ['a', 'b'])]))


 if __name__ == '__main__':
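MockFS itself is defined near the top of the test file and isn't shown in this diff. A hypothetical minimal version, assuming all it must provide is the FS surface the cache code touches (exists plus open, keeping written bytes in a dict):

from io import BytesIO

class MockFile(BytesIO):
    def close(self):
        pass  # keep contents readable after the cache code 'closes' it

class MockFS:
    def __init__(self):
        self.files = {}  # filename -> MockFile

    def open(self, name, mode='r', **kwargs):
        if 'w' in mode:
            f = self.files[name] = MockFile()
        else:
            if name not in self.files:
                raise FileNotFoundError(name)
            f = self.files[name]
            f.seek(0)  # re-reads start from the beginning
        return f

    def exists(self, name):
        return name in self.files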

