diff --git a/ui/fixtures/cmd.container.tar.json b/ui/fixtures/cmd.container.tar.json new file mode 100644 index 0000000..e089666 --- /dev/null +++ b/ui/fixtures/cmd.container.tar.json @@ -0,0 +1,31 @@ +[ +{ + "title": "gen ident", + "cmd": [ "genident", "name=A Test User" ], + "exit": 0 +}, +{ + "special": "setup tar file" +}, +{ + "title": "import tar.gz container", + "cmd": [ "container", "testfile.tar.gz" ] +}, +{ + "special": "verify store object cnt", + "comment": "should have one container and one file", + "count": 2 +}, +{ + "title": "verify correct files imported", + "cmd": [ "dump" ], + "stdout_check": [ + { "type": "identity" }, + { "files": [ "testfiles/newfile.txt", "testfiles/test.txt" ], + "hashes": [ "sha512:90f8342520f0ac57fb5a779f5d331c2fa87aa40f8799940257f9ba619940951e67143a8d746535ed0284924b2b7bc1478f095198800ba96d01847d7b56ca465c", "sha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f" ], + "type": "container", + "uri": "hash://sha512/79fab684ca73e25994c1b739dcf8f03acf27dff74d63b4b3affd9aa69fbb37d23794b723802cad131969225403846f8f8c470bc2432c32de34d39f044a360073" }, + { "type": "file", "hashes": [ "sha512:79fab684ca73e25994c1b739dcf8f03acf27dff74d63b4b3affd9aa69fbb37d23794b723802cad131969225403846f8f8c470bc2432c32de34d39f044a360073" ] } + ] +} +] diff --git a/ui/fixtures/testfile.tar.gz b/ui/fixtures/testfile.tar.gz new file mode 100644 index 0000000..2ccd671 Binary files /dev/null and b/ui/fixtures/testfile.tar.gz differ diff --git a/ui/medashare/cli.py b/ui/medashare/cli.py index f27f4a1..73673bd 100644 --- a/ui/medashare/cli.py +++ b/ui/medashare/cli.py @@ -18,17 +18,7 @@ if False: logging.getLogger('sqlalchemy').addHandler(_handler) logging.getLogger('sqlalchemy.engine').setLevel(lvl) -def _debprint(*args): # pragma: no cover - import traceback, sys, os.path - st = traceback.extract_stack(limit=2)[0] - - sep = '' - if args: - sep = ':' - - print('%s:%d%s' % (os.path.basename(st.filename), st.lineno, sep), - *args, file=_real_stderr) - sys.stderr.flush() +from .utils import _debprint #import pdb, sys; mypdb = pdb.Pdb(stdout=sys.stderr); mypdb.set_trace() @@ -38,6 +28,7 @@ from unittest import mock from .hostid import hostuuid from .tags import TagCache from . import orm +from .magic_wrap import detect_from_filename from .btv import _TestCases as bttestcase, validate_file @@ -52,6 +43,7 @@ import importlib import io import itertools import json +import libarchive import magic import os.path import pathlib @@ -78,6 +70,11 @@ _validhashes = set([ 'sha256', 'sha512' ]) _hashlengths = { len(getattr(hashlib, x)().hexdigest()): x for x in _validhashes } +def _makehashuri(hashstr): + hash, value = ObjectStore.makehash(hashstr).split(':') + + return f'hash://{hash}/{value}' + def _keyordering(x): k, v = x try: @@ -651,16 +648,20 @@ class ObjectStore(object): def _readfp(fp): while True: r = fp.read(64*1024) - if r == b'': + # libarchive returns None on EOF + if r == b'' or r is None: return yield r def _hashfile(fname): - hash = getattr(hashlib, _defaulthash)() with open(fname, 'rb') as fp: - for r in _readfp(fp): - hash.update(r) + return _hashfp(fp) + +def _hashfp(fp): + hash = getattr(hashlib, _defaulthash)() + for r in _readfp(fp): + hash.update(r) return '%s:%s' % (_defaulthash, hash.hexdigest()) @@ -1219,7 +1220,7 @@ def cmd_dump(options, persona, objstr, cache): def cmd_auto(options): for i in options.files: - mf = magic.detect_from_filename(i) + mf = detect_from_filename(i) primary = mf[0].split('/', 1)[0] mt = mf[0] @@ -1261,68 +1262,113 @@ def cmd_list(options, persona, objstr, cache): # This is needed so that if it creates a FileObj, which may be # expensive (hashing large file), that it gets saved. -@init_datastructs -def cmd_container(options, persona, objstr, cache): - for i in options.files: - with open(i, 'rb') as fp: - torrent = bencode.bdecode(fp.read()) - bencodedinfo = bencode.bencode(torrent['info']) - infohash = hashlib.sha1(bencodedinfo).hexdigest() +def handle_bittorrent(fname, persona, objstr): + with open(fname, 'rb') as fp: + torrent = bencode.bdecode(fp.read()) + bencodedinfo = bencode.bencode(torrent['info']) + infohash = hashlib.sha1(bencodedinfo).hexdigest() - # XXX - not entirely happy w/ URI - uri = 'magnet:?xt=urn:btih:%s&dn=%s' % (infohash, - torrent['info']['name'].decode('utf-8')) + # XXX - not entirely happy w/ URI + uri = 'magnet:?xt=urn:btih:%s&dn=%s' % (infohash, + torrent['info']['name'].decode('utf-8')) + try: + cont = objstr.by_id(Container.make_id(uri)) + except KeyError: + pass + else: + if not 'incomplete' in cont: + print('Warning, container already complete, skipping %s.' % repr(fname), file=sys.stderr) + return + + good, bad = validate_file(fname) + + if bad: + print('Warning, incomple/invalid files, not added for %s:' % + repr(fname), file=sys.stderr) + print('\n'.join('\t%s' % + repr(str(pathlib.Path(*x.parts[1:]))) for x in + sorted(bad)), file=sys.stderr) + + files = [] + hashes = [] + for j in sorted(good): + files.append(str(pathlib.PosixPath(*j.parts[1:]))) try: - cont = objstr.by_id(Container.make_id(uri)) - except KeyError: - pass - else: - if not 'incomplete' in cont: - print('Warning, container already complete, skipping %s.' % repr(i), file=sys.stderr) - continue + fobj = objstr.by_file(j, ('file',))[0] + except: + fobj = persona.by_file(j) + objstr.loadobj(fobj) - good, bad = validate_file(i) + # XXX - ensure only one is added? + hashes.extend(fobj.hashes) - if bad: - print('Warning, incomple/invalid files, not added for %s:' % repr(i), - file=sys.stderr) - print('\n'.join('\t%s' % - repr(str(pathlib.Path(*x.parts[1:]))) for x in - sorted(bad)), file=sys.stderr) + kwargs = dict(files=files, hashes=hashes, + uri=uri) + + if bad: + kwargs['incomplete'] = True + + # XXX - doesn't combine files/hashes, that is if a + # Container has one set of good files, and then the + # next scan has a different set, only the second set + # will be present, not any from the first set. + + try: + cont = objstr.by_id(Container.make_id(uri)) + cont = cont.new_version(dels=() if bad + else ('incomplete',), replaces=kwargs.items()) + except KeyError: + cont = persona.Container(**kwargs) + + objstr.loadobj(cont) +def handle_archive(fname, persona, objstr): + with libarchive.Archive(fname) as arch: files = [] hashes = [] - for j in sorted(good): - files.append(str(pathlib.PosixPath(*j.parts[1:]))) - try: - fobj = objstr.by_file(j, ('file',))[0] - except: - fobj = persona.by_file(j) - objstr.loadobj(fobj) - # XXX - ensure only one is added? - hashes.extend(fobj.hashes) + for i in arch: + if not i.isfile(): + continue - kwargs = dict(files=files, hashes=hashes, - uri=uri) + files.append(i.pathname) - if bad: - kwargs['incomplete'] = True + with arch.readstream(i.size) as fp: + hashes.append(_hashfp(fp)) - # XXX - doesn't combine files/hashes, that is if a - # Container has one set of good files, and then the - # next scan has a different set, only the second set - # will be present, not any from the first set. + try: + fobj = objstr.by_file(fname, ('file',))[0] + except: + fobj = persona.by_file(fname) + objstr.loadobj(fobj) - try: - cont = objstr.by_id(Container.make_id(uri)) - cont = cont.new_version(dels=() if bad - else ('incomplete',), replaces=kwargs.items()) - except KeyError: - cont = persona.Container(**kwargs) + uri = _makehashuri(fobj.hashes[0]) + + kwargs = dict(files=files, hashes=hashes, + uri=uri) + try: + cont = objstr.by_id(Container.make_id(uri)) + # XXX - only update when different, check uri + cont = cont.new_version(replaces=kwargs.items()) + except KeyError: + cont = persona.Container(**kwargs) + + objstr.loadobj(cont) + +_container_mapping = { + 'application/x-bittorrent': handle_bittorrent, + 'application/x-tar': handle_archive, +} + +@init_datastructs +def cmd_container(options, persona, objstr, cache): + for i in options.files: + mf = detect_from_filename(i) + #_debprint('mf:', repr(mf)) + fun = _container_mapping[mf.mime_type] - objstr.loadobj(cont) + fun(i, persona, objstr) def _json_objstream(fp): inp = fp.read() @@ -2135,6 +2181,9 @@ class _TestCases(unittest.TestCase): elif special == 'delete files': for i in cmd['files']: os.unlink(i) + elif special == 'setup tar file': + shutil.copy(self.fixtures / + 'testfile.tar.gz', self.tempdir) else: # pragma: no cover raise ValueError('unhandled special: %s' % repr(special)) @@ -2181,7 +2230,6 @@ class _TestCases(unittest.TestCase): if outcheck: stdout.seek(0) self.objcompare(_json_objstream(stdout), outcheck) - self.assertEqual(stderr.getvalue(), cmd.get('stderr', '')) @@ -2218,7 +2266,8 @@ class _TestCases(unittest.TestCase): self.setUp() os.chdir(self.tempdir) - self.run_command_file(i) + with self.subTest(file=i): + self.run_command_file(i) # XXX - the following test may no longer be needed def test_main(self): diff --git a/ui/medashare/magic b/ui/medashare/magic new file mode 100644 index 0000000..155fa22 --- /dev/null +++ b/ui/medashare/magic @@ -0,0 +1,12 @@ +# This file contains magic that is used by tests and the code +# that must be present to work properly. + +# Transmission adds this +0 string d10:created\ by BitTorrent file +!:mime application/x-bittorrent +!:ext torrent +# BitTornado adds this +0 string d13:creation\ date BitTorrent file +!:mime application/x-bittorrent +!:ext torrent + diff --git a/ui/medashare/magic_wrap.py b/ui/medashare/magic_wrap.py new file mode 100644 index 0000000..f8acdec --- /dev/null +++ b/ui/medashare/magic_wrap.py @@ -0,0 +1,155 @@ +import functools +import importlib +import magic +import os +import pathlib +import shutil +import tempfile +import unittest + +from .utils import _debprint + +__doc__ = ''' +This is a number of hacks to the Python magic module so that it works +better. These bugs should be fixed in the module, but I don't want to +deal w/ forking and getting the fixed upstreamed. +''' + +magic.FileMagic = magic.namedtuple('FileMagic', ('mime_type', 'encoding', + 'name', 'compressed_type'), defaults=[ '' ]) + +from magic import * + +__all__ = [ + 'detect_from_filename', + 'detect_from_content', +] + +_mgc_data = {} +_lcl_mgc_data = None + +# Wrapper magic.open so that we look past compression +_real_magic_open = magic.open + +@functools.wraps(magic.open) +def open(flags): + return _real_magic_open(flags|magic.MAGIC_COMPRESS) + +magic.open = open + +def _create_filemagic(mime_detected, type_detected): + try: + mime_type, mime_encoding = mime_detected.split('; ', 1) + except ValueError: + raise ValueError(mime_detected) + + kwargs = {} + try: + mime_encoding, compressed_type = mime_encoding.split(' compressed-encoding=') + except ValueError: + pass + else: + compressed_type, _ = compressed_type.split('; ', 1) + kwargs['compressed_type'] = compressed_type + + return FileMagic(name=type_detected, mime_type=mime_type, + encoding=mime_encoding.replace('charset=', ''), **kwargs) + +magic._create_filemagic = _create_filemagic + +def _get_mgc_data(fname): + try: + return _mgc_data[fname] + except KeyError: + data = pathlib.Path(fname).read_bytes() + _mgc_data[fname] = data + + return data + +def _compile_file(inp, out, tempdir): + oldcwd = pathlib.Path.cwd() + + try: + os.chdir(tempdir) + + mag = magic.open(magic.MAGIC_NONE) + + mag.compile(str(inp)) + + inp.with_suffix('.mgc').rename(out) + finally: + os.chdir(oldcwd) + +def _compile_lcl(): + magicfile = importlib.resources.files('medashare') / 'magic' + + try: + d = pathlib.Path(tempfile.mkdtemp()).resolve() + + # write out data + inpfile = d / 'magic' + inpfile.write_bytes(magicfile.read_bytes()) + + # where it'll go + outfile = d / 'someotherfile' + _compile_file(inpfile, outfile, tempdir=d) + + return outfile.read_bytes() + finally: + shutil.rmtree(d) + + +def _get_mgc_res(): + global _lcl_mgc_data + if _lcl_mgc_data is None: + try: + mgcfile = importlib.resources.files('medashare') / 'magic.mgc' + _lcl_mgc_data = mgcfile.read_bytes() + except FileNotFoundError: + _lcl_mgc_data = _compile_lcl() + _debprint(repr(_lcl_mgc_data)) + + return _lcl_mgc_data + +# patch magic to load custom magic file +_mgp = magic._libraries['magic'].magic_getpath +_mgp.restype = magic.c_char_p +_mgp.argtypes = [ magic.c_char_p, magic.c_int ] +_mlb = magic._libraries['magic'].magic_load_buffers +_mlb.restype = magic.c_int +_mlb.argtypes = [ magic.magic_t, magic.POINTER(magic.c_void_p), magic.POINTER(magic.c_size_t), magic.c_size_t ] + +def _new_magic_load(self, fname=None): + files = _mgp(None, 0).decode('utf-8') + '.mgc' + ':' + str(pathlib.Path(__file__).parent / 'magic') + + return magic._load(self._magic_t, files.encode('utf-8')) + + # XXX - for some reason this code isn't working + files = [ _mgp(None, 0).decode('utf-8') + '.mgc' ] + + buffers = [ _get_mgc_data(x) for x in files ] + [ _get_mgc_res() ] + #buffers.reverse() + del buffers[0] + cnt = len(buffers) + + mgcdatas = [ (magic.c_char * len(x))(*x) for x in buffers ] + + bufs = (magic.c_void_p * cnt)(*(magic.cast(magic.pointer(x), magic.c_void_p) for x in mgcdatas)) + sizes = (magic.c_size_t * cnt)(*(len(x) for x in buffers)) + + _debprint('mg:', cnt, repr([len(x) for x in buffers])) + + r = _mlb(self._magic_t, bufs, sizes, cnt) + + return r + +magic.Magic.load = _new_magic_load + +class _TestMagic(unittest.TestCase): + def test_create_filemagic(self): + a = _create_filemagic('application/x-tar; charset=binary compressed-encoding=application/gzip; charset=binary', 'foobar') + + self.assertEqual(a.mime_type, 'application/x-tar') + self.assertEqual(a.encoding, 'binary') + self.assertEqual(a.compressed_type, 'application/gzip') + self.assertEqual(a.name, 'foobar') diff --git a/ui/medashare/tests.py b/ui/medashare/tests.py index 08f2c2a..9cca7a3 100644 --- a/ui/medashare/tests.py +++ b/ui/medashare/tests.py @@ -6,3 +6,4 @@ from .cli import _TestMigrations from .tags import _TestTagCache from .mtree import Test from .server import _TestCases, _TestPostConfig +from .magic_wrap import _TestMagic diff --git a/ui/medashare/utils.py b/ui/medashare/utils.py index 8c09292..7832385 100644 --- a/ui/medashare/utils.py +++ b/ui/medashare/utils.py @@ -1,8 +1,23 @@ import base64 import datetime import pasn1 +import sys import uuid +_real_stderr = sys.stderr + +def _debprint(*args): # pragma: no cover + import traceback, sys, os.path + st = traceback.extract_stack(limit=2)[0] + + sep = '' + if args: + sep = ':' + + print('%s:%d%s' % (os.path.basename(st.filename), st.lineno, sep), + *args, file=_real_stderr) + sys.stderr.flush() + def _makeuuid(s): if isinstance(s, uuid.UUID): return s diff --git a/ui/setup.py b/ui/setup.py index 7498ff3..6c75328 100644 --- a/ui/setup.py +++ b/ui/setup.py @@ -2,13 +2,43 @@ # python setup.py --dry-run --verbose install import os.path -from setuptools import setup, find_packages +import pathlib +import shutil +import subprocess +from setuptools import setup, find_packages, Command, Extension +from setuptools.command.build_ext import build_ext +#from medashare.magic_wrap import compile_file -from distutils.core import setup +class file_ext(build_ext): + def __init__(self, dist): + super().__init__(dist) + + def run(self): + # do the building + #print(repr(self.distribution)) + fnames = [ (x, pathlib.Path(self.build_lib) / 'medashare' / x) for x in self.get_source_files() ] + + + oldcwd = os.getcwd() + for src, dst in fnames: + os.chdir(oldcwd) + shutil.copyfile(src, dst) + os.chdir(dst.parent) + cmd = [ 'file', '-C', '-m' ] + [ str(dst) for src, dst in fnames ] + #print('running:', cmd) + r = subprocess.run(cmd) + + os.chdir(oldcwd) + r.check_returncode() + + def get_outputs(self): + return [ '%s.mgc' % i for i in self.get_source_files() ] + + # method build_extension not needed, in run setup( name='medashare', - version='0.1.0', + version='0.1.1', author='John-Mark Gurney', author_email='jmg@funkthat.com', packages=find_packages(), @@ -18,6 +48,9 @@ setup( #download_url='', long_description=open('README.md').read(), python_requires='>=3.8', + # This isn't needed till magic_wrap.py can use it + #cmdclass=dict(build_ext=file_ext), + #ext_modules=[ Extension(name='magic', sources=['medashare/magic']) ], install_requires=[ 'alembic', 'base58', @@ -30,9 +63,14 @@ setup( 'hypercorn', # option, for server only? 'orm', 'pasn1 @ git+https://www.funkthat.com/gitea/jmg/pasn1.git@c6c64510b42292557ace2b77272eb32cb647399d#egg=pasn1', + 'python-libarchive @ git+https://www.funkthat.com/gitea/jmg/python-libarchive.git#egg=python-libarchive', 'file-magic @ git+https://github.com/file/file.git#egg=file-magic&subdirectory=python', 'pydantic[dotenv]', ], + include_package_data=True, + package_data={ + 'medashare': [ 'alembic/**/*.py', 'alembic.ini', ], + }, extras_require = { # requests needed for fastpi.testclient.TestClient 'dev': [ 'coverage', 'requests' ],