Browse Source

add support for archives, such as tar.gz...

main
John-Mark Gurney 2 years ago
parent
commit
b75a4d82da
8 changed files with 370 additions and 69 deletions
  1. +31
    -0
      ui/fixtures/cmd.container.tar.json
  2. BIN
      ui/fixtures/testfile.tar.gz
  3. +115
    -66
      ui/medashare/cli.py
  4. +12
    -0
      ui/medashare/magic
  5. +155
    -0
      ui/medashare/magic_wrap.py
  6. +1
    -0
      ui/medashare/tests.py
  7. +15
    -0
      ui/medashare/utils.py
  8. +41
    -3
      ui/setup.py

+ 31
- 0
ui/fixtures/cmd.container.tar.json View File

@@ -0,0 +1,31 @@
[
{
"title": "gen ident",
"cmd": [ "genident", "name=A Test User" ],
"exit": 0
},
{
"special": "setup tar file"
},
{
"title": "import tar.gz container",
"cmd": [ "container", "testfile.tar.gz" ]
},
{
"special": "verify store object cnt",
"comment": "should have one container and one file",
"count": 2
},
{
"title": "verify correct files imported",
"cmd": [ "dump" ],
"stdout_check": [
{ "type": "identity" },
{ "files": [ "testfiles/newfile.txt", "testfiles/test.txt" ],
"hashes": [ "sha512:90f8342520f0ac57fb5a779f5d331c2fa87aa40f8799940257f9ba619940951e67143a8d746535ed0284924b2b7bc1478f095198800ba96d01847d7b56ca465c", "sha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f" ],
"type": "container",
"uri": "hash://sha512/79fab684ca73e25994c1b739dcf8f03acf27dff74d63b4b3affd9aa69fbb37d23794b723802cad131969225403846f8f8c470bc2432c32de34d39f044a360073" },
{ "type": "file", "hashes": [ "sha512:79fab684ca73e25994c1b739dcf8f03acf27dff74d63b4b3affd9aa69fbb37d23794b723802cad131969225403846f8f8c470bc2432c32de34d39f044a360073" ] }
]
}
]

BIN
ui/fixtures/testfile.tar.gz View File


+ 115
- 66
ui/medashare/cli.py View File

@@ -18,17 +18,7 @@ if False:
logging.getLogger('sqlalchemy').addHandler(_handler)
logging.getLogger('sqlalchemy.engine').setLevel(lvl)

def _debprint(*args): # pragma: no cover
import traceback, sys, os.path
st = traceback.extract_stack(limit=2)[0]

sep = ''
if args:
sep = ':'

print('%s:%d%s' % (os.path.basename(st.filename), st.lineno, sep),
*args, file=_real_stderr)
sys.stderr.flush()
from .utils import _debprint

#import pdb, sys; mypdb = pdb.Pdb(stdout=sys.stderr); mypdb.set_trace()

@@ -38,6 +28,7 @@ from unittest import mock
from .hostid import hostuuid
from .tags import TagCache
from . import orm
from .magic_wrap import detect_from_filename

from .btv import _TestCases as bttestcase, validate_file

@@ -52,6 +43,7 @@ import importlib
import io
import itertools
import json
import libarchive
import magic
import os.path
import pathlib
@@ -78,6 +70,11 @@ _validhashes = set([ 'sha256', 'sha512' ])
_hashlengths = { len(getattr(hashlib, x)().hexdigest()): x for x in
_validhashes }

def _makehashuri(hashstr):
    '''Convert a hash string into its hash:// URI form.

    hashstr is anything ObjectStore.makehash accepts; the normalized
    '<alg>:<hexdigest>' result becomes 'hash://<alg>/<hexdigest>'.
    '''
    alg, digest = ObjectStore.makehash(hashstr).split(':')

    return 'hash://%s/%s' % (alg, digest)

def _keyordering(x):
k, v = x
try:
@@ -651,16 +648,20 @@ class ObjectStore(object):
def _readfp(fp):
while True:
r = fp.read(64*1024)
if r == b'':
# libarchive returns None on EOF
if r == b'' or r is None:
return

yield r

def _hashfile(fname):
    '''Hash the file at path fname.

    Returns '<alg>:<hexdigest>' using the module default hash; the
    actual work is delegated to _hashfp.  (The diff rendering left the
    removed inline hashing loop interleaved with the new delegation;
    this is the post-change form.)
    '''
    with open(fname, 'rb') as fp:
        return _hashfp(fp)

def _hashfp(fp):
    '''Hash the remaining contents of the open binary file object fp.

    Reads fp in chunks via _readfp and returns the digest formatted as
    '<alg>:<hexdigest>', where <alg> is the module default hash.
    '''
    hasher = hashlib.new(_defaulthash)
    for chunk in _readfp(fp):
        hasher.update(chunk)

    return f'{_defaulthash}:{hasher.hexdigest()}'

@@ -1219,7 +1220,7 @@ def cmd_dump(options, persona, objstr, cache):

def cmd_auto(options):
for i in options.files:
mf = magic.detect_from_filename(i)
mf = detect_from_filename(i)

primary = mf[0].split('/', 1)[0]
mt = mf[0]
@@ -1261,68 +1262,113 @@ def cmd_list(options, persona, objstr, cache):
# This is needed so that if it creates a FileObj, which may be
# expensive (hashing large file), that it gets saved.

@init_datastructs
def cmd_container(options, persona, objstr, cache):
for i in options.files:
with open(i, 'rb') as fp:
torrent = bencode.bdecode(fp.read())
bencodedinfo = bencode.bencode(torrent['info'])
infohash = hashlib.sha1(bencodedinfo).hexdigest()
def handle_bittorrent(fname, persona, objstr):
with open(fname, 'rb') as fp:
torrent = bencode.bdecode(fp.read())
bencodedinfo = bencode.bencode(torrent['info'])
infohash = hashlib.sha1(bencodedinfo).hexdigest()

# XXX - not entirely happy w/ URI
uri = 'magnet:?xt=urn:btih:%s&dn=%s' % (infohash,
torrent['info']['name'].decode('utf-8'))
# XXX - not entirely happy w/ URI
uri = 'magnet:?xt=urn:btih:%s&dn=%s' % (infohash,
torrent['info']['name'].decode('utf-8'))

try:
cont = objstr.by_id(Container.make_id(uri))
except KeyError:
pass
else:
if not 'incomplete' in cont:
print('Warning, container already complete, skipping %s.' % repr(fname), file=sys.stderr)
return

good, bad = validate_file(fname)

if bad:
print('Warning, incomple/invalid files, not added for %s:' %
repr(fname), file=sys.stderr)
print('\n'.join('\t%s' %
repr(str(pathlib.Path(*x.parts[1:]))) for x in
sorted(bad)), file=sys.stderr)

files = []
hashes = []
for j in sorted(good):
files.append(str(pathlib.PosixPath(*j.parts[1:])))
try:
cont = objstr.by_id(Container.make_id(uri))
except KeyError:
pass
else:
if not 'incomplete' in cont:
print('Warning, container already complete, skipping %s.' % repr(i), file=sys.stderr)
continue
fobj = objstr.by_file(j, ('file',))[0]
except:
fobj = persona.by_file(j)
objstr.loadobj(fobj)

good, bad = validate_file(i)
# XXX - ensure only one is added?
hashes.extend(fobj.hashes)

if bad:
print('Warning, incomple/invalid files, not added for %s:' % repr(i),
file=sys.stderr)
print('\n'.join('\t%s' %
repr(str(pathlib.Path(*x.parts[1:]))) for x in
sorted(bad)), file=sys.stderr)
kwargs = dict(files=files, hashes=hashes,
uri=uri)

if bad:
kwargs['incomplete'] = True

# XXX - doesn't combine files/hashes, that is if a
# Container has one set of good files, and then the
# next scan has a different set, only the second set
# will be present, not any from the first set.

try:
cont = objstr.by_id(Container.make_id(uri))
cont = cont.new_version(dels=() if bad
else ('incomplete',), replaces=kwargs.items())
except KeyError:
cont = persona.Container(**kwargs)

objstr.loadobj(cont)

def handle_archive(fname, persona, objstr):
with libarchive.Archive(fname) as arch:
files = []
hashes = []
for j in sorted(good):
files.append(str(pathlib.PosixPath(*j.parts[1:])))
try:
fobj = objstr.by_file(j, ('file',))[0]
except:
fobj = persona.by_file(j)
objstr.loadobj(fobj)

# XXX - ensure only one is added?
hashes.extend(fobj.hashes)
for i in arch:
if not i.isfile():
continue

kwargs = dict(files=files, hashes=hashes,
uri=uri)
files.append(i.pathname)

if bad:
kwargs['incomplete'] = True
with arch.readstream(i.size) as fp:
hashes.append(_hashfp(fp))

# XXX - doesn't combine files/hashes, that is if a
# Container has one set of good files, and then the
# next scan has a different set, only the second set
# will be present, not any from the first set.
try:
fobj = objstr.by_file(fname, ('file',))[0]
except:
fobj = persona.by_file(fname)
objstr.loadobj(fobj)

try:
cont = objstr.by_id(Container.make_id(uri))
cont = cont.new_version(dels=() if bad
else ('incomplete',), replaces=kwargs.items())
except KeyError:
cont = persona.Container(**kwargs)
uri = _makehashuri(fobj.hashes[0])

kwargs = dict(files=files, hashes=hashes,
uri=uri)
try:
cont = objstr.by_id(Container.make_id(uri))
# XXX - only update when different, check uri
cont = cont.new_version(replaces=kwargs.items())
except KeyError:
cont = persona.Container(**kwargs)

objstr.loadobj(cont)

# Maps a detected MIME type to the handler(fname, persona, objstr)
# that cmd_container dispatches to for importing that kind of
# container file.
_container_mapping = {
    'application/x-bittorrent': handle_bittorrent,
    'application/x-tar': handle_archive,
}

@init_datastructs
def cmd_container(options, persona, objstr, cache):
    '''Import each file in options.files as a container object.

    The file's MIME type is detected and dispatched through
    _container_mapping; the handler creates or updates the Container
    and stores it via objstr itself.  An unsupported MIME type raises
    KeyError from the mapping lookup.
    '''
    for i in options.files:
        mf = detect_from_filename(i)
        #_debprint('mf:', repr(mf))

        # dispatch on detected MIME type; the handlers do their own
        # objstr.loadobj() calls.  (A stale pre-dispatch line,
        # objstr.loadobj(cont) with cont undefined here, was left
        # interleaved by the diff rendering and is dropped.)
        fun = _container_mapping[mf.mime_type]

        fun(i, persona, objstr)

def _json_objstream(fp):
inp = fp.read()
@@ -2135,6 +2181,9 @@ class _TestCases(unittest.TestCase):
elif special == 'delete files':
for i in cmd['files']:
os.unlink(i)
elif special == 'setup tar file':
shutil.copy(self.fixtures /
'testfile.tar.gz', self.tempdir)
else: # pragma: no cover
raise ValueError('unhandled special: %s' % repr(special))

@@ -2181,7 +2230,6 @@ class _TestCases(unittest.TestCase):
if outcheck:
stdout.seek(0)
self.objcompare(_json_objstream(stdout), outcheck)

self.assertEqual(stderr.getvalue(), cmd.get('stderr', ''))

@@ -2218,7 +2266,8 @@ class _TestCases(unittest.TestCase):
self.setUp()

os.chdir(self.tempdir)
self.run_command_file(i)
with self.subTest(file=i):
self.run_command_file(i)

# XXX - the following test may no longer be needed
def test_main(self):


+ 12
- 0
ui/medashare/magic View File

@@ -0,0 +1,12 @@
# This file contains magic entries that are used by the tests and the
# code; they must be present for file-type detection to work properly.

# Transmission adds this
0 string d10:created\ by BitTorrent file
!:mime application/x-bittorrent
!:ext torrent
# BitTornado adds this
0 string d13:creation\ date BitTorrent file
!:mime application/x-bittorrent
!:ext torrent


+ 155
- 0
ui/medashare/magic_wrap.py View File

@@ -0,0 +1,155 @@
import functools
import importlib
import magic
import os
import pathlib
import shutil
import tempfile
import unittest

from .utils import _debprint

__doc__ = '''
This is a number of hacks to the Python magic module so that it works
better. These bugs should be fixed in the module, but I don't want to
deal w/ forking and getting the fixed upstreamed.
'''

magic.FileMagic = magic.namedtuple('FileMagic', ('mime_type', 'encoding',
'name', 'compressed_type'), defaults=[ '' ])

from magic import *

__all__ = [
'detect_from_filename',
'detect_from_content',
]

_mgc_data = {}
_lcl_mgc_data = None

# Wrapper magic.open so that we look past compression
_real_magic_open = magic.open

@functools.wraps(magic.open)
def open(flags):
    # NOTE: intentionally shadows builtins.open within this module.
    # Forcing MAGIC_COMPRESS makes libmagic report the type of the
    # decompressed payload (e.g. the tar inside a tar.gz) instead of
    # just 'application/gzip'.
    return _real_magic_open(flags|magic.MAGIC_COMPRESS)

# install the wrapper so all magic.open callers get MAGIC_COMPRESS
magic.open = open

def _create_filemagic(mime_detected, type_detected):
    '''Build a FileMagic tuple from libmagic's MIME string and description.

    mime_detected looks like:
        'application/x-tar; charset=binary compressed-encoding=application/gzip; charset=binary'
    The optional ' compressed-encoding=...' part (produced under
    MAGIC_COMPRESS) is split out into the compressed_type field.
    Raises ValueError (carrying the raw string) when mime_detected has
    no '; ' separator at all.
    '''
    try:
        mime_type, mime_encoding = mime_detected.split('; ', 1)
    except ValueError:
        # re-raise with the unparsable input so the failure is debuggable
        raise ValueError(mime_detected)

    kwargs = {}
    try:
        mime_encoding, compressed_type = mime_encoding.split(' compressed-encoding=')
    except ValueError:
        # no compressed-encoding present; compressed_type keeps its default
        pass
    else:
        # drop any trailing '; charset=...'; indexing [0] (instead of
        # tuple unpacking) also tolerates a value with no '; ' suffix,
        # which previously raised ValueError
        compressed_type = compressed_type.split('; ', 1)[0]
        kwargs['compressed_type'] = compressed_type

    return FileMagic(name=type_detected, mime_type=mime_type,
        encoding=mime_encoding.replace('charset=', ''), **kwargs)

# install our parser in place of the magic module's original
magic._create_filemagic = _create_filemagic

def _get_mgc_data(fname):
    '''Return the raw bytes of magic source file fname, memoized in _mgc_data.'''
    if fname not in _mgc_data:
        _mgc_data[fname] = pathlib.Path(fname).read_bytes()

    return _mgc_data[fname]

def _compile_file(inp, out, tempdir):
    '''Compile the magic source file inp into a binary .mgc at path out.

    libmagic's compile() writes '<basename>.mgc' into the current
    working directory, so we chdir into tempdir for the duration and
    restore the previous cwd afterwards.  inp and out are pathlib.Path
    objects; inp is assumed to live in tempdir — TODO confirm callers
    always arrange this.
    '''
    oldcwd = pathlib.Path.cwd()

    try:
        os.chdir(tempdir)

        mag = magic.open(magic.MAGIC_NONE)

        mag.compile(str(inp))

        # compile() produced '<inp stem>.mgc' beside inp; move it to
        # the requested destination
        inp.with_suffix('.mgc').rename(out)
    finally:
        # always restore the working directory, even on failure
        os.chdir(oldcwd)

def _compile_lcl():
    '''Compile the package's bundled 'magic' source and return the .mgc bytes.

    Copies the packaged 'magic' resource into a scratch directory,
    compiles it there via _compile_file, and returns the compiled
    data.  The scratch directory is always removed.
    '''
    magicfile = importlib.resources.files('medashare') / 'magic'

    # create the tempdir BEFORE entering the try block: previously the
    # finally clause referenced d, which would be unbound (NameError)
    # if mkdtemp itself failed, masking the original exception
    d = pathlib.Path(tempfile.mkdtemp()).resolve()
    try:
        # write out data
        inpfile = d / 'magic'
        inpfile.write_bytes(magicfile.read_bytes())

        # where it'll go
        outfile = d / 'someotherfile'
        _compile_file(inpfile, outfile, tempdir=d)

        return outfile.read_bytes()
    finally:
        shutil.rmtree(d)


def _get_mgc_res():
    '''Return the compiled magic data for the bundled magic file, cached.

    Prefers the pre-compiled 'magic.mgc' package resource; if that is
    absent, compiles the 'magic' source at runtime via _compile_lcl.
    The result is cached in the module global _lcl_mgc_data.
    '''
    global _lcl_mgc_data
    if _lcl_mgc_data is None:
        try:
            mgcfile = importlib.resources.files('medashare') / 'magic.mgc'
            _lcl_mgc_data = mgcfile.read_bytes()
        except FileNotFoundError:
            _lcl_mgc_data = _compile_lcl()
        # NOTE(review): debug dump of the loaded data; its placement
        # relative to the except block was reconstructed from a diff
        # with stripped indentation — confirm against upstream
        _debprint(repr(_lcl_mgc_data))

    return _lcl_mgc_data

# patch magic to load custom magic file
#
# Grab the raw libmagic entry points we need and give them ctypes
# signatures: magic_getpath() returns the default magic file path,
# magic_load_buffers() loads compiled magic from in-memory buffers.
_mgp = magic._libraries['magic'].magic_getpath
_mgp.restype = magic.c_char_p
_mgp.argtypes = [ magic.c_char_p, magic.c_int ]
_mlb = magic._libraries['magic'].magic_load_buffers
_mlb.restype = magic.c_int
_mlb.argtypes = [ magic.magic_t, magic.POINTER(magic.c_void_p), magic.POINTER(magic.c_size_t), magic.c_size_t ]

def _new_magic_load(self, fname=None):
    '''Replacement for magic.Magic.load that also loads our bundled magic.

    Builds a colon-separated magic path of the system default compiled
    database plus this package's 'magic' source file and loads both.
    The fname parameter is accepted for signature compatibility but
    ignored.
    '''
    files = _mgp(None, 0).decode('utf-8') + '.mgc' + ':' + str(pathlib.Path(__file__).parent / 'magic')

    return magic._load(self._magic_t, files.encode('utf-8'))

    # XXX - for some reason this code isn't working
    # NOTE(review): everything below is unreachable (after the return
    # above); it is a kept-for-reference attempt to load the compiled
    # magic data from memory via magic_load_buffers
    files = [ _mgp(None, 0).decode('utf-8') + '.mgc' ]

    buffers = [ _get_mgc_data(x) for x in files ] + [ _get_mgc_res() ]
    #buffers.reverse()
    del buffers[0]
    cnt = len(buffers)

    mgcdatas = [ (magic.c_char * len(x))(*x) for x in buffers ]

    bufs = (magic.c_void_p * cnt)(*(magic.cast(magic.pointer(x), magic.c_void_p) for x in mgcdatas))
    sizes = (magic.c_size_t * cnt)(*(len(x) for x in buffers))

    _debprint('mg:', cnt, repr([len(x) for x in buffers]))

    r = _mlb(self._magic_t, bufs, sizes, cnt)

    return r

# install the replacement loader
magic.Magic.load = _new_magic_load

class _TestMagic(unittest.TestCase):
    '''Unit tests for the magic-module patches in this file.'''

    def test_create_filemagic(self):
        # a tar inside gzip, as libmagic reports it under MAGIC_COMPRESS
        fm = _create_filemagic(
            'application/x-tar; charset=binary compressed-encoding=application/gzip; charset=binary',
            'foobar')

        self.assertEqual(fm.name, 'foobar')
        self.assertEqual(fm.mime_type, 'application/x-tar')
        self.assertEqual(fm.encoding, 'binary')
        self.assertEqual(fm.compressed_type, 'application/gzip')

+ 1
- 0
ui/medashare/tests.py View File

@@ -6,3 +6,4 @@ from .cli import _TestMigrations
from .tags import _TestTagCache
from .mtree import Test
from .server import _TestCases, _TestPostConfig
from .magic_wrap import _TestMagic

+ 15
- 0
ui/medashare/utils.py View File

@@ -1,8 +1,23 @@
import base64
import datetime
import pasn1
import sys
import uuid

_real_stderr = sys.stderr

def _debprint(*args): # pragma: no cover
import traceback, sys, os.path
st = traceback.extract_stack(limit=2)[0]

sep = ''
if args:
sep = ':'

print('%s:%d%s' % (os.path.basename(st.filename), st.lineno, sep),
*args, file=_real_stderr)
sys.stderr.flush()

def _makeuuid(s):
if isinstance(s, uuid.UUID):
return s


+ 41
- 3
ui/setup.py View File

@@ -2,13 +2,43 @@
# python setup.py --dry-run --verbose install

import os.path
from setuptools import setup, find_packages
import pathlib
import shutil
import subprocess
from setuptools import setup, find_packages, Command, Extension
from setuptools.command.build_ext import build_ext
#from medashare.magic_wrap import compile_file

from distutils.core import setup
class file_ext(build_ext):
def __init__(self, dist):
super().__init__(dist)

def run(self):
# do the building
#print(repr(self.distribution))
fnames = [ (x, pathlib.Path(self.build_lib) / 'medashare' / x) for x in self.get_source_files() ]


oldcwd = os.getcwd()
for src, dst in fnames:
os.chdir(oldcwd)
shutil.copyfile(src, dst)
os.chdir(dst.parent)
cmd = [ 'file', '-C', '-m' ] + [ str(dst) for src, dst in fnames ]
#print('running:', cmd)
r = subprocess.run(cmd)

os.chdir(oldcwd)
r.check_returncode()

def get_outputs(self):
return [ '%s.mgc' % i for i in self.get_source_files() ]

# method build_extension not needed, in run

setup(
name='medashare',
version='0.1.0',
version='0.1.1',
author='John-Mark Gurney',
author_email='jmg@funkthat.com',
packages=find_packages(),
@@ -18,6 +48,9 @@ setup(
#download_url='',
long_description=open('README.md').read(),
python_requires='>=3.8',
# This isn't needed till magic_wrap.py can use it
#cmdclass=dict(build_ext=file_ext),
#ext_modules=[ Extension(name='magic', sources=['medashare/magic']) ],
install_requires=[
'alembic',
'base58',
@@ -30,9 +63,14 @@ setup(
'hypercorn', # option, for server only?
'orm',
'pasn1 @ git+https://www.funkthat.com/gitea/jmg/pasn1.git@c6c64510b42292557ace2b77272eb32cb647399d#egg=pasn1',
'python-libarchive @ git+https://www.funkthat.com/gitea/jmg/python-libarchive.git#egg=python-libarchive',
'file-magic @ git+https://github.com/file/file.git#egg=file-magic&subdirectory=python',
'pydantic[dotenv]',
],
include_package_data=True,
package_data={
'medashare': [ 'alembic/**/*.py', 'alembic.ini', ],
},
extras_require = {
# requests needed for fastpi.testclient.TestClient
'dev': [ 'coverage', 'requests' ],


Loading…
Cancel
Save