Browse Source

add support for archives, such as tar.gz...

main
John-Mark Gurney 2 years ago
parent
commit
b75a4d82da
8 changed files with 370 additions and 69 deletions
  1. +31
    -0
      ui/fixtures/cmd.container.tar.json
  2. BIN
      ui/fixtures/testfile.tar.gz
  3. +115
    -66
      ui/medashare/cli.py
  4. +12
    -0
      ui/medashare/magic
  5. +155
    -0
      ui/medashare/magic_wrap.py
  6. +1
    -0
      ui/medashare/tests.py
  7. +15
    -0
      ui/medashare/utils.py
  8. +41
    -3
      ui/setup.py

+ 31
- 0
ui/fixtures/cmd.container.tar.json View File

@@ -0,0 +1,31 @@
[
{
"title": "gen ident",
"cmd": [ "genident", "name=A Test User" ],
"exit": 0
},
{
"special": "setup tar file"
},
{
"title": "import tar.gz container",
"cmd": [ "container", "testfile.tar.gz" ]
},
{
"special": "verify store object cnt",
"comment": "should have one container and one file",
"count": 2
},
{
"title": "verify correct files imported",
"cmd": [ "dump" ],
"stdout_check": [
{ "type": "identity" },
{ "files": [ "testfiles/newfile.txt", "testfiles/test.txt" ],
"hashes": [ "sha512:90f8342520f0ac57fb5a779f5d331c2fa87aa40f8799940257f9ba619940951e67143a8d746535ed0284924b2b7bc1478f095198800ba96d01847d7b56ca465c", "sha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f" ],
"type": "container",
"uri": "hash://sha512/79fab684ca73e25994c1b739dcf8f03acf27dff74d63b4b3affd9aa69fbb37d23794b723802cad131969225403846f8f8c470bc2432c32de34d39f044a360073" },
{ "type": "file", "hashes": [ "sha512:79fab684ca73e25994c1b739dcf8f03acf27dff74d63b4b3affd9aa69fbb37d23794b723802cad131969225403846f8f8c470bc2432c32de34d39f044a360073" ] }
]
}
]

BIN
ui/fixtures/testfile.tar.gz View File


+ 115
- 66
ui/medashare/cli.py View File

@@ -18,17 +18,7 @@ if False:
logging.getLogger('sqlalchemy').addHandler(_handler)
logging.getLogger('sqlalchemy.engine').setLevel(lvl)

def _debprint(*args): # pragma: no cover
import traceback, sys, os.path
st = traceback.extract_stack(limit=2)[0]

sep = ''
if args:
sep = ':'

print('%s:%d%s' % (os.path.basename(st.filename), st.lineno, sep),
*args, file=_real_stderr)
sys.stderr.flush()
from .utils import _debprint

#import pdb, sys; mypdb = pdb.Pdb(stdout=sys.stderr); mypdb.set_trace()

@@ -38,6 +28,7 @@ from unittest import mock
from .hostid import hostuuid
from .tags import TagCache
from . import orm
from .magic_wrap import detect_from_filename

from .btv import _TestCases as bttestcase, validate_file

@@ -52,6 +43,7 @@ import importlib
import io
import itertools
import json
import libarchive
import magic
import os.path
import pathlib
@@ -78,6 +70,11 @@ _validhashes = set([ 'sha256', 'sha512' ])
_hashlengths = { len(getattr(hashlib, x)().hexdigest()): x for x in
_validhashes }

def _makehashuri(hashstr):
    '''Convert a hash string into its hash:// URI form.

    hashstr is anything ObjectStore.makehash accepts; the normalized
    '<alg>:<hexdigest>' result becomes 'hash://<alg>/<hexdigest>'.
    '''
    alg, digest = ObjectStore.makehash(hashstr).split(':')

    return 'hash://%s/%s' % (alg, digest)

def _keyordering(x):
k, v = x
try:
@@ -651,16 +648,20 @@ class ObjectStore(object):
def _readfp(fp):
while True:
r = fp.read(64*1024)
if r == b'':
# libarchive returns None on EOF
if r == b'' or r is None:
return

yield r

def _hashfile(fname):
    '''Hash the file at path fname.

    Returns '<alg>:<hexdigest>' using the module default hash; the
    actual work is delegated to _hashfp.  (The diff rendering left the
    removed inline hashing loop interleaved with the new delegation;
    this is the post-change form.)
    '''
    with open(fname, 'rb') as fp:
        return _hashfp(fp)

def _hashfp(fp):
    '''Hash the remaining contents of the open binary file object fp.

    Reads fp in chunks via _readfp and returns the digest formatted as
    '<alg>:<hexdigest>', where <alg> is the module default hash.
    '''
    hasher = hashlib.new(_defaulthash)
    for chunk in _readfp(fp):
        hasher.update(chunk)

    return f'{_defaulthash}:{hasher.hexdigest()}'

@@ -1219,7 +1220,7 @@ def cmd_dump(options, persona, objstr, cache):

def cmd_auto(options):
for i in options.files:
mf = magic.detect_from_filename(i)
mf = detect_from_filename(i)

primary = mf[0].split('/', 1)[0]
mt = mf[0]
@@ -1261,68 +1262,113 @@ def cmd_list(options, persona, objstr, cache):
# This is needed so that if it creates a FileObj, which may be
# expensive (hashing large file), that it gets saved.

@init_datastructs
def cmd_container(options, persona, objstr, cache):
for i in options.files:
with open(i, 'rb') as fp:
torrent = bencode.bdecode(fp.read())
bencodedinfo = bencode.bencode(torrent['info'])
infohash = hashlib.sha1(bencodedinfo).hexdigest()
def handle_bittorrent(fname, persona, objstr):
with open(fname, 'rb') as fp:
torrent = bencode.bdecode(fp.read())
bencodedinfo = bencode.bencode(torrent['info'])
infohash = hashlib.sha1(bencodedinfo).hexdigest()

# XXX - not entirely happy w/ URI
uri = 'magnet:?xt=urn:btih:%s&dn=%s' % (infohash,
torrent['info']['name'].decode('utf-8'))
# XXX - not entirely happy w/ URI
uri = 'magnet:?xt=urn:btih:%s&dn=%s' % (infohash,
torrent['info']['name'].decode('utf-8'))

try:
cont = objstr.by_id(Container.make_id(uri))
except KeyError:
pass
else:
if not 'incomplete' in cont:
print('Warning, container already complete, skipping %s.' % repr(fname), file=sys.stderr)
return

good, bad = validate_file(fname)

if bad:
print('Warning, incomple/invalid files, not added for %s:' %
repr(fname), file=sys.stderr)
print('\n'.join('\t%s' %
repr(str(pathlib.Path(*x.parts[1:]))) for x in
sorted(bad)), file=sys.stderr)

files = []
hashes = []
for j in sorted(good):
files.append(str(pathlib.PosixPath(*j.parts[1:])))
try:
cont = objstr.by_id(Container.make_id(uri))
except KeyError:
pass
else:
if not 'incomplete' in cont:
print('Warning, container already complete, skipping %s.' % repr(i), file=sys.stderr)
continue
fobj = objstr.by_file(j, ('file',))[0]
except:
fobj = persona.by_file(j)
objstr.loadobj(fobj)

good, bad = validate_file(i)
# XXX - ensure only one is added?
hashes.extend(fobj.hashes)

if bad:
print('Warning, incomple/invalid files, not added for %s:' % repr(i),
file=sys.stderr)
print('\n'.join('\t%s' %
repr(str(pathlib.Path(*x.parts[1:]))) for x in
sorted(bad)), file=sys.stderr)
kwargs = dict(files=files, hashes=hashes,
uri=uri)

if bad:
kwargs['incomplete'] = True

# XXX - doesn't combine files/hashes, that is if a
# Container has one set of good files, and then the
# next scan has a different set, only the second set
# will be present, not any from the first set.

try:
cont = objstr.by_id(Container.make_id(uri))
cont = cont.new_version(dels=() if bad
else ('incomplete',), replaces=kwargs.items())
except KeyError:
cont = persona.Container(**kwargs)

objstr.loadobj(cont)

def handle_archive(fname, persona, objstr):
with libarchive.Archive(fname) as arch:
files = []
hashes = []
for j in sorted(good):
files.append(str(pathlib.PosixPath(*j.parts[1:])))
try:
fobj = objstr.by_file(j, ('file',))[0]
except:
fobj = persona.by_file(j)
objstr.loadobj(fobj)

# XXX - ensure only one is added?
hashes.extend(fobj.hashes)
for i in arch:
if not i.isfile():
continue

kwargs = dict(files=files, hashes=hashes,
uri=uri)
files.append(i.pathname)

if bad:
kwargs['incomplete'] = True
with arch.readstream(i.size) as fp:
hashes.append(_hashfp(fp))

# XXX - doesn't combine files/hashes, that is if a
# Container has one set of good files, and then the
# next scan has a different set, only the second set
# will be present, not any from the first set.
try:
fobj = objstr.by_file(fname, ('file',))[0]
except:
fobj = persona.by_file(fname)
objstr.loadobj(fobj)

try:
cont = objstr.by_id(Container.make_id(uri))
cont = cont.new_version(dels=() if bad
else ('incomplete',), replaces=kwargs.items())
except KeyError:
cont = persona.Container(**kwargs)
uri = _makehashuri(fobj.hashes[0])

kwargs = dict(files=files, hashes=hashes,
uri=uri)
try:
cont = objstr.by_id(Container.make_id(uri))
# XXX - only update when different, check uri
cont = cont.new_version(replaces=kwargs.items())
except KeyError:
cont = persona.Container(**kwargs)

objstr.loadobj(cont)

# Maps a detected MIME type to the handler(fname, persona, objstr)
# that cmd_container dispatches to for importing that kind of
# container file.
_container_mapping = {
    'application/x-bittorrent': handle_bittorrent,
    'application/x-tar': handle_archive,
}

@init_datastructs
def cmd_container(options, persona, objstr, cache):
    '''Import each file in options.files as a container object.

    The file's MIME type is detected and dispatched through
    _container_mapping; the handler creates or updates the Container
    and stores it via objstr itself.  An unsupported MIME type raises
    KeyError from the mapping lookup.
    '''
    for i in options.files:
        mf = detect_from_filename(i)
        #_debprint('mf:', repr(mf))

        # dispatch on detected MIME type; the handlers do their own
        # objstr.loadobj() calls.  (A stale pre-dispatch line,
        # objstr.loadobj(cont) with cont undefined here, was left
        # interleaved by the diff rendering and is dropped.)
        fun = _container_mapping[mf.mime_type]

        fun(i, persona, objstr)

def _json_objstream(fp):
inp = fp.read()
@@ -2135,6 +2181,9 @@ class _TestCases(unittest.TestCase):
elif special == 'delete files':
for i in cmd['files']:
os.unlink(i)
elif special == 'setup tar file':
shutil.copy(self.fixtures /
'testfile.tar.gz', self.tempdir)
else: # pragma: no cover
raise ValueError('unhandled special: %s' % repr(special))

@@ -2181,7 +2230,6 @@ class _TestCases(unittest.TestCase):
if outcheck:
stdout.seek(0)
self.objcompare(_json_objstream(stdout), outcheck)

self.assertEqual(stderr.getvalue(), cmd.get('stderr', ''))

@@ -2218,7 +2266,8 @@ class _TestCases(unittest.TestCase):
self.setUp()

os.chdir(self.tempdir)
self.run_command_file(i)
with self.subTest(file=i):
self.run_command_file(i)

# XXX - the following test may no longer be needed
def test_main(self):


+ 12
- 0
ui/medashare/magic View File

@@ -0,0 +1,12 @@
# This file contains magic entries that are used by the tests and the
# code; they must be present for file-type detection to work properly.

# Transmission adds this
0 string d10:created\ by BitTorrent file
!:mime application/x-bittorrent
!:ext torrent
# BitTornado adds this
0 string d13:creation\ date BitTorrent file
!:mime application/x-bittorrent
!:ext torrent


+ 155
- 0
ui/medashare/magic_wrap.py View File

@@ -0,0 +1,155 @@
import functools
import importlib
import magic
import os
import pathlib
import shutil
import tempfile
import unittest

from .utils import _debprint

__doc__ = '''
This is a number of hacks to the Python magic module so that it works
better. These bugs should be fixed in the module, but I don't want to
deal w/ forking and getting the fixed upstreamed.
'''

magic.FileMagic = magic.namedtuple('FileMagic', ('mime_type', 'encoding',
'name', 'compressed_type'), defaults=[ '' ])

from magic import *

__all__ = [
'detect_from_filename',
'detect_from_content',
]

_mgc_data = {}
_lcl_mgc_data = None

# Wrapper magic.open so that we look past compression
_real_magic_open = magic.open

@functools.wraps(magic.open)
def open(flags):
    # NOTE: intentionally shadows builtins.open within this module.
    # Forcing MAGIC_COMPRESS makes libmagic report the type of the
    # decompressed payload (e.g. the tar inside a tar.gz) instead of
    # just 'application/gzip'.
    return _real_magic_open(flags|magic.MAGIC_COMPRESS)

# install the wrapper so all magic.open callers get MAGIC_COMPRESS
magic.open = open

def _create_filemagic(mime_detected, type_detected):
    '''Build a FileMagic tuple from libmagic's MIME string and description.

    mime_detected looks like:
        'application/x-tar; charset=binary compressed-encoding=application/gzip; charset=binary'
    The optional ' compressed-encoding=...' part (produced under
    MAGIC_COMPRESS) is split out into the compressed_type field.
    Raises ValueError (carrying the raw string) when mime_detected has
    no '; ' separator at all.
    '''
    try:
        mime_type, mime_encoding = mime_detected.split('; ', 1)
    except ValueError:
        # re-raise with the unparsable input so the failure is debuggable
        raise ValueError(mime_detected)

    kwargs = {}
    try:
        mime_encoding, compressed_type = mime_encoding.split(' compressed-encoding=')
    except ValueError:
        # no compressed-encoding present; compressed_type keeps its default
        pass
    else:
        # drop any trailing '; charset=...'; indexing [0] (instead of
        # tuple unpacking) also tolerates a value with no '; ' suffix,
        # which previously raised ValueError
        compressed_type = compressed_type.split('; ', 1)[0]
        kwargs['compressed_type'] = compressed_type

    return FileMagic(name=type_detected, mime_type=mime_type,
        encoding=mime_encoding.replace('charset=', ''), **kwargs)

# install our parser in place of the magic module's original
magic._create_filemagic = _create_filemagic

def _get_mgc_data(fname):
    '''Return the raw bytes of magic source file fname, memoized in _mgc_data.'''
    if fname not in _mgc_data:
        _mgc_data[fname] = pathlib.Path(fname).read_bytes()

    return _mgc_data[fname]

def _compile_file(inp, out, tempdir):
    '''Compile the magic source file inp into a binary .mgc at path out.

    libmagic's compile() writes '<basename>.mgc' into the current
    working directory, so we chdir into tempdir for the duration and
    restore the previous cwd afterwards.  inp and out are pathlib.Path
    objects; inp is assumed to live in tempdir — TODO confirm callers
    always arrange this.
    '''
    oldcwd = pathlib.Path.cwd()

    try:
        os.chdir(tempdir)

        mag = magic.open(magic.MAGIC_NONE)

        mag.compile(str(inp))

        # compile() produced '<inp stem>.mgc' beside inp; move it to
        # the requested destination
        inp.with_suffix('.mgc').rename(out)
    finally:
        # always restore the working directory, even on failure
        os.chdir(oldcwd)

def _compile_lcl():
    '''Compile the package's bundled 'magic' source and return the .mgc bytes.

    Copies the packaged 'magic' resource into a scratch directory,
    compiles it there via _compile_file, and returns the compiled
    data.  The scratch directory is always removed.
    '''
    magicfile = importlib.resources.files('medashare') / 'magic'

    # create the tempdir BEFORE entering the try block: previously the
    # finally clause referenced d, which would be unbound (NameError)
    # if mkdtemp itself failed, masking the original exception
    d = pathlib.Path(tempfile.mkdtemp()).resolve()
    try:
        # write out data
        inpfile = d / 'magic'
        inpfile.write_bytes(magicfile.read_bytes())

        # where it'll go
        outfile = d / 'someotherfile'
        _compile_file(inpfile, outfile, tempdir=d)

        return outfile.read_bytes()
    finally:
        shutil.rmtree(d)


def _get_mgc_res():
    '''Return the compiled magic data for the bundled magic file, cached.

    Prefers the pre-compiled 'magic.mgc' package resource; if that is
    absent, compiles the 'magic' source at runtime via _compile_lcl.
    The result is cached in the module global _lcl_mgc_data.
    '''
    global _lcl_mgc_data
    if _lcl_mgc_data is None:
        try:
            mgcfile = importlib.resources.files('medashare') / 'magic.mgc'
            _lcl_mgc_data = mgcfile.read_bytes()
        except FileNotFoundError:
            _lcl_mgc_data = _compile_lcl()
        # NOTE(review): debug dump of the loaded data; its placement
        # relative to the except block was reconstructed from a diff
        # with stripped indentation — confirm against upstream
        _debprint(repr(_lcl_mgc_data))

    return _lcl_mgc_data

# patch magic to load custom magic file
#
# Grab the raw libmagic entry points we need and give them ctypes
# signatures: magic_getpath() returns the default magic file path,
# magic_load_buffers() loads compiled magic from in-memory buffers.
_mgp = magic._libraries['magic'].magic_getpath
_mgp.restype = magic.c_char_p
_mgp.argtypes = [ magic.c_char_p, magic.c_int ]
_mlb = magic._libraries['magic'].magic_load_buffers
_mlb.restype = magic.c_int
_mlb.argtypes = [ magic.magic_t, magic.POINTER(magic.c_void_p), magic.POINTER(magic.c_size_t), magic.c_size_t ]

def _new_magic_load(self, fname=None):
    '''Replacement for magic.Magic.load that also loads our bundled magic.

    Builds a colon-separated magic path of the system default compiled
    database plus this package's 'magic' source file and loads both.
    The fname parameter is accepted for signature compatibility but
    ignored.
    '''
    files = _mgp(None, 0).decode('utf-8') + '.mgc' + ':' + str(pathlib.Path(__file__).parent / 'magic')

    return magic._load(self._magic_t, files.encode('utf-8'))

    # XXX - for some reason this code isn't working
    # NOTE(review): everything below is unreachable (after the return
    # above); it is a kept-for-reference attempt to load the compiled
    # magic data from memory via magic_load_buffers
    files = [ _mgp(None, 0).decode('utf-8') + '.mgc' ]

    buffers = [ _get_mgc_data(x) for x in files ] + [ _get_mgc_res() ]
    #buffers.reverse()
    del buffers[0]
    cnt = len(buffers)

    mgcdatas = [ (magic.c_char * len(x))(*x) for x in buffers ]

    bufs = (magic.c_void_p * cnt)(*(magic.cast(magic.pointer(x), magic.c_void_p) for x in mgcdatas))
    sizes = (magic.c_size_t * cnt)(*(len(x) for x in buffers))

    _debprint('mg:', cnt, repr([len(x) for x in buffers]))

    r = _mlb(self._magic_t, bufs, sizes, cnt)

    return r

# install the replacement loader
magic.Magic.load = _new_magic_load

class _TestMagic(unittest.TestCase):
    '''Unit tests for the magic-module patches in this file.'''

    def test_create_filemagic(self):
        # a tar inside gzip, as libmagic reports it under MAGIC_COMPRESS
        fm = _create_filemagic(
            'application/x-tar; charset=binary compressed-encoding=application/gzip; charset=binary',
            'foobar')

        self.assertEqual(fm.name, 'foobar')
        self.assertEqual(fm.mime_type, 'application/x-tar')
        self.assertEqual(fm.encoding, 'binary')
        self.assertEqual(fm.compressed_type, 'application/gzip')

+ 1
- 0
ui/medashare/tests.py View File

@@ -6,3 +6,4 @@ from .cli import _TestMigrations
from .tags import _TestTagCache
from .mtree import Test
from .server import _TestCases, _TestPostConfig
from .magic_wrap import _TestMagic

+ 15
- 0
ui/medashare/utils.py View File

@@ -1,8 +1,23 @@
import base64
import datetime
import pasn1
import sys
import uuid

_real_stderr = sys.stderr

def _debprint(*args): # pragma: no cover
import traceback, sys, os.path
st = traceback.extract_stack(limit=2)[0]

sep = ''
if args:
sep = ':'

print('%s:%d%s' % (os.path.basename(st.filename), st.lineno, sep),
*args, file=_real_stderr)
sys.stderr.flush()

def _makeuuid(s):
if isinstance(s, uuid.UUID):
return s


+ 41
- 3
ui/setup.py View File

@@ -2,13 +2,43 @@
# python setup.py --dry-run --verbose install

import os.path
from setuptools import setup, find_packages
import pathlib
import shutil
import subprocess
from setuptools import setup, find_packages, Command, Extension
from setuptools.command.build_ext import build_ext
#from medashare.magic_wrap import compile_file

from distutils.core import setup
class file_ext(build_ext):
def __init__(self, dist):
super().__init__(dist)

def run(self):
# do the building
#print(repr(self.distribution))
fnames = [ (x, pathlib.Path(self.build_lib) / 'medashare' / x) for x in self.get_source_files() ]


oldcwd = os.getcwd()
for src, dst in fnames:
os.chdir(oldcwd)
shutil.copyfile(src, dst)
os.chdir(dst.parent)
cmd = [ 'file', '-C', '-m' ] + [ str(dst) for src, dst in fnames ]
#print('running:', cmd)
r = subprocess.run(cmd)

os.chdir(oldcwd)
r.check_returncode()

def get_outputs(self):
return [ '%s.mgc' % i for i in self.get_source_files() ]

# method build_extension not needed, in run

setup(
name='medashare',
version='0.1.0',
version='0.1.1',
author='John-Mark Gurney',
author_email='jmg@funkthat.com',
packages=find_packages(),
@@ -18,6 +48,9 @@ setup(
#download_url='',
long_description=open('README.md').read(),
python_requires='>=3.8',
# This isn't needed till magic_wrap.py can use it
#cmdclass=dict(build_ext=file_ext),
#ext_modules=[ Extension(name='magic', sources=['medashare/magic']) ],
install_requires=[
'alembic',
'base58',
@@ -30,9 +63,14 @@ setup(
'hypercorn', # option, for server only?
'orm',
'pasn1 @ git+https://www.funkthat.com/gitea/jmg/pasn1.git@c6c64510b42292557ace2b77272eb32cb647399d#egg=pasn1',
'python-libarchive @ git+https://www.funkthat.com/gitea/jmg/python-libarchive.git#egg=python-libarchive',
'file-magic @ git+https://github.com/file/file.git#egg=file-magic&subdirectory=python',
'pydantic[dotenv]',
],
include_package_data=True,
package_data={
'medashare': [ 'alembic/**/*.py', 'alembic.ini', ],
},
extras_require = {
# requests needed for fastpi.testclient.TestClient
'dev': [ 'coverage', 'requests' ],


Loading…
Cancel
Save