#!/usr/bin/env python

#import pdb, sys; mypdb = pdb.Pdb(stdout=sys.stderr); mypdb.set_trace()

import copy
import datetime
import hashlib
import mock
import os.path
import pasn1
import shutil
import string
import tempfile
import unittest
import uuid

from contextlib import nested

# The UUID for the namespace representing the path to a file
_NAMESPACE_MEDASHARE_PATH = uuid.UUID('f6f36b62-3770-4a68-bc3d-dc3e31e429e6')

_defaulthash = 'sha512'
_validhashes = set([ 'sha256', 'sha512' ])

# Maps the length of a hex digest to the name of the hash producing it,
# used to guess the hash type of a bare digest.
_hashlengths = { len(getattr(hashlib, x)().hexdigest()): x
    for x in _validhashes }

# Characters allowed in the value part of a strictly validated hash.
_lowerhexdigits = frozenset(string.hexdigits.lower())

# Number of bytes read at a time when hashing a file.
_HASH_CHUNK_SIZE = 64 * 1024


def _iterdictlist(obj):
    '''Yield (key, value) pairs from obj ordered by key.

    obj only needs to provide items().  When a value is a list, one
    pair is yielded per element, with the elements in sorted order.
    Neither obj nor its lists are modified.'''

    for k, v in sorted(obj.items()):
        if isinstance(v, list):
            for i in sorted(v):
                yield k, i
        else:
            yield k, v


# XXX - add validation
class MDBase(object):
    '''This is a simple wrapper that turns a JSON object into a
    pythonesque object where attribute accesses work.'''

    # Properties that are filled in by calling the given function when
    # they are not supplied.
    _generated_properties = {
        'uuid': uuid.uuid4,
        'modified': datetime.datetime.utcnow
    }

    # Properties that must be present (type is defaulted from _type).
    _common_properties = [ 'type', 'created_by_ref' ] # XXX - add lang?
    _common_optional = [ 'overlay_refs' ]

    # Names that items() leaves out by default.
    _common_names = set(_common_properties) | set(_generated_properties)

    def __init__(self, obj):
        '''Wrap a deep copy of the dict obj.

        The type is defaulted from the class, and missing generated
        properties are created.  Raises ValueError if a common
        property is missing.'''

        obj = copy.deepcopy(obj)

        if 'type' not in obj:
            obj['type'] = self._type

        for x in self._common_properties:
            if x not in obj:
                raise ValueError('common property %s not present' %
                    repr(x))

        for x, fun in self._generated_properties.items():
            if x not in obj:
                obj[x] = fun()

        self._obj = obj

    @classmethod
    def create_obj(cls, obj):
        '''Using obj as a base, create an instance of MDBase of the
        correct type.

        obj may be a dict or an existing instance.  If the correct
        type is not found, a ValueError is raised.'''

        if isinstance(obj, cls):
            obj = obj._obj

        ty = obj['type']

        for i in MDBase.__subclasses__():
            if i._type == ty:
                return i(obj)

        raise ValueError('Unable to find class for type %s' % repr(ty))

    def new_version(self, *args):
        '''For each k, v pair, add the property k as an additional one
        (or a new one if first), with the value v.

        Returns a new object; this one is left unchanged.'''

        obj = copy.deepcopy(self._obj)

        for k, v in args:
            obj.setdefault(k, []).append(v)

        # dropped so that __init__ generates a fresh timestamp
        del obj['modified']

        return self.create_obj(obj)

    def __getattr__(self, k):
        # Note: a missing property raises KeyError, not AttributeError.
        return self._obj[k]

    def __getitem__(self, k):
        return self._obj[k]

    def __to_dict__(self):
        '''Return the underlying dict (not a copy).'''

        return self._obj

    def __eq__(self, o):
        if isinstance(o, MDBase):
            o = o._obj

        return self._obj == o

    def __ne__(self, o):
        # Python 2 does not derive != from __eq__; without this,
        # a != b falls back to comparing identity.
        return not self.__eq__(o)

    def __contains__(self, k):
        return k in self._obj

    def items(self, skipcommon=True):
        '''Return the properties as a list of (key, value) tuples.

        When skipcommon is true (the default), the common and generated
        properties (type, created_by_ref, uuid and modified) are left
        out.'''

        if not skipcommon:
            return list(self._obj.items())

        return [ (k, v) for k, v in self._obj.items()
            if k not in self._common_names ]


class MetaData(MDBase):
    _type = 'metadata'


def _trytodict(o):
    '''Coerce function for the ASN.1 coder.

    UUIDs are returned as their string form tagged 'unicode', anything
    else is returned as the result of its __to_dict__ method tagged
    'dict'.  Raises TypeError if that fails.'''

    if isinstance(o, uuid.UUID):
        return 'unicode', str(o)

    try:
        return 'dict', o.__to_dict__()
    except Exception: # pragma: no cover
        raise TypeError('unable to find __to_dict__ on %s: %s' %
            (type(o), repr(o)))

_asn1coder = pasn1.ASN1DictCoder(coerce=_trytodict)


class ObjectStore(object):
    '''A container to store the various Metadata objects.'''

    # The _uuids property contains both the UUIDv4 for objects, and
    # looking up the UUIDv5 for FileObjects.
    # NOTE(review): loadobj only indexes objects by their 'uuid'
    # property, so the UUIDv5 'id' of a FileObject does not appear to
    # ever be a key of _uuids -- confirm whether that is intended.

    def __init__(self, created_by_ref):
        self._created_by_ref = created_by_ref
        self._uuids = {}    # uuid.UUID -> object
        self._hashes = {}   # 'hashname:hexdigest' -> list of objects

    @staticmethod
    def makehash(hashstr, strict=True):
        '''Take a hash string, and return a valid hash string from it.

        This makes sure that it is of a valid hash type, and when
        strict is true, that the value only contains lower case hex
        digits.  If strict is False, and hashstr is not of the form
        type:value, the hash type is detected from the length of
        hashstr.

        Raises ValueError when the hash is invalid, and KeyError when
        the type cannot be detected from the length.'''

        try:
            hashname, value = hashstr.split(':')
        except ValueError:
            if strict:
                raise

            hashname = _hashlengths[len(hashstr)]
            value = hashstr

        if strict and set(value) - _lowerhexdigits:
            raise ValueError(
                'value has invalid hex digits (must be lower case)',
                value)

        if hashname in _validhashes:
            return ':'.join((hashname, value))

        raise ValueError('invalid hash type', hashname)

    def __len__(self):
        return len(self._uuids)

    def store(self, fname):
        '''Write out the objects in the store to the file named
        fname.'''

        obj = {
            'created_by_ref': self._created_by_ref,
            'objects': list(self._uuids.values()),
        }

        # the encoded data is binary, do not let text mode mangle it
        with open(fname, 'wb') as fp:
            fp.write(_asn1coder.dumps(obj))

    def _unindex(self, obj):
        '''Remove obj from the lists of the hash index.'''

        for j in obj.hashes:
            h = self.makehash(j)
            remaining = [ x for x in self._hashes.get(h, ())
                if x is not obj ]
            if remaining:
                self._hashes[h] = remaining
            else:
                self._hashes.pop(h, None)

    def loadobj(self, obj):
        '''Load obj into the data store.

        obj may be a dict or an MDBase instance.  If an object with
        the same uuid is already present, it is replaced.'''

        obj = MDBase.create_obj(obj)

        if isinstance(obj.uuid, uuid.UUID):
            uid = obj.uuid
        else:
            uid = uuid.UUID(obj.uuid)

        # validate the hashes before changing anything
        hashes = [ self.makehash(j) for j in obj.hashes ]

        # Drop the version being replaced from the hash index,
        # otherwise lookups by hash return both the old and the new
        # version of the object.
        old = self._uuids.get(uid)
        if old is not None:
            self._unindex(old)

        self._uuids[uid] = obj
        for h in hashes:
            self._hashes.setdefault(h, []).append(obj)

    @classmethod
    def load(cls, fname):
        '''Load objects from the provided file name.

        Basic validation will be done on the objects in the file.

        The objects will be accessible via other methods.'''

        with open(fname, 'rb') as fp:
            objs = _asn1coder.loads(fp.read())

        obj = cls(objs['created_by_ref'])

        for i in objs['objects']:
            obj.loadobj(i)

        return obj

    def by_id(self, id):
        '''Look up an object by its UUID.

        id may be a uuid.UUID or its string form.  Raises KeyError if
        there is no such object.'''

        if isinstance(id, uuid.UUID):
            uid = id
        else:
            uid = uuid.UUID(id)

        return self._uuids[uid]

    def by_hash(self, hash):
        '''Look up the list of objects with the given hash value.

        The hash type prefix is optional.  Raises KeyError if there
        are no objects with that hash.'''

        h = self.makehash(hash, strict=False)

        return self._hashes[h]

    def by_file(self, fname):
        '''Return a list of the metadata objects for the file named
        fname.

        Raises KeyError if there is no metadata for the file.'''

        fid = FileObject.make_id(fname)

        try:
            fobj = self.by_id(fid)
        except KeyError:
            # unable to find it
            fobj = FileObject.from_file(fname, self._created_by_ref)
            self.loadobj(fobj)

        for i in fobj.hashes:
            # Filter out non-metadata objects
            j = [ x for x in self.by_hash(i) if x.type == 'metadata' ]

            if j:
                return j

        raise KeyError('unable to find metadata for file')


def _hashfile(fname):
    '''Return the hash string (type:hexdigest) of the contents of the
    file named fname, using the default hash.'''

    hash = getattr(hashlib, _defaulthash)()

    # Binary mode so the bytes hashed are the bytes on disk, and read
    # in chunks so large files are not held in memory.
    with open(fname, 'rb') as fp:
        for chunk in iter(lambda: fp.read(_HASH_CHUNK_SIZE), b''):
            hash.update(chunk)

    return '%s:%s' % (_defaulthash, hash.hexdigest())


class FileObject(MDBase):
    _type = 'file'

    @staticmethod
    def make_id(fname):
        '''Take a local file name, and make the id for it.

        The id is the UUIDv5 of the real path of the file, with the
        directory and the file name joined by a forward slash, so that
        it will be the same between Windows and Unix systems.'''

        # NOTE(review): only the last path separator is replaced, the
        # directory part keeps the local separators.  Left as is, since
        # changing it would change the ids of existing objects.
        fname = os.path.realpath(fname)

        return uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
            '/'.join(os.path.split(fname)))

    @classmethod
    def from_file(cls, filename, created_by_ref):
        '''Create a FileObject describing the file named filename.

        The file is stat'd and its contents are hashed.'''

        s = os.stat(filename)

        obj = {
            'dir': os.path.dirname(filename),
            'created_by_ref': created_by_ref,
            'filename': os.path.basename(filename),
            'id': cls.make_id(filename),
            'mtime': datetime.datetime.utcfromtimestamp(s.st_mtime),
            'size': s.st_size,
            'hashes': [ _hashfile(filename), ],
        }

        return cls(obj)


def enumeratedir(_dir, created_by_ref):
    '''Enumerate all the files and directories (not recursive) in
    _dir.

    Returned is a list of FileObjects, ordered by file name.'''

    # sorted so the result does not depend on the order of os.listdir
    return [ FileObject.from_file(os.path.join(_dir, x), created_by_ref)
        for x in sorted(os.listdir(_dir)) ]


def _delete_property(obj, spec):
    '''Apply one delete argument of the command line to the dict obj.

    spec is either tag, which removes the tag with all its values, or
    tag=value, which removes only that value from the tag.

    Returns True if something was removed, False if there was nothing
    matching to remove.'''

    key, sep, value = spec.partition('=')

    if key not in obj:
        return False

    if not sep:
        del obj[key]
        return True

    cur = obj[key]
    if isinstance(cur, list):
        if value not in cur:
            return False

        cur.remove(value)
        return True

    # a property that only has a single, non-list, value
    if cur != value:
        return False

    del obj[key]
    return True


def main():
    '''Command line interface: list, add or delete the metadata of the
    files given as arguments, then write the object store back out.'''

    from optparse import OptionParser
    import sys

    parser = OptionParser()
    parser.add_option('-a', action='append', dest='add',
        default=[], help='add the arg as metadata for files, tag=value')
    parser.add_option('-d', action='append', dest='delete',
        default=[], help='delete the arg as metadata from files. Either specify tag, and all tags are removed, or specify tag=value and that specific tag will be removed.')
    parser.add_option('-l', action='store_true', dest='list',
        default=False, help='list metadata')

    options, args = parser.parse_args()

    storefname = os.path.expanduser('~/.medashare_store.pasn1')
    objstr = ObjectStore.load(storefname)

    if options.list:
        for i in args:
            for j in objstr.by_file(i):
                for k, v in _iterdictlist(j):
                    print('%s:\t%s' % (k, v))
    elif options.add:
        addprops = [ x.split('=', 1) for x in options.add ]
        for i in args:
            for j in objstr.by_file(i):
                nobj = j.new_version(*addprops)
                objstr.loadobj(nobj)
    elif options.delete:
        for i in args:
            for j in objstr.by_file(i):
                # work on a copy, __to_dict__ returns the dict of the
                # object that is in the store
                obj = copy.deepcopy(j.__to_dict__())

                for k in options.delete:
                    if not _delete_property(obj, k):
                        sys.stderr.write(
                            'no metadata matching %s to delete\n' %
                            repr(k))

                # as in new_version, dropped so that the new version
                # gets a fresh timestamp
                obj.pop('modified', None)

                objstr.loadobj(MDBase.create_obj(obj))
    else: # pragma: no cover
        raise NotImplementedError

    objstr.store(storefname)

if __name__ == '__main__': # pragma: no cover
    main()


class _TestCases(unittest.TestCase):
    created_by_ref = '867c7563-79ae-435c-a265-9d8509cefac5'

    def setUp(self):
        d = os.path.realpath(tempfile.mkdtemp())
        self.basetempdir = d
        self.tempdir = os.path.join(d, 'subdir')

        shutil.copytree(os.path.join('fixtures', 'testfiles'),
            self.tempdir)

    def tearDown(self):
        shutil.rmtree(self.basetempdir)
        self.tempdir = None

    def _run_main(self, *args):
        '''Run main() with args as the command line arguments, and
        return what it wrote to stdout.'''

        import StringIO

        with nested(mock.patch('sys.stdout', StringIO.StringIO()),
            mock.patch('sys.argv', [ 'progname' ] + list(args))) \
            as (stdout, argv):
            main()

        return stdout.getvalue()

    def test_mdbase(self):
        self.assertRaises(ValueError, MDBase.create_obj,
            { 'type': 'unknosldkfj' })
        self.assertRaises(ValueError, MDBase.create_obj,
            { 'type': 'metadata' })

        baseobj = {
            'type': 'metadata',
            'created_by_ref': self.created_by_ref,
        }
        origbase = copy.deepcopy(baseobj)

        # that when an MDBase object is created
        md = MDBase.create_obj(baseobj)

        # it doesn't modify the passed in object (when adding
        # generated properties)
        self.assertEqual(baseobj, origbase)

        # and it has the generated properties
        # Note: cannot mock the functions as they are already
        # referenced at creation time
        self.assertIn('uuid', md)
        self.assertIn('modified', md)

        # That you can create a new version using new_version
        md2 = md.new_version(('dc:creator', 'Jim Bob',))

        # that they are different
        self.assertNotEqual(md, md2)

        # and that the new modified time is different from the old
        self.assertNotEqual(md.modified, md2.modified)

        # and that the modification is present
        self.assertEqual(md2['dc:creator'], [ 'Jim Bob' ])

    def test_makehash(self):
        self.assertRaises(ValueError, ObjectStore.makehash, 'slkj')
        self.assertRaises(ValueError, ObjectStore.makehash, 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ADA')
        self.assertRaises(ValueError, ObjectStore.makehash, 'bogushash:9e0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ADA', strict=False)

        self.assertEqual(ObjectStore.makehash('cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e', strict=False), 'sha512:cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
        self.assertEqual(ObjectStore.makehash('e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', strict=False), 'sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855')

    def test_enumeratedir(self):
        files = enumeratedir(self.tempdir, self.created_by_ref)
        ftest = files[0]
        fname = 'test.txt'

        # make sure that they are of type MDBase
        self.assertIsInstance(ftest, MDBase)

        oldid = ftest.id
        self.assertEqual(ftest.filename, fname)
        self.assertEqual(ftest.dir, self.tempdir)
        # XXX - do we add host information?
        self.assertEqual(ftest.id, uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
            '/'.join(os.path.split(self.tempdir) + ( fname, ))))
        self.assertEqual(ftest.mtime,
            datetime.datetime(2019, 5, 20, 21, 47, 36))
        self.assertEqual(ftest.size, 15)
        self.assertIn('sha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f', ftest.hashes)

        # XXX - make sure works w/ relative dirs
        files = enumeratedir(os.path.relpath(self.tempdir),
            self.created_by_ref)
        self.assertEqual(oldid, files[0].id)

    def test_mdbaseoverlay(self):
        objst = ObjectStore(self.created_by_ref)

        # that a base object
        bid = uuid.uuid4()
        objst.loadobj({
            'type': 'metadata',
            'uuid': bid,
            'modified': datetime.datetime(2019, 6, 10, 14, 3, 10),
            'created_by_ref': self.created_by_ref,
            'hashes': [ 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada' ],
            'someprop': [ 'somevalue' ],
            'lang': 'en',
        })

        # can have an overlay object
        oid = uuid.uuid4()
        dhash = 'sha256:a7c96262c21db9a06fd49e307d694fd95f624569f9b35bb3ffacd880440f9787'
        objst.loadobj({
            'type': 'metadata',
            'uuid': oid,
            'modified': datetime.datetime(2019, 6, 10, 18, 3, 10),
            'created_by_ref': self.created_by_ref,
            'hashes': [ dhash ],
            'overlay_refs': [ bid ],
            'lang': 'en',
        })

        # and that when you get its properties
        oobj = objst.by_id(oid)
        odict = dict(oobj.items())

        # that it has the overlays property
        self.assertEqual(odict['overlay_refs'], [ bid ])

    def test_objectstore(self):
        objst = ObjectStore.load(os.path.join('fixtures',
            'sample.data.pasn1'))

        objst.loadobj({
            'type': 'metadata',
            'uuid': 'c9a1d1e2-3109-4efd-8948-577dc15e44e7',
            'modified': datetime.datetime(2019, 5, 31, 14, 3, 10),
            'created_by_ref': self.created_by_ref,
            'hashes': [ 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada' ],
            'lang': 'en',
        })

        lst = objst.by_hash('91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada')
        self.assertEqual(len(lst), 2)

        byid = objst.by_id('3e466e06-45de-4ecc-84ba-2d2a3d970e96')

        self.assertIsInstance(byid, MetaData)
        self.assertIn(byid, lst)

        r = byid

        self.assertEqual(r.uuid, '3e466e06-45de-4ecc-84ba-2d2a3d970e96')
        self.assertEqual(r['dc:creator'], [ u'John-Mark Gurney' ])

        fname = 'testfile.pasn1'
        objst.store(fname)

        with open(fname, 'rb') as fp:
            objs = _asn1coder.loads(fp.read())

        os.unlink(fname)

        self.assertEqual(len(objs), len(objst))

        self.assertEqual(objs['created_by_ref'], self.created_by_ref)

        for i in objs['objects']:
            self.assertEqual(objst.by_id(i['uuid']), i)

        testfname = os.path.join(self.tempdir, 'test.txt')
        self.assertEqual(objst.by_file(testfname), [ byid ])
        self.assertEqual(objst.by_file(testfname), [ byid ])

        self.assertRaises(KeyError, objst.by_file, '/dev/null')

        # XXX make sure that object store contains fileobject

        # Tests to add:
        # Non-duplicates when same metadata is located by multiple hashes.

    def test_main(self):
        # Test the main runner, this is only testing things that are
        # specific to running the program, like where the store is
        # created.

        # setup object store
        storefname = os.path.join(self.tempdir, 'storefname')
        shutil.copy(os.path.join('fixtures', 'sample.data.pasn1'),
            storefname)

        # setup test fname
        testfname = os.path.join(self.tempdir, 'test.txt')

        # the part of the listing that is the same for every check
        tail = 'hashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n'

        import itertools

        with mock.patch('os.path.expanduser',
            side_effect=itertools.repeat(storefname)) as eu:
            self.assertEqual(self._run_main('-l', testfname),
                'dc:creator:\tJohn-Mark Gurney\n' + tail)
            eu.assert_called_with('~/.medashare_store.pasn1')

            self._run_main('-a', 'dc:creator=Another user',
                '-a', 'foo=bar=baz', testfname)

            self.assertEqual(self._run_main('-l', testfname),
                'dc:creator:\tAnother user\ndc:creator:\tJohn-Mark Gurney\nfoo:\tbar=baz\n' + tail)

            self._run_main('-d', 'dc:creator', testfname)

            self.assertEqual(self._run_main('-l', testfname),
                'foo:\tbar=baz\n' + tail)

            self._run_main('-a', 'foo=bleh', testfname)

            self.assertEqual(self._run_main('-l', testfname),
                'foo:\tbar=baz\nfoo:\tbleh\n' + tail)

            self._run_main('-d', 'foo=bar=baz', testfname)

            self.assertEqual(self._run_main('-l', testfname),
                'foo:\tbleh\n' + tail)