MetaData Sharing

#!/usr/bin/env python

#import pdb, sys; mypdb = pdb.Pdb(stdout=sys.stderr); mypdb.set_trace()

import copy
import datetime
import hashlib
import mock
import os.path
import pasn1
import shutil
import string
import tempfile
import unittest
import uuid

from contextlib import nested

# The UUID for the namespace representing the path to a file
_NAMESPACE_MEDASHARE_PATH = uuid.UUID('f6f36b62-3770-4a68-bc3d-dc3e31e429e6')

_defaulthash = 'sha512'
_validhashes = set([ 'sha256', 'sha512' ])
_hashlengths = { len(getattr(hashlib, x)().hexdigest()): x for x in _validhashes }

def _iterdictlist(obj):
    itms = obj.items()
    itms.sort()
    for k, v in itms:
        if isinstance(v, list):
            v = v[:]
            v.sort()
            for i in v:
                yield k, i
        else:
            yield k, v
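
# Illustrative example (not part of the original module): _iterdictlist
# flattens list-valued entries into one (key, value) pair per element,
# with keys and values emitted in sorted order.
#
#   >>> list(_iterdictlist({ 'dc:creator': [ 'Bob', 'Alice' ], 'lang': 'en' }))
#   [('dc:creator', 'Alice'), ('dc:creator', 'Bob'), ('lang', 'en')]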

# XXX - add validation
class MDBase(object):
    '''This is a simple wrapper that turns a JSON object into a pythonesque
    object where attribute accesses work.'''

    _generated_properties = {
        'uuid': uuid.uuid4,
        'modified': datetime.datetime.utcnow,
    }

    _common_properties = [ 'type', 'created_by_ref' ] # XXX - add lang?
    _common_names = set(_common_properties + _generated_properties.keys())

    def __init__(self, obj):
        obj = copy.deepcopy(obj)

        if 'type' not in obj:
            obj['type'] = self._type

        for x in self._common_properties:
            if x not in obj:
                raise ValueError('common property %s not present' % `x`)

        for x, fun in self._generated_properties.iteritems():
            if x not in obj:
                obj[x] = fun()

        self._obj = obj

    @classmethod
    def create_obj(cls, obj):
        '''Using obj as a base, create an instance of MDBase of the
        correct type.

        If the correct type is not found, a ValueError is raised.'''

        if isinstance(obj, cls):
            obj = obj._obj

        ty = obj['type']

        for i in MDBase.__subclasses__():
            if i._type == ty:
                return i(obj)
        else:
            raise ValueError('Unable to find class for type %s' % `ty`)

    def new_version(self, *args):
        '''For each (k, v) pair in args, add the property k with the value
        v as an additional value (or as a new property if it is the first).'''

        obj = copy.deepcopy(self._obj)

        for k, v in args:
            obj.setdefault(k, []).append(v)

        del obj['modified']

        return self.create_obj(obj)

    def __getattr__(self, k):
        return self._obj[k]

    def __getitem__(self, k):
        return self._obj[k]

    def __to_dict__(self):
        return self._obj

    def __eq__(self, o):
        return cmp(self._obj, o) == 0

    def __contains__(self, k):
        return k in self._obj

    def items(self, skipcommon=True):
        return [ (k, v) for k, v in self._obj.items()
            if k not in self._common_names ]
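
# Illustrative example (not part of the original module): subclasses set
# _type, MDBase.create_obj dispatches on the 'type' key to construct the
# matching subclass (e.g. MetaData below), and new_version appends values
# while regenerating the 'modified' timestamp.
#
#   md = MDBase.create_obj({ 'type': 'metadata',
#       'created_by_ref': '867c7563-79ae-435c-a265-9d8509cefac5' })
#   md2 = md.new_version(('dc:creator', 'Jim Bob'))
#   assert md2['dc:creator'] == [ 'Jim Bob' ]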

class MetaData(MDBase):
    _type = 'metadata'

def _trytodict(o):
    if isinstance(o, uuid.UUID):
        return 'unicode', str(o)
    try:
        return 'dict', o.__to_dict__()
    except Exception: # pragma: no cover
        raise TypeError('unable to find __to_dict__ on %s: %s' %
            (type(o), `o`))

_asn1coder = pasn1.ASN1DictCoder(coerce=_trytodict)
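
# Illustrative sketch (not part of the original module): the coerce hook
# above lets the ASN.1 coder serialize MDBase instances and UUID values by
# converting them to plain types first.  A round trip of a store-shaped
# dict, using only the dumps/loads calls the rest of this file relies on,
# looks roughly like:
#
#   md = MetaData({ 'created_by_ref': '867c7563-79ae-435c-a265-9d8509cefac5' })
#   buf = _asn1coder.dumps({ 'created_by_ref': md.created_by_ref,
#       'objects': [ md ] })
#   objs = _asn1coder.loads(buf)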

class ObjectStore(object):
    '''A container to store the various MetaData objects.'''

    # The _uuids property contains both the UUIDv4 for objects, and
    # the UUIDv5 used for looking up FileObjects.

    def __init__(self, created_by_ref):
        self._created_by_ref = created_by_ref
        self._uuids = {}
        self._hashes = {}

    @staticmethod
    def makehash(hashstr, strict=True):
        '''Take a hash string, and return a valid hash string from it.

        This makes sure that it is of the correct type and length.

        If strict is False, the function will detect the length and
        return a valid hash if one can be found.'''

        try:
            hash, value = hashstr.split(':')
        except ValueError:
            if strict:
                raise

            hash = _hashlengths[len(hashstr)]
            value = hashstr

        if strict and len(str(value).translate(None,
                string.hexdigits.lower())) != 0:
            raise ValueError(
                'value has invalid hex digits (must be lower case)', value)

        if hash in _validhashes:
            return ':'.join((hash, value))

        raise ValueError
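
    # Illustrative examples (not part of the original class): makehash
    # normalizes a bare digest into the '<algorithm>:<hexdigest>' form used
    # as keys in self._hashes; with strict=False the algorithm is inferred
    # from the digest length.
    #
    #   >>> ObjectStore.makehash('e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', strict=False)
    #   'sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'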

    def __len__(self):
        return len(self._uuids)

    def store(self, fname):
        '''Write out the objects in the store to the file named
        fname.'''

        with open(fname, 'w') as fp:
            obj = {
                'created_by_ref': self._created_by_ref,
                'objects': self._uuids.values(),
            }
            fp.write(_asn1coder.dumps(obj))

    def loadobj(self, obj):
        '''Load obj into the data store.'''

        obj = MDBase.create_obj(obj)

        if not isinstance(obj.uuid, uuid.UUID):
            id = uuid.UUID(obj.uuid)
        else:
            id = obj.uuid

        self._uuids[id] = obj

        for j in obj.hashes:
            h = self.makehash(j)
            self._hashes.setdefault(h, []).append(obj)

    @classmethod
    def load(cls, fname):
        '''Load objects from the provided file name.

        Basic validation will be done on the objects in the file.

        The objects will be accessible via other methods.'''

        with open(fname) as fp:
            objs = _asn1coder.loads(fp.read())

        obj = cls(objs['created_by_ref'])

        for i in objs['objects']:
            obj.loadobj(i)

        return obj

    def by_id(self, id):
        '''Look up an object by its UUID.'''

        if not isinstance(id, uuid.UUID):
            uid = uuid.UUID(id)
        else:
            uid = id

        return self._uuids[uid]

    def by_hash(self, hash):
        '''Look up an object by its hash value.'''

        h = self.makehash(hash, strict=False)

        return self._hashes[h]

    def by_file(self, fname):
        '''Return a metadata object for the file named fname.'''

        fid = FileObject.make_id(fname)

        try:
            fobj = self.by_id(fid)
        except KeyError:
            # unable to find it
            fobj = FileObject.from_file(fname, self._created_by_ref)
            self.loadobj(fobj)

        for i in fobj.hashes:
            j = self.by_hash(i)

            # Filter out non-metadata objects
            j = [ x for x in j if x.type == 'metadata' ]

            if j:
                return j
        else:
            raise KeyError('unable to find metadata for file')
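
# Illustrative usage sketch (not part of the original module; the file names
# are only examples): a store is loaded from an existing pasn1 file, queried
# by file (by_file raises KeyError when no metadata is known for the file),
# and written back out.
#
#   objstr = ObjectStore.load('fixtures/sample.data.pasn1')
#   for md in objstr.by_file('somefile.txt'):
#       for k, v in _iterdictlist(md):
#           print '%s:\t%s' % (k, v)
#   objstr.store('fixtures/sample.data.pasn1')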

def _hashfile(fname):
    hash = getattr(hashlib, _defaulthash)()

    with open(fname) as fp:
        r = fp.read()
        hash.update(r)

    return '%s:%s' % (_defaulthash, hash.hexdigest())

class FileObject(MDBase):
    _type = 'file'

    @staticmethod
    def make_id(fname):
        '''Take a local file name, and make the id for it.  Note that this
        converts from the local path separator to a forward slash so that
        it will be the same between Windows and Unix systems.'''

        fname = os.path.realpath(fname)
        return uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
            '/'.join(os.path.split(fname)))

    @classmethod
    def from_file(cls, filename, created_by_ref):
        s = os.stat(filename)
        obj = {
            'dir': os.path.dirname(filename),
            'created_by_ref': created_by_ref,
            'filename': os.path.basename(filename),
            'id': cls.make_id(filename),
            'mtime': datetime.datetime.utcfromtimestamp(s.st_mtime),
            'size': s.st_size,
            'hashes': [ _hashfile(filename), ],
        }

        return cls(obj)
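
# Illustrative example (not part of the original module; the file name is
# only an example): make_id derives a stable UUIDv5 from the file's real
# path, so the same path always maps to the same id, while from_file also
# records the directory, size, mtime and content hashes.
#
#   fobj = FileObject.from_file('test.txt',
#       '867c7563-79ae-435c-a265-9d8509cefac5')
#   assert fobj.id == FileObject.make_id('test.txt')
#   assert fobj.hashes[0].startswith('sha512:')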

def enumeratedir(_dir, created_by_ref):
    '''Enumerate all the files and directories (not recursive) in _dir.

    Returned is a list of FileObjects.'''

    return map(lambda x: FileObject.from_file(os.path.join(_dir, x),
        created_by_ref), os.listdir(_dir))
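
# Illustrative example (not part of the original module; the directory name
# is only an example, and is assumed to contain just regular files):
# enumeratedir simply maps FileObject.from_file over one directory level.
#
#   for fobj in enumeratedir('fixtures/testfiles',
#           '867c7563-79ae-435c-a265-9d8509cefac5'):
#       print fobj.filename, fobj.size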

def main():
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option('-a', action='append', dest='add',
        default=[], help='add the arg as metadata for files, tag=value')
    parser.add_option('-d', action='append', dest='delete',
        default=[], help='delete the arg as metadata from files. Either '
        'specify tag, and all tags are removed, or specify tag=value and '
        'that specific tag will be removed.')
    parser.add_option('-l', action='store_true', dest='list',
        default=False, help='list metadata')

    options, args = parser.parse_args()

    storefname = os.path.expanduser('~/.medashare_store.pasn1')

    import sys
    #print >>sys.stderr, `storefname`

    objstr = ObjectStore.load(storefname)

    if options.list:
        for i in args:
            for j in objstr.by_file(i):
                #print >>sys.stderr, `j._obj`
                for k, v in _iterdictlist(j):
                    print '%s:\t%s' % (k, v)
    elif options.add:
        addprops = map(lambda x: x.split('=', 1), options.add)
        for i in args:
            for j in objstr.by_file(i):
                nobj = j.new_version(*addprops)
                objstr.loadobj(nobj)
    elif options.delete:
        for i in args:
            for j in objstr.by_file(i):
                obj = j.__to_dict__()
                for k in options.delete:
                    try:
                        key, v = k.split('=', 1)
                        obj[key].remove(v)
                    except ValueError:
                        del obj[k]

                nobj = MDBase(obj)
                objstr.loadobj(nobj)
    else:
        raise NotImplementedError

    objstr.store(storefname)

if __name__ == '__main__': # pragma: no cover
    main()
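
# Illustrative command-line usage (a sketch, not part of the original module;
# the script and file names are only examples).  The store is read from, and
# written back to, ~/.medashare_store.pasn1, and the options map onto the
# branches in main() above:
#
#   python medashare.py -a dc:creator='Jim Bob' somefile.txt   # add tag=value
#   python medashare.py -l somefile.txt                        # list metadata
#   python medashare.py -d dc:creator somefile.txt             # delete a tag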

class _TestCases(unittest.TestCase):
    created_by_ref = '867c7563-79ae-435c-a265-9d8509cefac5'

    def setUp(self):
        d = os.path.realpath(tempfile.mkdtemp())
        self.basetempdir = d
        self.tempdir = os.path.join(d, 'subdir')

        shutil.copytree(os.path.join('fixtures', 'testfiles'),
            self.tempdir)

    def tearDown(self):
        shutil.rmtree(self.basetempdir)
        self.tempdir = None

    def test_mdbase(self):
        self.assertRaises(ValueError, MDBase.create_obj,
            { 'type': 'unknosldkfj' })
        self.assertRaises(ValueError, MDBase.create_obj,
            { 'type': 'metadata' })

        baseobj = {
            'type': 'metadata',
            'created_by_ref': self.created_by_ref,
        }
        origbase = copy.deepcopy(baseobj)

        # that when an MDBase object is created
        md = MDBase.create_obj(baseobj)

        # it doesn't modify the passed-in object (when adding
        # generated properties)
        self.assertEqual(baseobj, origbase)

        # and it has the generated properties
        # Note: cannot mock the functions as they are already
        # referenced at creation time
        self.assertIn('uuid', md)
        self.assertIn('modified', md)

        # That you can create a new version using new_version
        md2 = md.new_version(('dc:creator', 'Jim Bob',))

        # that they are different
        self.assertNotEqual(md, md2)

        # and that the new modified time is different from the old
        self.assertNotEqual(md.modified, md2.modified)

        # and that the modification is present
        self.assertEqual(md2['dc:creator'], [ 'Jim Bob' ])

    def test_makehash(self):
        self.assertRaises(ValueError, ObjectStore.makehash, 'slkj')
        self.assertRaises(ValueError, ObjectStore.makehash,
            'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ADA')
        self.assertRaises(ValueError, ObjectStore.makehash,
            'bogushash:9e0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ADA',
            strict=False)

        self.assertEqual(ObjectStore.makehash(
            'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e',
            strict=False),
            'sha512:cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
        self.assertEqual(ObjectStore.makehash(
            'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855',
            strict=False),
            'sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855')

    def test_enumeratedir(self):
        files = enumeratedir(self.tempdir, self.created_by_ref)
        ftest = files[0]
        fname = 'test.txt'

        # make sure that they are of type MDBase
        self.assertIsInstance(ftest, MDBase)

        oldid = ftest.id

        self.assertEqual(ftest.filename, fname)
        self.assertEqual(ftest.dir, self.tempdir)
        # XXX - do we add host information?
        self.assertEqual(ftest.id, uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
            '/'.join(os.path.split(self.tempdir) + ( fname, ))))
        self.assertEqual(ftest.mtime,
            datetime.datetime(2019, 5, 20, 21, 47, 36))
        self.assertEqual(ftest.size, 15)
        self.assertIn('sha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f',
            ftest.hashes)

        # XXX - make sure works w/ relative dirs
        files = enumeratedir(os.path.relpath(self.tempdir),
            self.created_by_ref)
        self.assertEqual(oldid, files[0].id)

    def test_objectstore(self):
        objst = ObjectStore.load(os.path.join('fixtures',
            'sample.data.pasn1'))

        objst.loadobj({
            'type': 'metadata',
            'uuid': 'c9a1d1e2-3109-4efd-8948-577dc15e44e7',
            'modified': datetime.datetime(2019, 5, 31, 14, 3, 10),
            'created_by_ref': self.created_by_ref,
            'hashes': [ 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada' ],
            'lang': 'en',
        })

        lst = objst.by_hash('91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada')
        self.assertEqual(len(lst), 2)

        byid = objst.by_id('3e466e06-45de-4ecc-84ba-2d2a3d970e96')

        self.assertIsInstance(byid, MetaData)
        self.assertIn(byid, lst)

        r = byid

        self.assertEqual(r.uuid, '3e466e06-45de-4ecc-84ba-2d2a3d970e96')
        self.assertEqual(r['dc:creator'], [ u'John-Mark Gurney' ])

        fname = 'testfile.pasn1'
        objst.store(fname)

        with open(fname) as fp:
            objs = _asn1coder.loads(fp.read())

        os.unlink(fname)

        self.assertEqual(len(objs), len(objst))
        self.assertEqual(objs['created_by_ref'], self.created_by_ref)

        for i in objs['objects']:
            self.assertEqual(objst.by_id(i['uuid']), i)

        testfname = os.path.join(self.tempdir, 'test.txt')

        self.assertEqual(objst.by_file(testfname), [ byid ])
        self.assertEqual(objst.by_file(testfname), [ byid ])

        self.assertRaises(KeyError, objst.by_file, '/dev/null')

        # XXX make sure that object store contains fileobject

        # Tests to add:
        # Non-duplicates when same metadata is located by multiple hashes.

    def test_main(self):
        # Test the main runner; this is only testing things that are
        # specific to running the program, like where the store is
        # created.

        # setup object store
        storefname = os.path.join(self.tempdir, 'storefname')
        shutil.copy(os.path.join('fixtures', 'sample.data.pasn1'), storefname)

        # setup test fname
        testfname = os.path.join(self.tempdir, 'test.txt')

        import sys
        import StringIO
        import itertools

        with mock.patch('os.path.expanduser',
                side_effect=itertools.repeat(storefname)) as eu:
            with nested(mock.patch('sys.stdout', StringIO.StringIO()),
                    mock.patch('sys.argv',
                    [ 'progname', '-l', testfname ])) as (stdout, argv):
                main()

                self.assertEqual(stdout.getvalue(),
                    'dc:creator:\tJohn-Mark Gurney\nhashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n')

            eu.assert_called_with('~/.medashare_store.pasn1')

            with nested(mock.patch('sys.stdout', StringIO.StringIO()),
                    mock.patch('sys.argv',
                    [ 'progname', '-a', 'dc:creator=Another user',
                    '-a', 'foo=bar=baz', testfname ])) as (stdout, argv):
                main()

            with nested(mock.patch('sys.stdout', StringIO.StringIO()),
                    mock.patch('sys.argv',
                    [ 'progname', '-l', testfname ])) as (stdout, argv):
                main()

                self.assertEqual(stdout.getvalue(),
                    'dc:creator:\tAnother user\ndc:creator:\tJohn-Mark Gurney\nfoo:\tbar=baz\nhashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n')

            with nested(mock.patch('sys.stdout', StringIO.StringIO()),
                    mock.patch('sys.argv',
                    [ 'progname', '-d', 'dc:creator', testfname ])) as (stdout, argv):
                main()

            with nested(mock.patch('sys.stdout', StringIO.StringIO()),
                    mock.patch('sys.argv',
                    [ 'progname', '-l', testfname ])) as (stdout, argv):
                main()

                self.assertEqual(stdout.getvalue(),
                    'foo:\tbar=baz\nhashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n')

            with nested(mock.patch('sys.stdout', StringIO.StringIO()),
                    mock.patch('sys.argv',
                    [ 'progname', '-a', 'foo=bleh', testfname ])) as (stdout, argv):
                main()

            with nested(mock.patch('sys.stdout', StringIO.StringIO()),
                    mock.patch('sys.argv',
                    [ 'progname', '-l', testfname ])) as (stdout, argv):
                main()

                self.assertEqual(stdout.getvalue(),
                    'foo:\tbar=baz\nfoo:\tbleh\nhashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n')

            with nested(mock.patch('sys.stdout', StringIO.StringIO()),
                    mock.patch('sys.argv',
                    [ 'progname', '-d', 'foo=bar=baz', testfname ])) as (stdout, argv):
                main()

            with nested(mock.patch('sys.stdout', StringIO.StringIO()),
                    mock.patch('sys.argv',
                    [ 'progname', '-l', testfname ])) as (stdout, argv):
                main()

                self.assertEqual(stdout.getvalue(),
                    'foo:\tbleh\nhashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n')