MetaData Sharing
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

585 lines
17 KiB

  1. #!/usr/bin/env python
  2. #import pdb, sys; mypdb = pdb.Pdb(stdout=sys.stderr); mypdb.set_trace()
  3. import copy
  4. import datetime
  5. import hashlib
  6. import mock
  7. import os.path
  8. import pasn1
  9. import shutil
  10. import string
  11. import tempfile
  12. import unittest
  13. import uuid
  14. from contextlib import nested
  15. # The UUID for the namespace representing the path to a file
  16. _NAMESPACE_MEDASHARE_PATH = uuid.UUID('f6f36b62-3770-4a68-bc3d-dc3e31e429e6')
  17. _defaulthash = 'sha512'
  18. _validhashes = set([ 'sha256', 'sha512' ])
  19. _hashlengths = { len(getattr(hashlib, x)().hexdigest()): x for x in _validhashes }
  20. def _iterdictlist(obj):
  21. itms = obj.items()
  22. itms.sort()
  23. for k, v in itms:
  24. if isinstance(v, list):
  25. v = v[:]
  26. v.sort()
  27. for i in v:
  28. yield k, i
  29. else:
  30. yield k, v
  31. # XXX - add validation
  32. class MDBase(object):
  33. '''This is a simple wrapper that turns a JSON object into a pythonesc
  34. object where attribute accesses work.'''
  35. _generated_properties = {
  36. 'uuid': uuid.uuid4,
  37. 'modified': datetime.datetime.utcnow
  38. }
  39. _common_properties = [ 'type', 'created_by_ref' ] # XXX - add lang?
  40. _common_optional = [ 'overlay_refs' ]
  41. _common_names = set(_common_properties + _generated_properties.keys())
  42. def __init__(self, obj):
  43. obj = copy.deepcopy(obj)
  44. if 'type' not in obj:
  45. obj['type'] = self._type
  46. for x in self._common_properties:
  47. if x not in obj:
  48. raise ValueError('common property %s not present' % `x`)
  49. for x, fun in self._generated_properties.iteritems():
  50. if x not in obj:
  51. obj[x] = fun()
  52. self._obj = obj
  53. @classmethod
  54. def create_obj(cls, obj):
  55. '''Using obj as a base, create an instance of MDBase of the
  56. correct type.
  57. If the correct type is not found, a ValueError is raised.'''
  58. if isinstance(obj, cls):
  59. obj = obj._obj
  60. ty = obj['type']
  61. for i in MDBase.__subclasses__():
  62. if i._type == ty:
  63. return i(obj)
  64. else:
  65. raise ValueError('Unable to find class for type %s' %
  66. `ty`)
  67. def new_version(self, *args):
  68. '''For each k, v pari, add the property k as an additional one
  69. (or new on if first), with the value v.'''
  70. obj = copy.deepcopy(self._obj)
  71. for k, v in args:
  72. obj.setdefault(k, []).append(v)
  73. del obj['modified']
  74. return self.create_obj(obj)
  75. def __getattr__(self, k):
  76. return self._obj[k]
  77. def __getitem__(self, k):
  78. return self._obj[k]
  79. def __to_dict__(self):
  80. return self._obj
  81. def __eq__(self, o):
  82. return cmp(self._obj, o) == 0
  83. def __contains__(self, k):
  84. return k in self._obj
  85. def items(self, skipcommon=True):
  86. return [ (k, v) for k, v in self._obj.items() if k not in
  87. self._common_names ]
class MetaData(MDBase):
    '''The MDBase subclass for objects whose type property is
    'metadata'.'''

    _type = 'metadata'
  90. def _trytodict(o):
  91. if isinstance(o, uuid.UUID):
  92. return 'unicode', str(o)
  93. try:
  94. return 'dict', o.__to_dict__()
  95. except Exception: # pragma: no cover
  96. raise TypeError('unable to find __to_dict__ on %s: %s' % (type(o), `o`))
# The coder used to serialize and to parse the object store.  _trytodict
# is passed as the coerce hook; presumably pasn1 calls it for values it
# cannot encode natively (UUIDs, MDBase objects) -- confirm against pasn1.
_asn1coder = pasn1.ASN1DictCoder(coerce=_trytodict)
  98. class ObjectStore(object):
  99. '''A container to store for the various Metadata objects.'''
  100. # The _uuids property contains both the UUIDv4 for objects, and
  101. # looking up the UUIDv5 for FileObjects.
  102. def __init__(self, created_by_ref):
  103. self._created_by_ref = created_by_ref
  104. self._uuids = {}
  105. self._hashes = {}
  106. @staticmethod
  107. def makehash(hashstr, strict=True):
  108. '''Take a hash string, and return a valid hash string from it.
  109. This makes sure that it is of the correct type and length.
  110. If strict is False, the function will detect the length and
  111. return a valid hash if one can be found.'''
  112. try:
  113. hash, value = hashstr.split(':')
  114. except ValueError:
  115. if strict:
  116. raise
  117. hash = _hashlengths[len(hashstr)]
  118. value = hashstr
  119. if strict and len(str(value).translate(None, string.hexdigits.lower())) != 0:
  120. raise ValueError('value has invalid hex digits (must be lower case)', value)
  121. if hash in _validhashes:
  122. return ':'.join((hash, value))
  123. raise ValueError
  124. def __len__(self):
  125. return len(self._uuids)
  126. def store(self, fname):
  127. '''Write out the objects in the store to the file named
  128. fname.'''
  129. with open(fname, 'w') as fp:
  130. obj = {
  131. 'created_by_ref': self._created_by_ref,
  132. 'objects': self._uuids.values(),
  133. }
  134. fp.write(_asn1coder.dumps(obj))
  135. def loadobj(self, obj):
  136. '''Load obj into the data store.'''
  137. obj = MDBase.create_obj(obj)
  138. if not isinstance(obj.uuid, uuid.UUID):
  139. id = uuid.UUID(obj.uuid)
  140. else:
  141. id = obj.uuid
  142. self._uuids[id] = obj
  143. for j in obj.hashes:
  144. h = self.makehash(j)
  145. self._hashes.setdefault(h, []).append(obj)
  146. @classmethod
  147. def load(cls, fname):
  148. '''Load objects from the provided file name.
  149. Basic validation will be done on the objects in the file.
  150. The objects will be accessible via other methods.'''
  151. with open(fname) as fp:
  152. objs = _asn1coder.loads(fp.read())
  153. obj = cls(objs['created_by_ref'])
  154. for i in objs['objects']:
  155. obj.loadobj(i)
  156. return obj
  157. def by_id(self, id):
  158. '''Look up an object by it's UUID.'''
  159. if not isinstance(id, uuid.UUID):
  160. uid = uuid.UUID(id)
  161. else:
  162. uid = id
  163. return self._uuids[uid]
  164. def by_hash(self, hash):
  165. '''Look up an object by it's hash value.'''
  166. h = self.makehash(hash, strict=False)
  167. return self._hashes[h]
  168. def by_file(self, fname):
  169. '''Return a metadata object for the file named fname.'''
  170. fid = FileObject.make_id(fname)
  171. try:
  172. fobj = self.by_id(fid)
  173. except KeyError:
  174. # unable to find it
  175. fobj = FileObject.from_file(fname, self._created_by_ref)
  176. self.loadobj(fobj)
  177. for i in fobj.hashes:
  178. j = self.by_hash(i)
  179. # Filter out non-metadata objects
  180. j = [ x for x in j if x.type == 'metadata' ]
  181. if j:
  182. return j
  183. else:
  184. raise KeyError('unable to find metadata for file')
  185. def _hashfile(fname):
  186. hash = getattr(hashlib, _defaulthash)()
  187. with open(fname) as fp:
  188. r = fp.read()
  189. hash.update(r)
  190. return '%s:%s' % (_defaulthash, hash.hexdigest())
  191. class FileObject(MDBase):
  192. _type = 'file'
  193. @staticmethod
  194. def make_id(fname):
  195. '''Take a local file name, and make the id for it. Note that
  196. converts from the local path separator to a forward slash so
  197. that it will be the same between Windows and Unix systems.'''
  198. fname = os.path.realpath(fname)
  199. return uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
  200. '/'.join(os.path.split(fname)))
  201. @classmethod
  202. def from_file(cls, filename, created_by_ref):
  203. s = os.stat(filename)
  204. obj = {
  205. 'dir': os.path.dirname(filename),
  206. 'created_by_ref': created_by_ref,
  207. 'filename': os.path.basename(filename),
  208. 'id': cls.make_id(filename),
  209. 'mtime': datetime.datetime.utcfromtimestamp(s.st_mtime),
  210. 'size': s.st_size,
  211. 'hashes': [ _hashfile(filename), ],
  212. }
  213. return cls(obj)
  214. def enumeratedir(_dir, created_by_ref):
  215. '''Enumerate all the files and directories (not recursive) in _dir.
  216. Returned is a list of FileObjects.'''
  217. return map(lambda x: FileObject.from_file(os.path.join(_dir, x), created_by_ref),
  218. os.listdir(_dir))
  219. def main():
  220. from optparse import OptionParser
  221. parser = OptionParser()
  222. parser.add_option('-a', action='append', dest='add',
  223. default=[], help='add the arg as metadata for files, tag=value')
  224. parser.add_option('-d', action='append', dest='delete',
  225. default=[], help='delete the arg as metadata from files. Either specify tag, and all tags are removed, or specify tag=value and that specific tag will be removed.')
  226. parser.add_option('-l', action='store_true', dest='list',
  227. default=False, help='list metadata')
  228. options, args = parser.parse_args()
  229. storefname = os.path.expanduser('~/.medashare_store.pasn1')
  230. import sys
  231. #print >>sys.stderr, `storefname`
  232. objstr = ObjectStore.load(storefname)
  233. if options.list:
  234. for i in args:
  235. for j in objstr.by_file(i):
  236. #print >>sys.stderr, `j._obj`
  237. for k, v in _iterdictlist(j):
  238. print '%s:\t%s' % (k, v)
  239. elif options.add:
  240. addprops = map(lambda x: x.split('=', 1), options.add)
  241. for i in args:
  242. for j in objstr.by_file(i):
  243. nobj = j.new_version(*addprops)
  244. objstr.loadobj(nobj)
  245. elif options.delete:
  246. for i in args:
  247. for j in objstr.by_file(i):
  248. obj = j.__to_dict__()
  249. for k in options.delete:
  250. try:
  251. key, v = k.split('=', 1)
  252. obj[key].remove(v)
  253. except ValueError:
  254. del obj[k]
  255. nobj = MDBase(obj)
  256. objstr.loadobj(nobj)
  257. else:
  258. raise NotImplementedError
  259. objstr.store(storefname)
# Only run the command line interface when executed as a script, not when
# the module is imported (for example by the test runner).
if __name__ == '__main__': # pragma: no cover
    main()
  262. class _TestCases(unittest.TestCase):
  263. created_by_ref = '867c7563-79ae-435c-a265-9d8509cefac5'
  264. def setUp(self):
  265. d = os.path.realpath(tempfile.mkdtemp())
  266. self.basetempdir = d
  267. self.tempdir = os.path.join(d, 'subdir')
  268. shutil.copytree(os.path.join('fixtures', 'testfiles'),
  269. self.tempdir)
  270. def tearDown(self):
  271. shutil.rmtree(self.basetempdir)
  272. self.tempdir = None
    def test_mdbase(self):
        # an unknown type, and an object that lacks a common property
        # (created_by_ref), are both rejected
        self.assertRaises(ValueError, MDBase.create_obj, { 'type': 'unknosldkfj' })
        self.assertRaises(ValueError, MDBase.create_obj, { 'type': 'metadata' })

        baseobj = {
            'type': 'metadata',
            'created_by_ref': self.created_by_ref,
        }
        origbase = copy.deepcopy(baseobj)

        # that when an MDBase object is created
        md = MDBase.create_obj(baseobj)

        # it doesn't modify the passed in object (when adding
        # generated properties)
        self.assertEqual(baseobj, origbase)

        # and it has the generated properties
        # Note: cannot mock the functions as they are already
        # referenced at creation time
        self.assertIn('uuid', md)
        self.assertIn('modified', md)

        # That you can create a new version using new_version
        md2 = md.new_version(('dc:creator', 'Jim Bob',))

        # that they are different
        self.assertNotEqual(md, md2)

        # and that the new modified time is different from the old
        self.assertNotEqual(md.modified, md2.modified)

        # and that the modification is present
        self.assertEqual(md2['dc:creator'], [ 'Jim Bob' ])
  299. def test_makehash(self):
  300. self.assertRaises(ValueError, ObjectStore.makehash, 'slkj')
  301. self.assertRaises(ValueError, ObjectStore.makehash, 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ADA')
  302. self.assertRaises(ValueError, ObjectStore.makehash, 'bogushash:9e0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ADA', strict=False)
  303. self.assertEqual(ObjectStore.makehash('cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e', strict=False), 'sha512:cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
  304. self.assertEqual(ObjectStore.makehash('e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', strict=False), 'sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855')
    def test_enumeratedir(self):
        # NOTE(review): files[0] is expected to be test.txt, which
        # presumably relies on the fixture directory holding a single
        # file (os.listdir makes no promise about order) -- confirm.
        files = enumeratedir(self.tempdir, self.created_by_ref)
        ftest = files[0]
        fname = 'test.txt'

        # make sure that they are of type MDBase
        self.assertIsInstance(ftest, MDBase)

        # remembered to compare with the id obtained via a relative path
        oldid = ftest.id
        self.assertEqual(ftest.filename, fname)
        self.assertEqual(ftest.dir, self.tempdir)
        # XXX - do we add host information?
        self.assertEqual(ftest.id, uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
            '/'.join(os.path.split(self.tempdir) +
            ( fname, ))))
        self.assertEqual(ftest.mtime, datetime.datetime(2019, 5, 20, 21, 47, 36))
        self.assertEqual(ftest.size, 15)
        self.assertIn('sha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f', ftest.hashes)

        # XXX - make sure works w/ relative dirs
        # the id must not depend on how the path to the file was spelled
        files = enumeratedir(os.path.relpath(self.tempdir),
            self.created_by_ref)
        self.assertEqual(oldid, files[0].id)
    def test_mdbaseoverlay(self):
        objst = ObjectStore(self.created_by_ref)

        # that a base object
        bid = uuid.uuid4()
        objst.loadobj({
            'type': 'metadata',
            'uuid': bid,
            'modified': datetime.datetime(2019, 6, 10, 14, 3, 10),
            'created_by_ref': self.created_by_ref,
            'hashes': [ 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada' ],
            'someprop': [ 'somevalue' ],
            'lang': 'en',
        })

        # can have an overlay object
        oid = uuid.uuid4()
        dhash = 'sha256:a7c96262c21db9a06fd49e307d694fd95f624569f9b35bb3ffacd880440f9787'
        objst.loadobj({
            'type': 'metadata',
            'uuid': oid,
            'modified': datetime.datetime(2019, 6, 10, 18, 3, 10),
            'created_by_ref': self.created_by_ref,
            'hashes': [ dhash ],
            'overlay_refs': [ bid ],
            'lang': 'en',
        })

        # and that when you get its properties
        oobj = objst.by_id(oid)
        oitems = oobj.items()

        # that it has the overlays property
        # NOTE(review): nothing is asserted yet, the items are only
        # printed -- this test looks unfinished.
        print `oitems`
    def test_objectstore(self):
        objst = ObjectStore.load(os.path.join('fixtures', 'sample.data.pasn1'))

        # add a second object that has the same hash as one in the sample
        objst.loadobj({
            'type': 'metadata',
            'uuid': 'c9a1d1e2-3109-4efd-8948-577dc15e44e7',
            'modified': datetime.datetime(2019, 5, 31, 14, 3, 10),
            'created_by_ref': self.created_by_ref,
            'hashes': [ 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada' ],
            'lang': 'en',
        })

        # looking up by a bare hash value finds both objects
        lst = objst.by_hash('91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada')
        self.assertEqual(len(lst), 2)

        # the object from the sample data can be found by its id
        byid = objst.by_id('3e466e06-45de-4ecc-84ba-2d2a3d970e96')

        self.assertIsInstance(byid, MetaData)
        self.assertIn(byid, lst)

        r = byid

        self.assertEqual(r.uuid, '3e466e06-45de-4ecc-84ba-2d2a3d970e96')
        self.assertEqual(r['dc:creator'], [ u'John-Mark Gurney' ])

        # write the store out, and read the raw data back in
        fname = 'testfile.pasn1'
        objst.store(fname)

        with open(fname) as fp:
            objs = _asn1coder.loads(fp.read())

        os.unlink(fname)

        # NOTE(review): len(objs) is the number of keys of the top level
        # dictionary, not the number of objects stored -- presumably
        # len(objs['objects']) was meant, confirm.
        self.assertEqual(len(objs), len(objst))

        self.assertEqual(objs['created_by_ref'], self.created_by_ref)

        # every object written out matches the one in the store
        for i in objs['objects']:
            self.assertEqual(objst.by_id(i['uuid']), i)

        # looking up by file gives the same result when repeated
        testfname = os.path.join(self.tempdir, 'test.txt')
        self.assertEqual(objst.by_file(testfname), [ byid ])
        self.assertEqual(objst.by_file(testfname), [ byid ])

        # a file that has no metadata associated with it
        self.assertRaises(KeyError, objst.by_file, '/dev/null')

        # XXX make sure that object store contains fileobject

        # Tests to add:
        # Non-duplicates when same metadata is located by multiple hashes.
  389. def test_main(self):
  390. # Test the main runner, this is only testing things that are
  391. # specific to running the program, like where the store is
  392. # created.
  393. # setup object store
  394. storefname = os.path.join(self.tempdir, 'storefname')
  395. shutil.copy(os.path.join('fixtures', 'sample.data.pasn1'), storefname)
  396. # setup test fname
  397. testfname = os.path.join(self.tempdir, 'test.txt')
  398. import sys
  399. import StringIO
  400. import itertools
  401. with mock.patch('os.path.expanduser', side_effect=itertools.repeat(storefname)) \
  402. as eu:
  403. with nested(mock.patch('sys.stdout',
  404. StringIO.StringIO()), mock.patch('sys.argv',
  405. [ 'progname', '-l', testfname ])) as (stdout, argv):
  406. main()
  407. self.assertEqual(stdout.getvalue(),
  408. 'dc:creator:\tJohn-Mark Gurney\nhashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n')
  409. eu.assert_called_with('~/.medashare_store.pasn1')
  410. with nested(mock.patch('sys.stdout',
  411. StringIO.StringIO()), mock.patch('sys.argv',
  412. [ 'progname', '-a', 'dc:creator=Another user', '-a', 'foo=bar=baz', testfname ])) as (stdout, argv):
  413. main()
  414. with nested(mock.patch('sys.stdout',
  415. StringIO.StringIO()), mock.patch('sys.argv',
  416. [ 'progname', '-l', testfname ])) as (stdout, argv):
  417. main()
  418. self.assertEqual(stdout.getvalue(),
  419. 'dc:creator:\tAnother user\ndc:creator:\tJohn-Mark Gurney\nfoo:\tbar=baz\nhashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n')
  420. with nested(mock.patch('sys.stdout',
  421. StringIO.StringIO()), mock.patch('sys.argv',
  422. [ 'progname', '-d', 'dc:creator', testfname ])) as (stdout, argv):
  423. main()
  424. with nested(mock.patch('sys.stdout',
  425. StringIO.StringIO()), mock.patch('sys.argv',
  426. [ 'progname', '-l', testfname ])) as (stdout, argv):
  427. main()
  428. self.assertEqual(stdout.getvalue(),
  429. 'foo:\tbar=baz\nhashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n')
  430. with nested(mock.patch('sys.stdout',
  431. StringIO.StringIO()), mock.patch('sys.argv',
  432. [ 'progname', '-a', 'foo=bleh', testfname ])) as (stdout, argv):
  433. main()
  434. with nested(mock.patch('sys.stdout',
  435. StringIO.StringIO()), mock.patch('sys.argv',
  436. [ 'progname', '-l', testfname ])) as (stdout, argv):
  437. main()
  438. self.assertEqual(stdout.getvalue(),
  439. 'foo:\tbar=baz\nfoo:\tbleh\nhashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n')
  440. with nested(mock.patch('sys.stdout',
  441. StringIO.StringIO()), mock.patch('sys.argv',
  442. [ 'progname', '-d', 'foo=bar=baz', testfname ])) as (stdout, argv):
  443. main()
  444. with nested(mock.patch('sys.stdout',
  445. StringIO.StringIO()), mock.patch('sys.argv',
  446. [ 'progname', '-l', testfname ])) as (stdout, argv):
  447. main()
  448. self.assertEqual(stdout.getvalue(),
  449. 'foo:\tbleh\nhashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n')