MetaData Sharing

#!/usr/bin/env python

import copy
import datetime
import hashlib
import mock
import os.path
import pasn1
import shutil
import string
import tempfile
import unittest
import uuid

from contextlib import nested

# The UUID for the namespace representing the path to a file
_NAMESPACE_MEDASHARE_PATH = uuid.UUID('f6f36b62-3770-4a68-bc3d-dc3e31e429e6')

_defaulthash = 'sha512'
_validhashes = set([ 'sha256', 'sha512' ])
_hashlengths = { len(getattr(hashlib, x)().hexdigest()): x for x in _validhashes }

def _iterdictlist(obj):
    itms = obj.items()
    itms.sort()
    for k, v in itms:
        if isinstance(v, list):
            for i in v:
                yield k, i
        else:
            yield k, v
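
# A minimal usage sketch (not part of the original module): _iterdictlist
# flattens list-valued entries into one (key, item) pair per element and
# yields the pairs in sorted key order.  The literal dict below is a
# hypothetical example.
if False: # pragma: no cover
    assert list(_iterdictlist({ 'a': [ 1, 2 ], 'b': 3 })) == \
        [ ('a', 1), ('a', 2), ('b', 3) ]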

# XXX - add validation
class MDBase(object):
    '''This is a simple wrapper that turns a JSON object into a pythonesque
    object where attribute accesses work.'''

    _generated_properties = {
        'uuid': uuid.uuid4,
        'modified': datetime.datetime.utcnow
    }

    _common_properties = [ 'type', 'created_by_ref' ] # XXX - add lang?
    _common_names = set(_common_properties + _generated_properties.keys())

    def __init__(self, obj):
        obj = copy.deepcopy(obj)

        if 'type' not in obj:
            obj['type'] = self._type

        for x in self._common_properties:
            if x not in obj:
                raise ValueError('common property %s not present' % `x`)

        for x, fun in self._generated_properties.iteritems():
            if x not in obj:
                obj[x] = fun()

        self._obj = obj

    @classmethod
    def create_obj(cls, obj):
        '''Using obj as a base, create an instance of MDBase of the
        correct type.

        If the correct type is not found, a ValueError is raised.'''

        if isinstance(obj, cls):
            obj = obj._obj

        ty = obj['type']

        for i in MDBase.__subclasses__():
            if i._type == ty:
                return i(obj)
        else:
            raise ValueError('Unable to find class for type %s' % `ty`)

    def new_version(self, *args):
        '''Add the property k as an additional value (or a new one if it
        is the first), with the value v.  Each arg is a (k, v) tuple.'''

        obj = copy.deepcopy(self._obj)

        for k, v in args:
            obj.setdefault(k, []).append(v)

        del obj['modified']

        return self.create_obj(obj)

    def __getattr__(self, k):
        return self._obj[k]

    def __getitem__(self, k):
        return self._obj[k]

    def __to_dict__(self):
        return self._obj

    def __eq__(self, o):
        return cmp(self._obj, o) == 0

    def __contains__(self, k):
        return k in self._obj

    def items(self, skipcommon=True):
        return [ (k, v) for k, v in self._obj.items() if k not in
            self._common_names ]

class MetaData(MDBase):
    _type = 'metadata'
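
# A minimal usage sketch (not part of the original module) of the MDBase
# wrapper.  The dict below is a hypothetical example; the created_by_ref
# value mirrors the one used in the tests further down.
if False: # pragma: no cover
    md = MetaData.create_obj({
        'type': 'metadata',
        'created_by_ref': '867c7563-79ae-435c-a265-9d8509cefac5',
        'dc:creator': [ 'Example Author' ],
    })
    # attribute and item access both read the underlying dict
    assert md.type == md['type'] == 'metadata'
    # new_version() appends a value and regenerates the 'modified' stamp
    md2 = md.new_version(('dc:title', 'Example Title'))
    assert md2['dc:title'] == [ 'Example Title' ]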

def _trytodict(o):
    try:
        return 'dict', o.__to_dict__()
    except Exception: # pragma: no cover
        raise TypeError('unable to find __to_dict__ on %s' % type(o))

_asn1coder = pasn1.ASN1DictCoder(coerce=_trytodict)
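
# A minimal sketch (not part of the original module) of how the coercion
# hook is used when serializing: MDBase instances inside a container are
# converted to plain dicts via __to_dict__, so a round trip yields dicts,
# not MDBase objects.  The payload below is hypothetical.
if False: # pragma: no cover
    payload = { 'objects': [ MetaData({
        'created_by_ref': '867c7563-79ae-435c-a265-9d8509cefac5',
    }) ] }
    data = _asn1coder.dumps(payload)
    decoded = _asn1coder.loads(data)
    assert isinstance(decoded['objects'][0], dict)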

class ObjectStore(object):
    '''A container to store the various Metadata objects.'''

    # The _uuids property contains both the UUIDv4 for objects, and
    # the UUIDv5 used for looking up FileObjects.

    def __init__(self, created_by_ref):
        self._created_by_ref = created_by_ref
        self._uuids = {}
        self._hashes = {}

    @staticmethod
    def makehash(hashstr, strict=True):
        '''Take a hash string, and return a valid hash string from it.

        This makes sure that it is of the correct type and length.

        If strict is False, the function will detect the length and
        return a valid hash if one can be found.'''

        try:
            hash, value = hashstr.split(':')
        except ValueError:
            if strict:
                raise

            hash = _hashlengths[len(hashstr)]
            value = hashstr

        if strict and len(str(value).translate(None, string.hexdigits.lower())) != 0:
            raise ValueError('value has invalid hex digits (must be lower case)', value)

        if hash in _validhashes:
            return ':'.join((hash, value))

        raise ValueError

    def __len__(self):
        return len(self._uuids)

    def store(self, fname):
        '''Write out the objects in the store to the file named
        fname.'''

        with open(fname, 'w') as fp:
            obj = {
                'created_by_ref': self._created_by_ref,
                'objects': self._uuids.values(),
            }
            fp.write(_asn1coder.dumps(obj))

    def loadobj(self, obj):
        '''Load obj into the data store.'''

        obj = MDBase.create_obj(obj)

        if not isinstance(obj.uuid, uuid.UUID):
            id = uuid.UUID(obj.uuid)
        else:
            id = obj.uuid
        self._uuids[id] = obj

        for j in obj.hashes:
            h = self.makehash(j)
            self._hashes.setdefault(h, []).append(obj)

    @classmethod
    def load(cls, fname):
        '''Load objects from the provided file name.

        Basic validation will be done on the objects in the file.

        The objects will be accessible via other methods.'''

        with open(fname) as fp:
            objs = _asn1coder.loads(fp.read())

        obj = cls(objs['created_by_ref'])

        for i in objs['objects']:
            obj.loadobj(i)

        return obj

    def by_id(self, id):
        '''Look up an object by its UUID.'''

        if not isinstance(id, uuid.UUID):
            uid = uuid.UUID(id)
        else:
            uid = id

        return self._uuids[uid]

    def by_hash(self, hash):
        '''Look up the objects with the given hash value.  Returns a list.'''

        h = self.makehash(hash, strict=False)
        return self._hashes[h]

    def by_file(self, fname):
        '''Return the list of metadata objects for the file named fname.'''

        fid = FileObject.make_id(fname)

        try:
            fobj = self.by_id(fid)
        except KeyError:
            # unable to find it
            fobj = FileObject.from_file(fname, self._created_by_ref)
            self.loadobj(fobj)

        for i in fobj.hashes:
            j = self.by_hash(i)

            # Filter out non-metadata objects
            j = [ x for x in j if x.type == 'metadata' ]

            if j:
                return j
        else:
            raise KeyError('unable to find metadata for file')
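
# A minimal usage sketch (not part of the original module) of ObjectStore.
# The file names are hypothetical; 'store.pasn1' would need to exist (or be
# written first) and 'somefile.txt' is a stand-in for a local file.
if False: # pragma: no cover
    # normalize a bare digest into the 'name:hexdigest' form
    h = ObjectStore.makehash(
        'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855',
        strict=False)
    assert h.startswith('sha256:')

    objst = ObjectStore.load('store.pasn1')
    for md in objst.by_file('somefile.txt'):    # list of MetaData objects
        for k, v in _iterdictlist(md):
            print '%s: %s' % (k, v)
    objst.store('store.pasn1')                  # write any new FileObjects back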

def _hashfile(fname):
    hash = getattr(hashlib, _defaulthash)()
    with open(fname) as fp:
        r = fp.read()
        hash.update(r)

    return '%s:%s' % (_defaulthash, hash.hexdigest())
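
# A minimal sketch (not part of the original module): _hashfile returns the
# default digest prefixed with the algorithm name, e.g. 'sha512:<hexdigest>',
# which is the same form makehash() accepts in strict mode.  The path below
# is hypothetical.
if False: # pragma: no cover
    assert ObjectStore.makehash(_hashfile('somefile.txt')) == \
        _hashfile('somefile.txt')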

class FileObject(MDBase):
    _type = 'file'

    @staticmethod
    def make_id(fname):
        '''Take a local file name, and make the id for it.  Note that this
        converts from the local path separator to a forward slash so
        that it will be the same between Windows and Unix systems.'''

        fname = os.path.realpath(fname)
        return uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
            '/'.join(os.path.split(fname)))

    @classmethod
    def from_file(cls, filename, created_by_ref):
        s = os.stat(filename)
        obj = {
            'dir': os.path.dirname(filename),
            'created_by_ref': created_by_ref,
            'filename': os.path.basename(filename),
            'id': cls.make_id(filename),
            'mtime': datetime.datetime.utcfromtimestamp(s.st_mtime),
            'size': s.st_size,
            'hashes': ( _hashfile(filename), ),
        }
        return cls(obj)
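
# A minimal usage sketch (not part of the original module): FileObject ids
# are deterministic UUIDv5 values derived from the file's absolute path, so
# the same path always maps to the same id.  The path is hypothetical.
if False: # pragma: no cover
    fobj = FileObject.from_file('somefile.txt',
        '867c7563-79ae-435c-a265-9d8509cefac5')
    assert fobj.id == FileObject.make_id('somefile.txt')
    assert fobj.hashes[0].startswith('sha512:')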

def enumeratedir(_dir, created_by_ref):
    '''Enumerate all the files and directories (not recursive) in _dir.

    Returned is a list of FileObjects.'''

    return map(lambda x: FileObject.from_file(os.path.join(_dir, x),
        created_by_ref), os.listdir(_dir))
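
# A minimal sketch (not part of the original module): enumeratedir builds one
# FileObject per directory entry.  The directory name is hypothetical.
if False: # pragma: no cover
    for f in enumeratedir('somedir', '867c7563-79ae-435c-a265-9d8509cefac5'):
        print '%s %d %s' % (f.filename, f.size, f.hashes[0])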

def main():
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option('-l', action='store_true', dest='list',
        default=False, help='list metadata')

    options, args = parser.parse_args()

    storefname = os.path.expanduser('~/.medashare_store.pasn1')
    import sys
    #print >>sys.stderr, `storefname`
    objstr = ObjectStore.load(storefname)

    for i in args:
        for j in objstr.by_file(i):
            for k, v in _iterdictlist(j):
                print '%s:\t%s' % (k, v)

    #objstr.store()

if __name__ == '__main__': # pragma: no cover
    main()
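
# Example command-line invocation (a sketch; both the module file name and
# the target file name below are hypothetical):
#
#   python medashare.py -l somefile.txt
#
# This prints the known metadata for somefile.txt, loading the object store
# from the default location, ~/.medashare_store.pasn1.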

class _TestCases(unittest.TestCase):
    created_by_ref = '867c7563-79ae-435c-a265-9d8509cefac5'

    def setUp(self):
        d = os.path.realpath(tempfile.mkdtemp())
        self.basetempdir = d
        self.tempdir = os.path.join(d, 'subdir')

        shutil.copytree(os.path.join('fixtures', 'testfiles'),
            self.tempdir)

    def tearDown(self):
        shutil.rmtree(self.basetempdir)
        self.tempdir = None

    def test_mdbase(self):
        self.assertRaises(ValueError, MDBase.create_obj, { 'type': 'unknosldkfj' })
        self.assertRaises(ValueError, MDBase.create_obj, { 'type': 'metadata' })

        baseobj = {
            'type': 'metadata',
            'created_by_ref': self.created_by_ref,
        }
        origbase = copy.deepcopy(baseobj)

        # that when an MDBase object is created
        md = MDBase.create_obj(baseobj)

        # it doesn't modify the passed-in object (when adding
        # generated properties)
        self.assertEqual(baseobj, origbase)

        # and it has the generated properties
        # Note: cannot mock the functions as they are already
        # referenced at creation time
        self.assertIn('uuid', md)
        self.assertIn('modified', md)

        # That you can create a new version using new_version
        md2 = md.new_version(('dc:creator', 'Jim Bob',))

        # that they are different
        self.assertNotEqual(md, md2)

        # and that the new modified time is different from the old
        self.assertNotEqual(md.modified, md2.modified)

        # and that the modification is present
        self.assertEqual(md2['dc:creator'], [ 'Jim Bob' ])

    def test_makehash(self):
        self.assertRaises(ValueError, ObjectStore.makehash, 'slkj')
        self.assertRaises(ValueError, ObjectStore.makehash, 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ADA')
        self.assertRaises(ValueError, ObjectStore.makehash, 'bogushash:9e0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ADA', strict=False)

        self.assertEqual(ObjectStore.makehash('cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e', strict=False), 'sha512:cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
        self.assertEqual(ObjectStore.makehash('e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', strict=False), 'sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855')

    def test_enumeratedir(self):
        files = enumeratedir(self.tempdir, self.created_by_ref)
        ftest = files[0]
        fname = 'test.txt'

        # make sure that they are of type MDBase
        self.assertIsInstance(ftest, MDBase)

        oldid = ftest.id

        self.assertEqual(ftest.filename, fname)
        self.assertEqual(ftest.dir, self.tempdir)
        # XXX - do we add host information?
        self.assertEqual(ftest.id, uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
            '/'.join(os.path.split(self.tempdir) + ( fname, ))))
        self.assertEqual(ftest.mtime, datetime.datetime(2019, 5, 20, 21, 47, 36))
        self.assertEqual(ftest.size, 15)
        self.assertIn('sha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f', ftest.hashes)

        # XXX - make sure works w/ relative dirs
        files = enumeratedir(os.path.relpath(self.tempdir),
            self.created_by_ref)
        self.assertEqual(oldid, files[0].id)

    def test_objectstore(self):
        objst = ObjectStore.load(os.path.join('fixtures', 'sample.data.pasn1'))

        objst.loadobj({
            'type': 'metadata',
            'uuid': 'c9a1d1e2-3109-4efd-8948-577dc15e44e7',
            'modified': datetime.datetime(2019, 5, 31, 14, 3, 10),
            'created_by_ref': self.created_by_ref,
            'hashes': [ 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada' ],
            'lang': 'en',
        })

        lst = objst.by_hash('91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada')
        self.assertEqual(len(lst), 2)

        byid = objst.by_id('3e466e06-45de-4ecc-84ba-2d2a3d970e96')

        self.assertIsInstance(byid, MetaData)
        self.assertIn(byid, lst)

        r = byid

        self.assertEqual(r.uuid, '3e466e06-45de-4ecc-84ba-2d2a3d970e96')
        self.assertEqual(r['dc:creator'], [ u'John-Mark Gurney' ])

        fname = 'testfile.pasn1'
        objst.store(fname)

        with open(fname) as fp:
            objs = _asn1coder.loads(fp.read())
        os.unlink(fname)

        self.assertEqual(len(objs), len(objst))
        self.assertEqual(objs['created_by_ref'], self.created_by_ref)

        for i in objs['objects']:
            self.assertEqual(objst.by_id(i['uuid']), i)

        testfname = os.path.join(self.tempdir, 'test.txt')
        self.assertEqual(objst.by_file(testfname), [ byid ])
        self.assertEqual(objst.by_file(testfname), [ byid ])

        self.assertRaises(KeyError, objst.by_file, '/dev/null')

        # XXX make sure that object store contains fileobject

        # Tests to add:
        # Non-duplicates when same metadata is located by multiple hashes.

    def test_main(self):
        # Test the main runner, this is only testing things that are
        # specific to running the program, like where the store is
        # created.

        # setup object store
        storefname = os.path.join(self.tempdir, 'storefname')
        shutil.copy(os.path.join('fixtures', 'sample.data.pasn1'), storefname)

        # setup test fname
        testfname = os.path.join(self.tempdir, 'test.txt')

        import sys
        import StringIO

        with mock.patch('os.path.expanduser', side_effect=(storefname, )) \
                as eu:
            with nested(mock.patch('sys.stdout', StringIO.StringIO()),
                    mock.patch('sys.argv',
                    [ 'progname', '-l', testfname ])) as (stdout, argv):
                main()

        self.assertEqual(stdout.getvalue(),
            'dc:creator:\tJohn-Mark Gurney\nhashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n')

        eu.assert_called_with('~/.medashare_store.pasn1')

        if False: # pragma: no cover
            # Example of how to force proper output
            with mock.patch('sys.stdout', StringIO.StringIO()) as ssw:
                print 'foobar'
                self.assertEqual(ssw.getvalue(), 'foobar\n')