MetaData Sharing
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

298 lines
8.0 KiB

  1. #!/usr/bin/env python
  2. import copy
  3. import datetime
  4. import hashlib
  5. import pasn1
  6. import os.path
  7. import shutil
  8. import string
  9. import tempfile
  10. import unittest
  11. import uuid
  12. # The UUID for the namespace representing the path to a file
  13. _NAMESPACE_MEDASHARE_PATH = uuid.UUID('f6f36b62-3770-4a68-bc3d-dc3e31e429e6')
  14. _defaulthash = 'sha512'
  15. _validhashes = set([ 'sha256', 'sha512' ])
  16. _hashlengths = { len(getattr(hashlib, x)().hexdigest()): x for x in _validhashes }
  17. # XXX - add validation
  18. class MDBase(object):
  19. '''This is a simple wrapper that turns a JSON object into a pythonesc
  20. object where attribute accesses work.'''
  21. _generated_properties = {
  22. 'uuid': uuid.uuid4,
  23. 'modified': datetime.datetime.utcnow
  24. }
  25. _common_properties = [ 'created_by_ref' ]
  26. def __init__(self, obj):
  27. obj = copy.deepcopy(obj)
  28. if 'type' not in obj:
  29. obj['type'] = self._type
  30. for x in self._common_properties:
  31. if x not in obj:
  32. raise ValueError('common property %s not present' % `x`)
  33. for x, fun in self._generated_properties.iteritems():
  34. if x not in obj:
  35. obj[x] = fun()
  36. self._obj = obj
  37. @classmethod
  38. def create_obj(cls, obj):
  39. '''Using obj as a base, create an instead of MDBase of the
  40. correct type.
  41. If the correct type is not found, a ValueError is raised.'''
  42. ty = obj['type']
  43. for i in cls.__subclasses__():
  44. if i._type == ty:
  45. return i(obj)
  46. else:
  47. raise ValueError('Unable to find class for type %s' % `ty`)
  48. def __getattr__(self, k):
  49. return self._obj[k]
  50. def __getitem__(self, k):
  51. return self._obj[k]
  52. def __to_dict__(self):
  53. return self._obj
  54. def __eq__(self, o):
  55. return cmp(self._obj, o) == 0
  56. class MetaData(MDBase):
  57. _type = 'metadata'
  58. def _trytodict(o):
  59. try:
  60. return 'dict', o.__to_dict__()
  61. except Exception:
  62. raise TypeError('unable to find __to_dict__ on %s' % type(o))
  63. _asn1coder = pasn1.ASN1DictCoder(coerce=_trytodict)
  64. class ObjectStore(object):
  65. '''A container to store for the various Metadata objects.'''
  66. def __init__(self):
  67. self._uuids = {}
  68. self._hashes = {}
  69. @staticmethod
  70. def makehash(hashstr, strict=True):
  71. '''Take a hash string, and return a valid hash string from it.
  72. This makes sure that it is of the correct type and length.
  73. If strict is False, the function will detect the length and
  74. return a valid hash if one can be found.'''
  75. try:
  76. hash, value = hashstr.split(':')
  77. except ValueError:
  78. if strict:
  79. raise
  80. hash = _hashlengths[len(hashstr)]
  81. value = hashstr
  82. if strict and len(str(value).translate(None, string.hexdigits.lower())) != 0:
  83. raise ValueError('value has invalid hex digits (must be lower case)', value)
  84. if hash in _validhashes:
  85. return ':'.join((hash, value))
  86. raise ValueError
  87. def __len__(self):
  88. return len(self._uuids)
  89. def store(self, fname):
  90. '''Write out the objects in the store to the file named
  91. fname.'''
  92. with open(fname, 'w') as fp:
  93. fp.write(_asn1coder.dumps(self._uuids.values()))
  94. def loadobj(self, obj):
  95. '''Load obj into the data store.'''
  96. obj = MDBase.create_obj(obj)
  97. id = uuid.UUID(obj.uuid)
  98. self._uuids[id] = obj
  99. for j in obj.hashes:
  100. h = self.makehash(j)
  101. self._hashes.setdefault(h, []).append(obj)
  102. def load(self, fname):
  103. '''Load objects from the provided file name.
  104. Basic validation will be done on the objects in the file.
  105. The objects will be accessible via other methods.'''
  106. with open(fname) as fp:
  107. objs = _asn1coder.loads(fp.read())
  108. for i in objs:
  109. self.loadobj(i)
  110. def by_id(self, id):
  111. '''Look up an object by it's UUID.'''
  112. uid = uuid.UUID(id)
  113. return self._uuids[uid]
  114. def by_hash(self, hash):
  115. '''Look up an object by it's hash value.'''
  116. h = self.makehash(hash, strict=False)
  117. return self._hashes[h]
  118. def _hashfile(fname):
  119. hash = getattr(hashlib, _defaulthash)()
  120. with open(fname) as fp:
  121. r = fp.read()
  122. hash.update(r)
  123. return '%s:%s' % (_defaulthash, hash.hexdigest())
  124. class FileObject(MDBase):
  125. _type = 'file'
  126. @classmethod
  127. def from_file(cls, _dir, filename, created_by_ref):
  128. _dir = os.path.realpath(_dir)
  129. fname = os.path.join(_dir, filename)
  130. s = os.stat(fname)
  131. obj = {
  132. 'dir': _dir,
  133. 'created_by_ref': created_by_ref,
  134. 'filename': filename,
  135. 'id': uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
  136. '/'.join(os.path.split(fname))),
  137. 'mtime': datetime.datetime.utcfromtimestamp(s.st_mtime),
  138. 'size': s.st_size,
  139. 'hashes': ( _hashfile(fname), ),
  140. }
  141. return cls(obj)
  142. def enumeratedir(_dir, created_by_ref):
  143. '''Enumerate all the files and directories (not recursive) in _dir.
  144. Returned is a list of FileObjects.'''
  145. return map(lambda x: FileObject.from_file(_dir, x, created_by_ref),
  146. os.listdir(_dir))
  147. class _TestCases(unittest.TestCase):
  148. def setUp(self):
  149. d = os.path.realpath(tempfile.mkdtemp())
  150. self.basetempdir = d
  151. self.tempdir = os.path.join(d, 'subdir')
  152. shutil.copytree(os.path.join('fixtures', 'testfiles'),
  153. self.tempdir)
  154. def tearDown(self):
  155. shutil.rmtree(self.basetempdir)
  156. self.tempdir = None
  157. def test_mdbase(self):
  158. self.assertRaises(ValueError, MDBase.create_obj, { 'type': 'unknosldkfj' })
  159. self.assertRaises(ValueError, MDBase.create_obj, { 'type': 'metadata' })
  160. baseobj = {
  161. 'type': 'metadata',
  162. 'created_by_ref': '867c7563-79ae-435c-a265-9d8509cefac5',
  163. }
  164. origbase = copy.deepcopy(baseobj)
  165. md = MDBase.create_obj(baseobj)
  166. self.assertEqual(baseobj, origbase)
  167. def test_makehash(self):
  168. self.assertRaises(ValueError, ObjectStore.makehash, 'slkj')
  169. self.assertRaises(ValueError, ObjectStore.makehash, 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ADA')
  170. self.assertEqual(ObjectStore.makehash('cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e', strict=False), 'sha512:cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
  171. self.assertEqual(ObjectStore.makehash('e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', strict=False), 'sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855')
  172. def test_enumeratedir(self):
  173. files = enumeratedir(self.tempdir, '867c7563-79ae-435c-a265-9d8509cefac5')
  174. ftest = files[0]
  175. fname = 'test.txt'
  176. # make sure that they are of type MDBase
  177. self.assertIsInstance(ftest, MDBase)
  178. oldid = ftest.id
  179. self.assertEqual(ftest.filename, fname)
  180. self.assertEqual(ftest.dir, self.tempdir)
  181. # XXX - do we add host information?
  182. self.assertEqual(ftest.id, uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
  183. '/'.join(os.path.split(self.tempdir) +
  184. ( fname, ))))
  185. self.assertEqual(ftest.mtime, datetime.datetime(2019, 5, 20, 21, 47, 36))
  186. self.assertEqual(ftest.size, 15)
  187. self.assertIn('sha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f', ftest.hashes)
  188. # XXX - make sure works w/ relative dirs
  189. files = enumeratedir(os.path.relpath(self.tempdir),
  190. '867c7563-79ae-435c-a265-9d8509cefac5')
  191. self.assertEqual(oldid, files[0].id)
  192. def test_objectstore(self):
  193. objst = ObjectStore()
  194. objst.load(os.path.join('fixtures', 'sample.data.pasn1'))
  195. objst.loadobj({
  196. 'type': 'metadata',
  197. 'uuid': 'c9a1d1e2-3109-4efd-8948-577dc15e44e7',
  198. 'modified': datetime.datetime(2019, 5, 31, 14, 3, 10),
  199. 'created_by_ref': '867c7563-79ae-435c-a265-9d8509cefac5',
  200. 'hashes': [ 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada' ],
  201. 'lang': 'en',
  202. })
  203. lst = objst.by_hash('91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada')
  204. self.assertEqual(len(lst), 2)
  205. byid = objst.by_id('3e466e06-45de-4ecc-84ba-2d2a3d970e96')
  206. self.assertIsInstance(byid, MetaData)
  207. self.assertIn(byid, lst)
  208. r = byid
  209. self.assertEqual(r.uuid, '3e466e06-45de-4ecc-84ba-2d2a3d970e96')
  210. self.assertEqual(r['dc:author'], 'John-Mark Gurney')
  211. fname = 'testfile.pasn1'
  212. objst.store(fname)
  213. with open(fname) as fp:
  214. objs = _asn1coder.loads(fp.read())
  215. os.unlink(fname)
  216. self.assertEqual(len(objs), len(objst))
  217. for i in objs:
  218. self.assertEqual(objst.by_id(i['uuid']), i)