MetaData Sharing
 
 
 
 


#!/usr/bin/env python

import datetime
import hashlib
import pasn1
import os.path
import shutil
import string
import tempfile
import unittest
import uuid

# The UUID for the namespace representing the path to a file
_NAMESPACE_MEDASHARE_PATH = uuid.UUID('f6f36b62-3770-4a68-bc3d-dc3e31e429e6')

_defaulthash = 'sha512'
_validhashes = set([ 'sha256', 'sha512' ])
_hashlengths = { len(getattr(hashlib, x)().hexdigest()): x for x in _validhashes }
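# Note: _hashlengths maps a hex digest length back to its algorithm name,
# i.e. { 64: 'sha256', 128: 'sha512' }, so the algorithm of a bare digest can
# be guessed from its length (used by ObjectStore.makehash with strict=False).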
# XXX - add validation
class MDBase(object):
    '''This is a simple wrapper that turns a JSON object into a pythonesque
    object where attribute accesses work.'''

    _common_properties = [ 'uuid', 'type', 'modified' ]

    def __init__(self, obj):
        for x in self._common_properties:
            if x not in obj:
                raise ValueError('common property %s not present' % `x`)

        self._obj = obj

    @classmethod
    def create_obj(cls, obj):
        '''Using obj as a base, create an instance of MDBase of the
        correct type.

        If the correct type is not found, a ValueError is raised.'''

        ty = obj['type']

        for i in cls.__subclasses__():
            if i._type == ty:
                return i(obj)
        else:
            raise ValueError('Unable to find class for type %s' % `ty`)

    def __getattr__(self, k):
        return self._obj[k]

    def __getitem__(self, k):
        return self._obj[k]

    def __to_dict__(self):
        return self._obj

    def __eq__(self, o):
        return cmp(self._obj, o) == 0
class MetaData(MDBase):
    _type = 'metadata'

def _trytodict(o):
    try:
        return 'dict', o.__to_dict__()
    except Exception:
        raise TypeError('unable to find __to_dict__ on %s' % type(o))

_asn1coder = pasn1.ASN1DictCoder(coerce=_trytodict)
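# A hypothetical usage sketch (not part of the original module; the values
# below are illustrative): MDBase.create_obj() dispatches on the 'type' key
# to a subclass, and the wrapper supports both attribute and item access.
def _example_create_obj():
    md = MDBase.create_obj({
        'type': 'metadata',
        'uuid': str(uuid.uuid4()),
        'modified': datetime.datetime.utcnow(),
        'hashes': [],
    })
    assert isinstance(md, MetaData)
    assert md.type == md['type'] == 'metadata'

    return md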
class ObjectStore(object):
    '''A container to store the various MetaData objects.'''

    def __init__(self):
        self._uuids = {}
        self._hashes = {}

    @staticmethod
    def makehash(hashstr, strict=True):
        '''Take a hash string, and return a valid hash string from it.

        This makes sure that it is of the correct type and length.

        If strict is False, the function will detect the length and
        return a valid hash if one can be found.'''

        try:
            hash, value = hashstr.split(':')
        except ValueError:
            if strict:
                raise

            hash = _hashlengths[len(hashstr)]
            value = hashstr

        if strict and len(str(value).translate(None, string.hexdigits.lower())) != 0:
            raise ValueError('value has invalid hex digits (must be lower case)', value)

        if hash in _validhashes:
            return ':'.join((hash, value))

        raise ValueError

    def __len__(self):
        return len(self._uuids)

    def store(self, fname):
        '''Write out the objects in the store to the file named
        fname.'''

        with open(fname, 'w') as fp:
            fp.write(_asn1coder.dumps(self._uuids.values()))

    def loadobj(self, obj):
        '''Load obj into the data store.'''

        obj = MDBase.create_obj(obj)

        id = uuid.UUID(obj.uuid)
        self._uuids[id] = obj
        for j in obj.hashes:
            h = self.makehash(j)
            self._hashes.setdefault(h, []).append(obj)

    def load(self, fname):
        '''Load objects from the provided file name.

        Basic validation will be done on the objects in the file.

        The objects will be accessible via other methods.'''

        with open(fname) as fp:
            objs = _asn1coder.loads(fp.read())

        for i in objs:
            self.loadobj(i)

    def by_id(self, id):
        '''Look up an object by its UUID.'''

        uid = uuid.UUID(id)

        return self._uuids[uid]

    def by_hash(self, hash):
        '''Look up an object by its hash value.'''

        h = self.makehash(hash, strict=False)

        return self._hashes[h]
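# A hypothetical usage sketch (not part of the original module; the file name
# is illustrative, and the digest shown is the SHA-256 of empty input): load a
# store from disk, add an object, and look it up again by hash.
def _example_objectstore(fname='sample.data.pasn1'):
    emptyhash = 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855'

    objst = ObjectStore()
    objst.load(fname)
    objst.loadobj({
        'type': 'metadata',
        'uuid': str(uuid.uuid4()),
        'modified': datetime.datetime.utcnow(),
        'hashes': [ 'sha256:' + emptyhash ],
    })

    # a bare digest works here because by_hash() calls makehash(strict=False)
    return objst.by_hash(emptyhash)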
def _hashfile(fname):
    hash = getattr(hashlib, _defaulthash)()
    with open(fname) as fp:
        r = fp.read()
        hash.update(r)

    return '%s:%s' % (_defaulthash, hash.hexdigest())
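# Note: _hashfile() reads the whole file into memory and returns a string of
# the form 'algorithm:hexdigest' (e.g. 'sha512:cf83e135...' for an empty
# file), the same format that ObjectStore.makehash() parses.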
class FileObject(object):
    def __init__(self, _dir, filename):
        self._dir = os.path.realpath(_dir)
        self._fname = filename

        # XXX make sure this is correct
        self._id = uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
            '/'.join(os.path.split(self._dir) + ( self._fname, )))

        fname = os.path.join(_dir, filename)
        s = os.stat(fname)
        self._mtime = datetime.datetime.utcfromtimestamp(s.st_mtime)
        self._size = s.st_size
        self._hashes = ( _hashfile(fname), )

    @property
    def hashes(self):
        '''The hashes for this file.'''

        # XXX - should return a frozen dict
        return self._hashes

    @property
    def mtime(self):
        '''The last modified date of the file.'''

        return self._mtime

    @property
    def size(self):
        '''The length of the file in bytes.'''

        return self._size

    @property
    def filename(self):
        '''The name of the file.'''

        return self._fname

    @property
    def dir(self):
        '''The directory of the file.'''

        return self._dir

    @property
    def id(self):
        '''The UUID of the path to this file.'''

        return self._id
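# A hypothetical usage sketch (not part of the original module; the path and
# file name are illustrative): a FileObject's id is a UUIDv5 derived from the
# file's absolute path, so the same path always yields the same id.
def _example_fileobject(path='testfiles', name='test.txt'):
    fobj = FileObject(path, name)
    assert fobj.id == uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
        '/'.join(os.path.split(fobj.dir) + ( name, )))

    return fobj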
def enumeratedir(_dir='.'):
    '''Enumerate all the files and directories (not recursive) in _dir.

    Returned is a list of FileObjects.'''

    return map(lambda x: FileObject(_dir, x), os.listdir(_dir))
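# A hypothetical usage sketch (not part of the original module; the directory
# name is illustrative and is assumed to contain only regular files): collect
# (filename, size, hash) tuples for everything in a directory.
def _example_enumeratedir(path='testfiles'):
    return [ (x.filename, x.size, x.hashes[0]) for x in enumeratedir(path) ]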
class _TestCases(unittest.TestCase):
    def setUp(self):
        d = os.path.realpath(tempfile.mkdtemp())
        self.basetempdir = d
        self.tempdir = os.path.join(d, 'subdir')

        shutil.copytree(os.path.join('fixtures', 'testfiles'),
            self.tempdir)

    def tearDown(self):
        shutil.rmtree(self.basetempdir)
        self.tempdir = None

    def test_mdbase(self):
        self.assertRaises(ValueError, MDBase.create_obj, { 'type': 'unknosldkfj' })
        self.assertRaises(ValueError, MDBase.create_obj, { 'type': 'metadata' })

    def test_makehash(self):
        self.assertRaises(ValueError, ObjectStore.makehash, 'slkj')
        self.assertRaises(ValueError, ObjectStore.makehash, 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ADA')
        self.assertEqual(ObjectStore.makehash('cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e', strict=False), 'sha512:cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
        self.assertEqual(ObjectStore.makehash('e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', strict=False), 'sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855')

    def test_enumeratedir(self):
        files = enumeratedir(self.tempdir)
        ftest = files[0]
        fname = 'test.txt'

        oldid = ftest.id

        self.assertEqual(ftest.filename, fname)
        self.assertEqual(ftest.dir, self.tempdir)
        # XXX - do we add host information?
        self.assertEqual(ftest.id, uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
            '/'.join(os.path.split(self.tempdir) +
            ( fname, ))))
        self.assertEqual(ftest.mtime, datetime.datetime(2019, 5, 20, 21, 47, 36))
        self.assertEqual(ftest.size, 15)
        self.assertIn('sha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f', ftest.hashes)

        # XXX - make sure works w/ relative dirs
        files = enumeratedir(os.path.relpath(self.tempdir))
        self.assertEqual(oldid, files[0].id)

    def test_objectstore(self):
        objst = ObjectStore()
        objst.load(os.path.join('fixtures', 'sample.data.pasn1'))

        objst.loadobj({
            'type': 'metadata',
            'uuid': 'c9a1d1e2-3109-4efd-8948-577dc15e44e7',
            'modified': datetime.datetime(2019, 5, 31, 14, 3, 10),
            'hashes': [ 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada' ],
            'lang': 'en',
        })

        lst = objst.by_hash('91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada')
        self.assertEqual(len(lst), 2)

        byid = objst.by_id('3e466e06-45de-4ecc-84ba-2d2a3d970e96')

        self.assertIsInstance(byid, MetaData)
        self.assertIn(byid, lst)

        r = byid

        self.assertEqual(r.uuid, '3e466e06-45de-4ecc-84ba-2d2a3d970e96')
        self.assertEqual(r['dc:author'], 'John-Mark Gurney')

        fname = 'testfile.pasn1'
        objst.store(fname)

        with open(fname) as fp:
            objs = _asn1coder.loads(fp.read())
        os.unlink(fname)

        self.assertEqual(len(objs), len(objst))

        for i in objs:
            self.assertEqual(objst.by_id(i['uuid']), i)