MetaData Sharing
#!/usr/bin/env python

import copy
import datetime
import hashlib
import mock
import os.path
import pasn1
import shutil
import string
import tempfile
import unittest
import uuid

from contextlib import nested

# The UUID for the namespace representing the path to a file
_NAMESPACE_MEDASHARE_PATH = uuid.UUID('f6f36b62-3770-4a68-bc3d-dc3e31e429e6')

_defaulthash = 'sha512'
_validhashes = set([ 'sha256', 'sha512' ])
_hashlengths = { len(getattr(hashlib, x)().hexdigest()): x for x in _validhashes }
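# Illustrative note (not in the original module): sha256 hex digests are 64
# characters and sha512 hex digests are 128, so _hashlengths works out to
# { 64: 'sha256', 128: 'sha512' }; ObjectStore.makehash uses it to guess the
# algorithm for a bare digest when strict is False.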
def _iterdictlist(obj):
    itms = obj.items()
    itms.sort()
    for k, v in itms:
        if isinstance(v, list):
            for i in v:
                yield k, i
        else:
            yield k, v
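# Illustrative sketch (not in the original module): _iterdictlist flattens
# list values so each element is yielded with its key. The sample dict below
# is made up for illustration.
if False: # pragma: no cover
    for k, v in _iterdictlist({ 'hashes': [ 'a', 'b' ], 'lang': 'en' }):
        print '%s: %s' % (k, v)
    # prints:
    #   hashes: a
    #   hashes: b
    #   lang: en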
# XXX - add validation
class MDBase(object):
    '''This is a simple wrapper that turns a JSON object into a pythonesque
    object where attribute accesses work.'''

    _generated_properties = {
        'uuid': uuid.uuid4,
        'modified': datetime.datetime.utcnow
    }

    _common_properties = [ 'type', 'created_by_ref' ] # XXX - add lang?
    _common_names = set(_common_properties + _generated_properties.keys())

    def __init__(self, obj):
        obj = copy.deepcopy(obj)

        if 'type' not in obj:
            obj['type'] = self._type

        for x in self._common_properties:
            if x not in obj:
                raise ValueError('common property %s not present' % `x`)

        for x, fun in self._generated_properties.iteritems():
            if x not in obj:
                obj[x] = fun()

        self._obj = obj

    @classmethod
    def create_obj(cls, obj):
        '''Using obj as a base, create an instance of MDBase of the
        correct type.

        If the correct type is not found, a ValueError is raised.'''

        if isinstance(obj, cls):
            # XXX - copy?
            return obj

        ty = obj['type']

        for i in cls.__subclasses__():
            if i._type == ty:
                return i(obj)
        else:
            raise ValueError('Unable to find class for type %s' % `ty`)

    def __getattr__(self, k):
        return self._obj[k]

    def __getitem__(self, k):
        return self._obj[k]

    def __to_dict__(self):
        return self._obj

    def __eq__(self, o):
        return cmp(self._obj, o) == 0

    def items(self, skipcommon=True):
        return [ (k, v) for k, v in self._obj.items() if k not in
            self._common_names ]

class MetaData(MDBase):
    _type = 'metadata'
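# Illustrative sketch (not in the original module): MDBase.create_obj picks
# the subclass whose _type matches obj['type'], and __init__ fills in the
# generated uuid and modified properties. The created_by_ref value below is
# borrowed from the test cases at the bottom of this file; the dc:author
# value is made up for illustration.
if False: # pragma: no cover
    md = MDBase.create_obj({
        'type': 'metadata',
        'created_by_ref': '867c7563-79ae-435c-a265-9d8509cefac5',
        'dc:author': 'Anonymous',
    })
    assert isinstance(md, MetaData)
    assert isinstance(md.uuid, uuid.UUID)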
def _trytodict(o):
    try:
        return 'dict', o.__to_dict__()
    except Exception: # pragma: no cover
        raise TypeError('unable to find __to_dict__ on %s' % type(o))

_asn1coder = pasn1.ASN1DictCoder(coerce=_trytodict)
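# Illustrative sketch (not in the original module): the coerce hook lets the
# coder serialize MDBase instances by calling __to_dict__, mirroring how
# ObjectStore.store() below writes its objects; decoding yields plain dicts,
# which MDBase.create_obj can turn back into typed objects. The field values
# below are borrowed from the test cases at the bottom of this file.
if False: # pragma: no cover
    md = MetaData({
        'uuid': 'c9a1d1e2-3109-4efd-8948-577dc15e44e7',
        'modified': datetime.datetime(2019, 5, 31, 14, 3, 10),
        'created_by_ref': '867c7563-79ae-435c-a265-9d8509cefac5',
        'lang': 'en',
    })
    buf = _asn1coder.dumps({ 'objects': [ md ] })
    objs = _asn1coder.loads(buf)
    md2 = MDBase.create_obj(objs['objects'][0])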
class ObjectStore(object):
    '''A container to store the various MetaData objects.'''

    def __init__(self, created_by_ref):
        self._created_by_ref = created_by_ref
        self._uuids = {}
        self._hashes = {}

    @staticmethod
    def makehash(hashstr, strict=True):
        '''Take a hash string, and return a valid hash string from it.

        This makes sure that it is of the correct type and length.

        If strict is False, the function will detect the length and
        return a valid hash if one can be found.'''

        try:
            hash, value = hashstr.split(':')
        except ValueError:
            if strict:
                raise

            hash = _hashlengths[len(hashstr)]
            value = hashstr

        if strict and len(str(value).translate(None, string.hexdigits.lower())) != 0:
            raise ValueError('value has invalid hex digits (must be lower case)', value)

        if hash in _validhashes:
            return ':'.join((hash, value))

        raise ValueError

    def __len__(self):
        return len(self._uuids)

    def store(self, fname):
        '''Write out the objects in the store to the file named
        fname.'''

        with open(fname, 'w') as fp:
            obj = {
                'created_by_ref': self._created_by_ref,
                'objects': self._uuids.values(),
            }
            fp.write(_asn1coder.dumps(obj))

    def loadobj(self, obj):
        '''Load obj into the data store.'''

        obj = MDBase.create_obj(obj)

        if not isinstance(obj.uuid, uuid.UUID):
            id = uuid.UUID(obj.uuid)
        else:
            id = obj.uuid
        self._uuids[id] = obj

        for j in obj.hashes:
            h = self.makehash(j)
            self._hashes.setdefault(h, []).append(obj)

    @classmethod
    def load(cls, fname):
        '''Load objects from the provided file name.

        Basic validation will be done on the objects in the file.

        The objects will be accessible via other methods.'''

        with open(fname) as fp:
            objs = _asn1coder.loads(fp.read())

        obj = cls(objs['created_by_ref'])
        for i in objs['objects']:
            obj.loadobj(i)

        return obj

    def by_id(self, id):
        '''Look up an object by its UUID.'''

        if not isinstance(id, uuid.UUID):
            uid = uuid.UUID(id)
        else:
            uid = id

        return self._uuids[uid]

    def by_hash(self, hash):
        '''Look up an object by its hash value.'''

        h = self.makehash(hash, strict=False)
        return self._hashes[h]

    def by_file(self, fname):
        '''Return a metadata object for the file named fname.'''

        fid = FileObject.make_id(fname)
        try:
            fobj = self.by_id(fid)
        except KeyError:
            # unable to find it
            fobj = FileObject.from_file(fname, self._created_by_ref)
            self.loadobj(fobj)

        for i in fobj.hashes:
            j = self.by_hash(i)

            # Filter out non-metadata objects
            j = [ x for x in j if x.type == 'metadata' ]

            if j:
                return j
        else:
            raise KeyError('unable to find metadata for file')
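# Illustrative sketch (not in the original module): typical ObjectStore use,
# mirroring the test cases at the bottom of this file. The store file name
# and the queried file name below are made up for illustration.
if False: # pragma: no cover
    # makehash guesses the algorithm from the digest length when strict is
    # False, turning a bare hex digest into an 'algo:digest' string.
    h = ObjectStore.makehash(
        'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855',
        strict=False)

    objst = ObjectStore.load('somestore.pasn1')
    for md in objst.by_file('somefile.txt'):
        for k, v in _iterdictlist(md):
            print '%s:\t%s' % (k, v)

    objst.store('somestore.pasn1')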
def _hashfile(fname):
    hash = getattr(hashlib, _defaulthash)()
    with open(fname) as fp:
        r = fp.read()
        hash.update(r)

    return '%s:%s' % (_defaulthash, hash.hexdigest())

class FileObject(MDBase):
    _type = 'file'

    @staticmethod
    def make_id(fname):
        '''Take a local file name, and make the id for it. Note that
        this converts from the local path separator to a forward slash so
        that it will be the same between Windows and Unix systems.'''

        fname = os.path.realpath(fname)
        return uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
            '/'.join(os.path.split(fname)))

    @classmethod
    def from_file(cls, filename, created_by_ref):
        s = os.stat(filename)
        obj = {
            'dir': os.path.dirname(filename),
            'created_by_ref': created_by_ref,
            'filename': os.path.basename(filename),
            'id': cls.make_id(filename),
            'mtime': datetime.datetime.utcfromtimestamp(s.st_mtime),
            'size': s.st_size,
            'hashes': ( _hashfile(filename), ),
        }

        return cls(obj)
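# Illustrative sketch (not in the original module): make_id derives a stable
# UUIDv5 from the file's resolved path, so the same path always maps to the
# same id, while from_file additionally records stat() info and the file's
# hash. The path and created_by_ref value below are only for illustration.
if False: # pragma: no cover
    fobj = FileObject.from_file('/tmp/example.txt',
        '867c7563-79ae-435c-a265-9d8509cefac5')
    assert fobj.id == FileObject.make_id('/tmp/example.txt')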
def enumeratedir(_dir, created_by_ref):
    '''Enumerate all the files and directories (not recursive) in _dir.

    Returned is a list of FileObjects.'''

    return map(lambda x: FileObject.from_file(os.path.join(_dir, x),
        created_by_ref), os.listdir(_dir))

def main():
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option('-l', action='store_true', dest='list',
        default=False, help='list metadata')

    options, args = parser.parse_args()

    storefname = os.path.expanduser('~/.medashare_store.pasn1')
    import sys
    #print >>sys.stderr, `storefname`

    objstr = ObjectStore.load(storefname)

    for i in args:
        for j in objstr.by_file(i):
            for k, v in _iterdictlist(j):
                print '%s:\t%s' % (k, v)

    #objstr.store()

if __name__ == '__main__': # pragma: no cover
    main()
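# Illustrative note (not in the original module): run from the command line,
# the tool reads its store from ~/.medashare_store.pasn1 and prints the
# stored metadata for each named file, one 'key:<TAB>value' line per entry
# (see test_main below for the exact expected output). Assuming this module
# is saved as medashare.py:
#
#   $ python medashare.py -l somefile.txt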
class _TestCases(unittest.TestCase):
    created_by_ref = '867c7563-79ae-435c-a265-9d8509cefac5'

    def setUp(self):
        d = os.path.realpath(tempfile.mkdtemp())
        self.basetempdir = d
        self.tempdir = os.path.join(d, 'subdir')

        shutil.copytree(os.path.join('fixtures', 'testfiles'),
            self.tempdir)

    def tearDown(self):
        shutil.rmtree(self.basetempdir)
        self.tempdir = None

    def test_mdbase(self):
        self.assertRaises(ValueError, MDBase.create_obj, { 'type': 'unknosldkfj' })
        self.assertRaises(ValueError, MDBase.create_obj, { 'type': 'metadata' })

        baseobj = {
            'type': 'metadata',
            'created_by_ref': self.created_by_ref,
        }
        origbase = copy.deepcopy(baseobj)
        md = MDBase.create_obj(baseobj)

        # make sure the passed-in object is not modified
        self.assertEqual(baseobj, origbase)

    def test_makehash(self):
        self.assertRaises(ValueError, ObjectStore.makehash, 'slkj')
        self.assertRaises(ValueError, ObjectStore.makehash, 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ADA')
        self.assertRaises(ValueError, ObjectStore.makehash, 'bogushash:9e0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ADA', strict=False)

        self.assertEqual(ObjectStore.makehash('cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e', strict=False), 'sha512:cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')
        self.assertEqual(ObjectStore.makehash('e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', strict=False), 'sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855')

    def test_enumeratedir(self):
        files = enumeratedir(self.tempdir, self.created_by_ref)
        ftest = files[0]
        fname = 'test.txt'

        # make sure that they are of type MDBase
        self.assertIsInstance(ftest, MDBase)

        oldid = ftest.id

        self.assertEqual(ftest.filename, fname)
        self.assertEqual(ftest.dir, self.tempdir)
        # XXX - do we add host information?
        self.assertEqual(ftest.id, uuid.uuid5(_NAMESPACE_MEDASHARE_PATH,
            '/'.join(os.path.split(self.tempdir) + ( fname, ))))
        self.assertEqual(ftest.mtime, datetime.datetime(2019, 5, 20, 21, 47, 36))
        self.assertEqual(ftest.size, 15)
        self.assertIn('sha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f', ftest.hashes)

        # XXX - make sure works w/ relative dirs
        files = enumeratedir(os.path.relpath(self.tempdir),
            self.created_by_ref)
        self.assertEqual(oldid, files[0].id)

    def test_objectstore(self):
        objst = ObjectStore.load(os.path.join('fixtures', 'sample.data.pasn1'))

        objst.loadobj({
            'type': 'metadata',
            'uuid': 'c9a1d1e2-3109-4efd-8948-577dc15e44e7',
            'modified': datetime.datetime(2019, 5, 31, 14, 3, 10),
            'created_by_ref': self.created_by_ref,
            'hashes': [ 'sha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada' ],
            'lang': 'en',
        })

        lst = objst.by_hash('91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada')
        self.assertEqual(len(lst), 2)

        byid = objst.by_id('3e466e06-45de-4ecc-84ba-2d2a3d970e96')

        self.assertIsInstance(byid, MetaData)
        self.assertIn(byid, lst)

        r = byid

        self.assertEqual(r.uuid, '3e466e06-45de-4ecc-84ba-2d2a3d970e96')
        self.assertEqual(r['dc:author'], 'John-Mark Gurney')

        fname = 'testfile.pasn1'
        objst.store(fname)

        with open(fname) as fp:
            objs = _asn1coder.loads(fp.read())
        os.unlink(fname)

        self.assertEqual(len(objs), len(objst))
        self.assertEqual(objs['created_by_ref'], self.created_by_ref)

        for i in objs['objects']:
            self.assertEqual(objst.by_id(i['uuid']), i)

        testfname = os.path.join(self.tempdir, 'test.txt')
        self.assertEqual(objst.by_file(testfname), [ byid ])

        # XXX make sure that object store contains fileobject

    def test_main(self):
        # Test the main runner; this only tests things that are specific
        # to running the program, like where the store is created.

        # setup object store
        storefname = os.path.join(self.tempdir, 'storefname')
        shutil.copy(os.path.join('fixtures', 'sample.data.pasn1'), storefname)

        # setup test fname
        testfname = os.path.join(self.tempdir, 'test.txt')

        import sys
        import StringIO

        with mock.patch('os.path.expanduser', side_effect=(storefname, )) \
            as eu:
            with nested(mock.patch('sys.stdout',
                StringIO.StringIO()), mock.patch('sys.argv',
                [ 'progname', '-l', testfname ])) as (stdout, argv):
                main()
                self.assertEqual(stdout.getvalue(),
                    'dc:author:\tJohn-Mark Gurney\nhashes:\tsha256:91751cee0a1ab8414400238a761411daa29643ab4b8243e9a91649e25be53ada\nhashes:\tsha512:7d5768d47b6bc27dc4fa7e9732cfa2de506ca262a2749cb108923e5dddffde842bbfee6cb8d692fb43aca0f12946c521cce2633887914ca1f96898478d10ad3f\nlang:\ten\n')

            eu.assert_called_with('~/.medashare_store.pasn1')

        if False: # pragma: no cover
            # Example how to force proper output
            with mock.patch('sys.stdout', StringIO.StringIO()) as ssw:
                print 'foobar'
                self.assertEqual(ssw.getvalue(), 'foobar\n')

# XXX - how to do created_by for object store?
#	store it in the loaded object?
#	if so, have to restructure how we handle loading
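# Illustrative note (not in the original module): the embedded tests can be
# run with the stock unittest runner from the project root (they expect the
# fixtures/ directory to be present), e.g. assuming the module is saved as
# medashare.py:
#
#   $ python -m unittest medashare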