
improve search performance; minor dump improvements (dump by uuid or hash)

the dump improvements still need tests
main
John-Mark Gurney committed 1 year ago
parent commit 6056bbbdc7
4 changed files with 324 additions and 46 deletions

  1. ui/medashare/alembic/versions/bb98c5a2e486_create_index_on_metadata_object_type.py (+38 −0)
  2. ui/medashare/alembic/versions/dff0d9ed0be1_add_property_index.py (+62 −0)
  3. ui/medashare/cli.py (+200 −45)
  4. ui/medashare/orm.py (+24 −1)

ui/medashare/alembic/versions/bb98c5a2e486_create_index_on_metadata_object_type.py (+38 −0)

@@ -0,0 +1,38 @@
"""create index on metadata object type

Revision ID: bb98c5a2e486
Revises: dff0d9ed0be1
Create Date: 2023-04-13 02:16:52.359947

"""
from alembic import op
import sqlalchemy as sa
import medashare
from medashare import mdb


# revision identifiers, used by Alembic.
revision = 'bb98c5a2e486'
down_revision = 'dff0d9ed0be1'
branch_labels = None
depends_on = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_index('idx_type', 'metadata_objects', ['type'], unique=False)
    # ### end Alembic commands ###

    connection = op.get_bind()

    mdo = sa.schema.MetaData()
    #mdotbl = sa.Table('metadata_objects', mdo, autoload_with=connection.engine)

    #stmt = sa.select(mdotbl.c.uuid, mdotbl.c.data)
    #newtypes = [ dict(olduuid=uuid, newtype=mdb.MDBase.decode(data).type) for
    #    uuid, data in connection.execute(stmt) ]

def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_index('idx_type', table_name='metadata_objects')
    # ### end Alembic commands ###

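The index above backs the type-filtered scans that the reworked cmd_search issues later in this commit (mdofile.type == 'file', mdomd.type == 'metadata'). A minimal sketch of checking that SQLite actually picks the index up, assuming the object store lives in a local store.sqlite3 file (that path is a stand-in, not something this commit defines):

import sqlite3

# EXPLAIN QUERY PLAN shows whether idx_type is used; expect a row like
# 'SEARCH metadata_objects USING INDEX idx_type (type=?)'
conn = sqlite3.connect('store.sqlite3')     # hypothetical store path
for row in conn.execute(
        "EXPLAIN QUERY PLAN "
        "SELECT uuid FROM metadata_objects WHERE type = 'file'"):
    print(row)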
ui/medashare/alembic/versions/dff0d9ed0be1_add_property_index.py (+62 −0)

@@ -0,0 +1,62 @@
"""add property index

Revision ID: dff0d9ed0be1
Revises: f2131e9ae4db
Create Date: 2023-04-12 11:45:53.995445

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.orm import Session
import medashare
from medashare import mdb
from medashare.cli import StringCache, ObjectStore


# revision identifiers, used by Alembic.
revision = 'dff0d9ed0be1'
down_revision = 'f2131e9ae4db'
branch_labels = None
depends_on = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('strings',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('str', sa.String(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('str')
    )
    op.create_table('propmap',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('obj', medashare.orm.UUID(length=32), nullable=False),
        sa.Column('keyid', sa.Integer(), nullable=False),
        sa.Column('valueid', sa.Integer(), nullable=True),
        sa.ForeignKeyConstraint(['keyid'], ['strings.id'], ),
        sa.ForeignKeyConstraint(['obj'], ['metadata_objects.uuid'], ),
        sa.ForeignKeyConstraint(['valueid'], ['strings.id'], ),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('obj', 'keyid', 'valueid')
    )
    # ### end Alembic commands ###

    connection = op.get_bind()

    mdo = sa.schema.MetaData()
    mdotbl = sa.Table('metadata_objects', mdo, autoload_with=connection.engine)

    stmt = sa.select(mdotbl.c.data).where(mdotbl.c.type == 'metadata')

    with Session(connection) as session:
        strcache = StringCache(session)

        for (data, ) in connection.execute(stmt):
            obj = mdb.MDBase.decode(data)
            ObjectStore._update_metadata_indexes(session, obj, strcache)

        # flush the still-pending PropertyMapping rows onto the
        # connection; closing the session without this would discard
        # the rows added for the last object
        session.flush()

def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table('propmap')
    op.drop_table('strings')
    # ### end Alembic commands ###

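After this migration the searchable properties live in two tables: strings interns every distinct key and value string once, and propmap ties a metadata object's UUID to (keyid, valueid) pairs. A small sketch, assuming a scratch copy of the store, of confirming the resulting schema with SQLAlchemy's inspector rather than reading the DDL:

from sqlalchemy import create_engine, inspect

engine = create_engine('sqlite:///store.sqlite3')   # stand-in store path
insp = inspect(engine)
print([ c['name'] for c in insp.get_columns('strings') ])
# ['id', 'str']
print([ c['name'] for c in insp.get_columns('propmap') ])
# ['id', 'obj', 'keyid', 'valueid']
print(insp.get_unique_constraints('propmap'))
# includes the ('obj', 'keyid', 'valueid') constraint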
ui/medashare/cli.py (+200 −45)

@@ -21,6 +21,10 @@ if False:

from .utils import _debprint

def _getquery(q, objstr):
    return repr(str(q.compile(objstr._engine,
        compile_kwargs={"literal_binds": True})).replace('\n', ' '))

#import pdb, sys; mypdb = pdb.Pdb(stdout=sys.stderr); mypdb.set_trace()

from edgold.ed448 import EDDSA448
@@ -54,7 +58,7 @@ import shutil
import socket
import sqlalchemy
from sqlalchemy import create_engine, select, insert, func, delete
-from sqlalchemy.orm import sessionmaker, aliased
+from sqlalchemy.orm import sessionmaker, aliased, load_only
import string
import subprocess
import sys
@@ -279,6 +283,7 @@ class ObjectStore(object):
    # looking up the UUIDv5 for FileObjects.

    def __init__(self, engine, version='head'):
        # Uncomment when working on the db schema
        #orm.Base.metadata.create_all(engine)

        self._engine = engine
@@ -407,6 +412,38 @@ class ObjectStore(object):
            d = orm.Dummy(id=1)
            session.add(d)

    @staticmethod
    def _update_metadata_indexes(session, obj, strcache):
        # sqlalchemy doesn't cache inserts, so don't insert dups
        # ourselves
        propmapcache = set()

        # clear out old data
        stmt = delete(orm.PropertyMapping).where(
            orm.PropertyMapping.obj == obj.uuid)
        session.execute(stmt)

        props = [ x for x in obj.items() if x[0] not in {
            'hashes',
            'sig',
            'parent_refs',
        } ]
        for k, vids in props:
            kid = strcache[k]

            if not isinstance(vids, list):
                vids = [ vids ]

            vids = [ strcache[sv] for sv in vids ]

            for v in vids:
                if (obj.uuid, kid, v) in propmapcache:
                    continue

                session.add(orm.PropertyMapping(obj=obj.uuid,
                    keyid=kid, valueid=v))
                propmapcache.add((obj.uuid, kid, v))

    def loadobj(self, obj):
        '''Load obj into the data store.'''

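To make the flattening concrete: every non-excluded key of a metadata object becomes one propmap row per value, with list values split into one row per element and exact duplicates suppressed by the propmapcache set. A self-contained toy run of the same logic, with plain strings standing in for obj.uuid and the StringCache ids:

props = { 'artist': 'Some Band', 'tags': [ 'live', 'live', 'bootleg' ],
    'sig': 'excluded', 'hashes': [ 'excluded' ] }

rows = set()
for k, vids in props.items():
    if k in { 'hashes', 'sig', 'parent_refs' }:
        continue
    if not isinstance(vids, list):
        vids = [ vids ]
    for v in vids:
        rows.add((k, v))    # mirrors propmapcache's dedup

print(sorted(rows))
# [('artist', 'Some Band'), ('tags', 'bootleg'), ('tags', 'live')]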
@@ -457,6 +494,10 @@ class ObjectStore(object):
                    hostid=uuid.UUID(a), objid=obj.uuid))(
                    *x.split(':', 1)) for x in obj.mapping ]
                session.add_all(maps)
            elif obj.type == 'metadata':
                self._update_metadata_indexes(session, obj,
                    StringCache(session))

            try:
                hashes = obj.hashes
            except AttributeError:
@@ -834,6 +875,32 @@ def _get_paths(options):

    return ( os.path.expanduser('~/' + x) for x in fnames )

class StringCache:
    def __init__(self, session):
        self._ses = session
        self._cache = {}

    def __getitem__(self, k):
        try:
            return self._cache[k]
        except KeyError:
            pass

        v = self._ses.execute(select(orm.StringTable.id).where(
            orm.StringTable.str == k)).first()
        if v is None:
            # not present, insert it (session.add() returns None;
            # autoflush makes the new row visible to the re-select)
            self._ses.add(orm.StringTable(str=k))

            v = self._ses.execute(select(orm.StringTable.id)
                .where(orm.StringTable.str == k)).first()

        v = v[0]

        self._cache[k] = v

        return v

def init_datastructs(f):
    @functools.wraps(f)
    def wrapper(options):
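StringCache is a get-or-insert intern table: a lookup first consults the in-memory dict, then SELECTs the id, and on a miss adds a StringTable row and re-SELECTs, relying on SQLAlchemy's autoflush to make the pending row visible to that second query. A minimal usage sketch against a throwaway in-memory database (the setup lines are assumptions, not part of this commit):

from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from medashare import orm
from medashare.cli import StringCache

engine = create_engine('sqlite://')         # throwaway in-memory db
orm.Base.metadata.create_all(engine)

with Session(engine) as session:
    cache = StringCache(session)
    a = cache['artist']     # miss: inserts the row, returns its id
    b = cache['artist']     # hit: answered from the dict, no SQL
    assert a == b
    session.commit()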
@@ -1266,6 +1333,16 @@ def cmd_interactive(options, persona, objstr, cache):

@init_datastructs
def cmd_dump(options, persona, objstr, cache):
    if options.dump_uuids or options.dump_hashes:
        for i in options.dump_uuids:
            print(objstr.by_id(i).encode('json'))

        for i in options.dump_hashes:
            for j in objstr.by_hash(i):
                print(j.encode('json'))

        return

    print(persona.get_identity().encode('json'))

    for i in objstr:
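With the two new options, dump can fetch single objects by UUID or by hash instead of walking the whole store. A hedged example of driving it the way this commit's tests drive main(), via a patched sys.argv; the program name and UUID are placeholders, and an already-initialized store is assumed:

from unittest import mock
from medashare.cli import main

with mock.patch('sys.argv', [ 'medashare', 'dump',
        '--uuid', 'b0551199-1fc9-4d42-b15a-e2d3e9edcdf9' ]):
    main()      # prints the matching object encoded as JSON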
@@ -1470,56 +1547,73 @@ def cmd_search(options, persona, objstr, cache):
    searches = [ (x[0], ) + tuple(x[1:].split('=', 1)) for x in args ]
    #print(repr(searches), file=_real_stderr)

-    def testfun(x, s=searches):
-        try:
-            x = objstr.by_hash(x['hashes'][0], ('metadata',))[0]
-        except IndexError:
-            # no metadata object
-
-            # if we need anything, it's not present
-            if any(x[0] == '+' for x in s):
-                return False
-
-            return True
-
-        try:
-            for i in s:
-                try:
-                    op, key, value = i
-                except ValueError:
-                    op, key = i
-                    value = None
-
-                if op == '+':
-                    if value is None:
-                        if key not in x:
-                            return False
-                    elif value not in x[key]:
-                        return False
-                elif op == '-':
-                    if value is None:
-                        if key in x:
-                            return False
-                    elif value in x[key]:
-                        return False
-                else:
-                    raise ValueError('unhandled op: %s' % repr(op))
-            else:
-                return True
-
-        except KeyError:
-            return False
-
-    r = ( x for x in objstr if x.type == 'file' and testfun(x) )
-
-    if _type == 'file':
-        mapping = FileObject.prep_mapping(objstr.get_hostmappings())
-        r = ( x.get_lcl_name(mapping) for x in r )
-    else:
-        raise ValueError('unhandled type: %s' % repr(_type))
-
-    for i in r:
-        print(i)
+    # base object
+    mdofile = aliased(orm.MetaDataObject)
+    # hashes of base object
+    htfile = aliased(orm.HashTable)
+
+    sel = select(mdofile.data).execution_options(yield_per=10).where(
+        # we are operating on files
+        mdofile.type == 'file',
+        # we get all the hashes for the files
+        mdofile.uuid == htfile.uuid)
+
+    for i in searches:
+        # hashes to metadata objects
+        htmd = aliased(orm.HashTable)
+        # metadata objects
+        mdomd = aliased(orm.MetaDataObject)
+
+        propmap = aliased(orm.PropertyMapping)
+        skeymap = aliased(orm.StringTable)
+        svaluemap = aliased(orm.StringTable)
+
+        try:
+            op, key, value = i
+        except ValueError:
+            op, key = i
+            value = None
+
+        subq = select(func.count(propmap.id)).where(
+            # match to metadata objects
+            htfile.hash == htmd.hash,
+            # we get all the metadata for those hashes
+            htmd.uuid == mdomd.uuid,
+            mdomd.type == 'metadata',
+
+            # find their properties
+            mdomd.uuid == propmap.obj,
+
+            # that match the key
+            propmap.keyid == skeymap.id, skeymap.str == key)
+
+        if value is not None:
+            subq = subq.where(propmap.valueid == svaluemap.id,
+                svaluemap.str == value)
+
+        subq = subq.scalar_subquery()
+
+        if op == '+':
+            sel = sel.where(subq != 0)
+        elif op == '-':
+            sel = sel.where(subq == 0)
+        else:
+            raise ValueError('unhandled op: %s' % repr(op))
+
+    #_debprint('sel:', _getquery(sel, objstr))
+
+    with objstr._ses() as session:
+        r = ( x[0] for x in session.execute(sel) )
+
+        if _type == 'file':
+            mapping = FileObject.prep_mapping(
+                objstr.get_hostmappings())
+            r = ( x.get_lcl_name(mapping) for x in r )
+        else:
+            raise ValueError('unhandled type: %s' % repr(_type))
+
+        for i in r:
+            print(i)

def main():
    import argparse
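The rewrite replaces the per-object Python testfun with a single SQL statement: each +key[=value] or -key[=value] term contributes a correlated scalar subquery that counts matching propmap rows, and the outer file query keeps rows where that count is nonzero (+) or zero (-), so negation costs no extra pass. The same pattern in miniature on a toy table (all names below are illustrative, not from the codebase):

from sqlalchemy import Column, Integer, String, create_engine, func, select
from sqlalchemy.orm import aliased, declarative_base

Base = declarative_base()

class Tag(Base):    # toy stand-in for PropertyMapping
    __tablename__ = 'tags'
    id = Column(Integer, primary_key=True)
    obj = Column(Integer)
    key = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(Tag.__table__.insert(), [
        dict(obj=1, key='live'), dict(obj=1, key='flac'),
        dict(obj=2, key='studio') ])

    outer, inner = aliased(Tag), aliased(Tag)

    # correlated count of matching rows, one subquery per search term
    has_live = select(func.count(inner.id)).where(
        inner.obj == outer.obj, inner.key == 'live').scalar_subquery()

    # '+live' keeps objects with a nonzero count; '-live' would test == 0
    sel = select(outer.obj).distinct().where(has_live != 0)
    print(conn.execute(sel).all())      # [(1,)]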
@@ -1592,6 +1686,12 @@ def main():
    parser_interactive.set_defaults(func=cmd_interactive)

    parser_dump = subparsers.add_parser('dump', help='dump all the objects')
    parser_dump.add_argument('--uuid', dest='dump_uuids', action='append',
        default=[],
        help='dump the object with the specified UUID')
    parser_dump.add_argument('--hash', dest='dump_hashes', action='append',
        default=[],
        help='dump the object(s) associated w/ the specified hash')
    parser_dump.set_defaults(func=cmd_dump)

    parser_import = subparsers.add_parser('import',
@@ -1614,7 +1714,12 @@ def main():

    options = parser.parse_args()

-    fun = options.func
+    try:
+        fun = options.func
+    except AttributeError:
+        parser.print_help()
+        sys.exit(0)
+
    fun(options)

if __name__ == '__main__': # pragma: no cover
@@ -1690,6 +1795,34 @@ class _TestMigrations(unittest.TestCase):
#        for i in session.query(orm.MetaDataObject).all():
#            _debprint('c:', repr(i))

    def test_dff0d(self):
        # an object store generated at the dff0d revision
        objstr = ObjectStore(self._engine, 'dff0d9ed0be1')

        pers = Persona()
        pers.generate_key()
        objstr.loadobj(pers.get_identity())

        obj = pers.MetaData({ 'other': 'baz' })

        # that has a metadata object
        objstr.loadobj(obj)

        # migrate the database forward
        objstr._handle_migration('head')

        with objstr._ses() as session:
            # that the string table has entries
            other = session.execute(select(orm.StringTable.id)
                .where(orm.StringTable.str == 'other')).first()[0]
            baz = session.execute(select(orm.StringTable.id)
                .where(orm.StringTable.str == 'baz')).first()[0]

            # that the property mapping was populated
            pm = { (x.obj, x.keyid, x.valueid) for (x,) in
                session.execute(select(orm.PropertyMapping)) }
            self.assertEqual(pm, { (obj.uuid, other, baz) })

class _TestCases(unittest.TestCase):
    def setUp(self):
        self.fixtures = pathlib.Path('fixtures').resolve()
@@ -2332,7 +2465,16 @@ class _TestCases(unittest.TestCase):
        stdin.seek(0)

        with self.assertRaises(SystemExit) as cm:
-            main()
+            try:
+                main()
+            except AssertionError:
+                # used to nab a copy of the
+                # failed sqlite3 db
+                if False:
+                    shutil.copyfile(
+                        storefname,
+                        '/tmp/failure.sqlite3')
+                raise

            # XXX - Minor hack till other tests fixed
            sys.exit(0)
@@ -2404,6 +2546,18 @@ class _TestCases(unittest.TestCase):

        self.assertEqual(dashhelp, subhelp)

        with mock.patch('sys.stdout', io.StringIO()) as stdout, \
            mock.patch('sys.argv', [ 'progname', ]) as argv:
            with self.assertRaises(SystemExit) as cm:
                main()

                # XXX - Minor hack till other tests fixed
                sys.exit(0)

            subhelp = stdout.getvalue()

            self.assertEqual(dashhelp, subhelp)

    #@unittest.skip('temp')
    def test_cmds(self):
        cmds = sorted(self.fixtures.glob('cmd.*.json'))
@@ -2553,5 +2707,6 @@ class _TestCases(unittest.TestCase):
            'ERROR: file not found: \'foo\'\n')

# Tests to add:
# add tests for dump options uuid and hash
# expand mappings to multiple mappings, that is a -> b, b -> c, implies a -> c
# support host names in --create

ui/medashare/orm.py (+24 −1)

@@ -1,5 +1,6 @@
import uuid
-from sqlalchemy import Table, Column, DateTime, String, Integer, LargeBinary
+from sqlalchemy import Table, Column, ForeignKey, UniqueConstraint, Index
+from sqlalchemy import DateTime, String, Integer, LargeBinary
from sqlalchemy import types
from sqlalchemy.orm import declarative_base
from .cli import _debprint
@@ -61,6 +62,12 @@ class HashTable(Base):
    hash = Column(String, primary_key=True)
    uuid = Column(UUID, primary_key=True)

class StringTable(Base):
    __tablename__ = 'strings'

    id = Column(Integer, primary_key=True)
    str = Column(String, unique=True, nullable=False)

class MetaDataObject(Base):
    __tablename__ = 'metadata_objects'
@@ -69,8 +76,24 @@ class MetaDataObject(Base):
    type = Column(String)
    data = Column(MDBaseType)

    Index("idx_type", type)

    def __repr__(self):
        return \
            'MetaDataObject(uuid=%s, type=%s, modified=%s,' \
            ' data=%s)' % (repr(self.uuid), repr(self.type),
            repr(self.modified), repr(self.data))

class PropertyMapping(Base):
    __tablename__ = 'propmap'

    id = Column(Integer, primary_key=True)

    obj = Column(UUID, ForeignKey(MetaDataObject.uuid), nullable=False)
    keyid = Column(Integer, ForeignKey(StringTable.id), nullable=False)
    valueid = Column(Integer, ForeignKey(StringTable.id))

    UniqueConstraint(obj, keyid, valueid, sqlite_on_conflict='IGNORE')

    # Do not add an index on keyid only.  In real world testing, it
    # slows down performance by about 50%.
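sqlite_on_conflict='IGNORE' on the unique constraint means SQLite silently drops a duplicate (obj, keyid, valueid) insert instead of raising an integrity error, which pairs with the in-memory dedup set in _update_metadata_indexes. A plain-sqlite3 sketch of that behavior, with the table trimmed to the relevant columns:

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('''CREATE TABLE propmap (
    id INTEGER PRIMARY KEY,
    obj BLOB NOT NULL,
    keyid INTEGER NOT NULL,
    valueid INTEGER,
    UNIQUE (obj, keyid, valueid) ON CONFLICT IGNORE)''')
conn.execute("INSERT INTO propmap (obj, keyid, valueid) VALUES (X'00', 1, 2)")
conn.execute("INSERT INTO propmap (obj, keyid, valueid) VALUES (X'00', 1, 2)")
print(conn.execute('SELECT COUNT(*) FROM propmap').fetchone())  # (1,)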
