diff --git a/ui/medashare/alembic/versions/bb98c5a2e486_create_index_on_metadata_object_type.py b/ui/medashare/alembic/versions/bb98c5a2e486_create_index_on_metadata_object_type.py
new file mode 100644
index 0000000..16cc498
--- /dev/null
+++ b/ui/medashare/alembic/versions/bb98c5a2e486_create_index_on_metadata_object_type.py
@@ -0,0 +1,38 @@
+"""create index on metadata object type
+
+Revision ID: bb98c5a2e486
+Revises: dff0d9ed0be1
+Create Date: 2023-04-13 02:16:52.359947
+
+"""
+from alembic import op
+import sqlalchemy as sa
+import medashare
+from medashare import mdb
+
+
+# revision identifiers, used by Alembic.
+revision = 'bb98c5a2e486'
+down_revision = 'dff0d9ed0be1'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_index('idx_type', 'metadata_objects', ['type'], unique=False)
+    # ### end Alembic commands ###
+
+    connection = op.get_bind()
+
+    mdo = sa.schema.MetaData()
+    #mdotbl = sa.Table('metadata_objects', mdo, autoload_with=connection.engine)
+
+    #stmt = sa.select(mdotbl.c.uuid, mdotbl.c.data)
+    #newtypes = [ dict(olduuid=uuid, newtype=mdb.MDBase.decode(data).type) for
+    #    uuid, data in connection.execute(stmt) ]
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_index('idx_type', table_name='metadata_objects')
+    # ### end Alembic commands ###
diff --git a/ui/medashare/alembic/versions/dff0d9ed0be1_add_property_index.py b/ui/medashare/alembic/versions/dff0d9ed0be1_add_property_index.py
new file mode 100644
index 0000000..6098611
--- /dev/null
+++ b/ui/medashare/alembic/versions/dff0d9ed0be1_add_property_index.py
@@ -0,0 +1,65 @@
+"""add property index
+
+Revision ID: dff0d9ed0be1
+Revises: f2131e9ae4db
+Create Date: 2023-04-12 11:45:53.995445
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.orm import Session
+import medashare
+from medashare import mdb
+from medashare.cli import StringCache, ObjectStore
+
+
+# revision identifiers, used by Alembic.
+revision = 'dff0d9ed0be1'
+down_revision = 'f2131e9ae4db'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('strings',
+    sa.Column('id', sa.Integer(), nullable=False),
+    sa.Column('str', sa.String(), nullable=False),
+    sa.PrimaryKeyConstraint('id'),
+    sa.UniqueConstraint('str')
+    )
+    op.create_table('propmap',
+    sa.Column('id', sa.Integer(), nullable=False),
+    sa.Column('obj', medashare.orm.UUID(length=32), nullable=False),
+    sa.Column('keyid', sa.Integer(), nullable=False),
+    sa.Column('valueid', sa.Integer(), nullable=True),
+    sa.ForeignKeyConstraint(['keyid'], ['strings.id'], ),
+    sa.ForeignKeyConstraint(['obj'], ['metadata_objects.uuid'], ),
+    sa.ForeignKeyConstraint(['valueid'], ['strings.id'], ),
+    sa.PrimaryKeyConstraint('id'),
+    sa.UniqueConstraint('obj', 'keyid', 'valueid')
+    )
+    # ### end Alembic commands ###
+
+    connection = op.get_bind()
+
+    mdo = sa.schema.MetaData()
+    mdotbl = sa.Table('metadata_objects', mdo, autoload_with=connection.engine)
+
+    stmt = sa.select(mdotbl.c.data).where(mdotbl.c.type == 'metadata')
+
+    with Session(connection) as session:
+        strcache = StringCache(session)
+
+        for (data, ) in connection.execute(stmt):
+            obj = mdb.MDBase.decode(data)
+            ObjectStore._update_metadata_indexes(session, obj, strcache)
+
+        # flush explicitly; closing the session discards unflushed rows
+        session.flush()
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_table('propmap')
+    op.drop_table('strings')
+    # ### end Alembic commands ###
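
The dff0d9ed0be1 revision above is a create-then-backfill migration: it adds the index tables, then decodes every stored metadata blob and populates the new index within the same revision, reflecting the live metadata_objects table with autoload_with rather than importing an ORM model. A minimal, self-contained sketch of that pattern follows; the items table and decode() helper are hypothetical stand-ins for metadata_objects and mdb.MDBase.decode:

    from alembic import op
    import sqlalchemy as sa

    def upgrade() -> None:
        # schema change first, so the backfill has a target
        op.create_table('kv_index',
            sa.Column('id', sa.Integer(), primary_key=True),
            sa.Column('key', sa.String(), nullable=False))

        # reflect the live tables instead of importing ORM models,
        # so the migration describes the schema as of this revision
        connection = op.get_bind()
        meta = sa.MetaData()
        items = sa.Table('items', meta, autoload_with=connection)
        kv = sa.Table('kv_index', meta, autoload_with=connection)

        # walk the existing rows and populate the new index
        for (blob,) in connection.execute(sa.select(items.c.data)):
            row = decode(blob)  # hypothetical decoder
            connection.execute(kv.insert().values(key=row['key']))
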
diff --git a/ui/medashare/cli.py b/ui/medashare/cli.py
index 71f6a9c..6f40fce 100644
--- a/ui/medashare/cli.py
+++ b/ui/medashare/cli.py
@@ -21,6 +21,10 @@ if False:
 
 from .utils import _debprint
 
+def _getquery(q, objstr):
+    return repr(str(q.compile(objstr._engine,
+        compile_kwargs={"literal_binds": True})).replace('\n', ' '))
+
 #import pdb, sys; mypdb = pdb.Pdb(stdout=sys.stderr); mypdb.set_trace()
 
 from edgold.ed448 import EDDSA448
@@ -54,7 +58,7 @@ import shutil
 import socket
 import sqlalchemy
 from sqlalchemy import create_engine, select, insert, func, delete
-from sqlalchemy.orm import sessionmaker, aliased
+from sqlalchemy.orm import sessionmaker, aliased, load_only
 import string
 import subprocess
 import sys
@@ -279,6 +283,7 @@ class ObjectStore(object):
     # looking up the UUIDv5 for FileObjects.
 
     def __init__(self, engine, version='head'):
+        # Uncomment when working on the db schema
         #orm.Base.metadata.create_all(engine)
 
         self._engine = engine
@@ -407,6 +412,38 @@
             d = orm.Dummy(id=1)
             session.add(d)
 
+    @staticmethod
+    def _update_metadata_indexes(session, obj, strcache):
+        # SQLAlchemy doesn't dedupe pending inserts, so track the
+        # rows we've already added ourselves
+        propmapcache = set()
+
+        # clear out old data
+        stmt = delete(orm.PropertyMapping).where(
+            orm.PropertyMapping.obj == obj.uuid)
+        session.execute(stmt)
+
+        props = [ x for x in obj.items() if x[0] not in {
+            'hashes',
+            'sig',
+            'parent_refs',
+        } ]
+        for k, vids in props:
+            kid = strcache[k]
+
+            if not isinstance(vids, list):
+                vids = [ vids ]
+
+            vids = [ strcache[sv] for sv in vids ]
+
+            for v in vids:
+                if (obj.uuid, kid, v) in propmapcache:
+                    continue
+
+                session.add(orm.PropertyMapping(obj=obj.uuid,
+                    keyid=kid, valueid=v))
+                propmapcache.add((obj.uuid, kid, v))
+
     def loadobj(self, obj):
         '''Load obj into the data store.'''
 
@@ -457,6 +494,10 @@
                     hostid=uuid.UUID(a), objid=obj.uuid))(
                     *x.split(':', 1)) for x in obj.mapping ]
                 session.add_all(maps)
+            elif obj.type == 'metadata':
+                self._update_metadata_indexes(session, obj,
+                    StringCache(session))
+
             try:
                 hashes = obj.hashes
             except AttributeError:
@@ -834,6 +875,32 @@ def _get_paths(options):
 
     return ( os.path.expanduser('~/' + x) for x in fnames )
 
+class StringCache:
+    def __init__(self, session):
+        self._ses = session
+        self._cache = {}
+
+    def __getitem__(self, k):
+        try:
+            return self._cache[k]
+        except KeyError:
+            pass
+
+        v = self._ses.execute(select(orm.StringTable.id).where(
+            orm.StringTable.str == k)).first()
+        if v is None:
+            # not present; insert it, then re-select to get the id
+            self._ses.add(orm.StringTable(str=k))
+
+            v = self._ses.execute(select(orm.StringTable.id)
+                .where(orm.StringTable.str == k)).first()
+
+        v = v[0]
+
+        self._cache[k] = v
+
+        return v
+
 def init_datastructs(f):
     @functools.wraps(f)
     def wrapper(options):
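
StringCache above is a get-or-create interning cache: each distinct string is stored once in the strings table and referenced by integer id from propmap, so key/value comparisons in queries become integer compares. The same idea as a dependency-free sketch using stdlib sqlite3 (schema and names hypothetical):

    import sqlite3

    con = sqlite3.connect(':memory:')
    con.execute('CREATE TABLE strings '
        '(id INTEGER PRIMARY KEY, str TEXT UNIQUE NOT NULL)')

    _cache = {}

    def intern_str(con, s):
        # fast path: already interned during this run
        if s in _cache:
            return _cache[s]

        row = con.execute('SELECT id FROM strings WHERE str = ?',
            (s,)).fetchone()
        if row is None:
            # not present; insert and use the generated id
            cur = con.execute('INSERT INTO strings (str) VALUES (?)',
                (s,))
            row = (cur.lastrowid,)

        _cache[s] = row[0]
        return row[0]

    assert intern_str(con, 'artist') == intern_str(con, 'artist')

The SQLAlchemy version re-selects after session.add() instead of reading lastrowid; the session's autoflush turns that second select into a flush-then-read, which is why no explicit flush() is needed there.
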
@@ -1266,6 +1333,16 @@ def cmd_interactive(options, persona, objstr, cache):
 
 @init_datastructs
 def cmd_dump(options, persona, objstr, cache):
+    if options.dump_uuids or options.dump_hashes:
+        for i in options.dump_uuids:
+            print(objstr.by_id(i).encode('json'))
+
+        for i in options.dump_hashes:
+            for j in objstr.by_hash(i):
+                print(j.encode('json'))
+
+        return
+
     print(persona.get_identity().encode('json'))
 
     for i in objstr:
@@ -1470,56 +1547,73 @@ def cmd_search(options, persona, objstr, cache):
     searches = [ (x[0], ) + tuple(x[1:].split('=', 1)) for x in args ]
     #print(repr(searches), file=_real_stderr)
 
-    def testfun(x, s=searches):
-        try:
-            x = objstr.by_hash(x['hashes'][0], ('metadata',))[0]
-        except IndexError:
-            # no metadata object
+    # base object
+    mdofile = aliased(orm.MetaDataObject)
+    # hashes of base object
+    htfile = aliased(orm.HashTable)
 
-            # if we need anything, it's not present
-            if any(x[0] == '+' for x in s):
-                return False
+    sel = select(mdofile.data).execution_options(yield_per=10).where(
+        # we are operating on files
+        mdofile.type == 'file',
+        # we get all the hashes for the files
+        mdofile.uuid == htfile.uuid)
 
-            return True
+    for i in searches:
+        # hashes to metadata objects
+        htmd = aliased(orm.HashTable)
+        # metadata objects
+        mdomd = aliased(orm.MetaDataObject)
+
+        propmap = aliased(orm.PropertyMapping)
+        skeymap = aliased(orm.StringTable)
+        svaluemap = aliased(orm.StringTable)
 
         try:
-            for i in s:
-                try:
-                    op, key, value = i
-                except ValueError:
-                    op, key = i
-                    value = None
-
-                if op == '+':
-                    if value is None:
-                        if key not in x:
-                            return False
-                    elif value not in x[key]:
-                        return False
-                elif op == '-':
-                    if value is None:
-                        if key in x:
-                            return False
-                    elif value in x[key]:
-                        return False
-                else:
-                    raise ValueError('unhandled op: %s' % repr(op))
-            else:
-                return True
+            op, key, value = i
+        except ValueError:
+            op, key = i
+            value = None
 
-        except KeyError:
-            return False
+        subq = select(func.count(propmap.id)).where(
+            # match to metadata objects
+            htfile.hash == htmd.hash,
+            # we get all the metadata for those hashes
+            htmd.uuid == mdomd.uuid,
+            mdomd.type == 'metadata',
 
-    r = ( x for x in objstr if x.type == 'file' and testfun(x) )
+            # find their properties
+            mdomd.uuid == propmap.obj,
 
-    if _type == 'file':
-        mapping = FileObject.prep_mapping(objstr.get_hostmappings())
-        r = ( x.get_lcl_name(mapping) for x in r )
-    else:
-        raise ValueError('unhandled type: %s' % repr(_type))
+            # that match the key
+            propmap.keyid == skeymap.id, skeymap.str == key)
+
+        if value is not None:
+            subq = subq.where(propmap.valueid == svaluemap.id,
+                svaluemap.str == value)
+
+        subq = subq.scalar_subquery()
+
+        if op == '+':
+            sel = sel.where(subq != 0)
+        elif op == '-':
+            sel = sel.where(subq == 0)
+        else:
+            raise ValueError('unhandled op: %s' % repr(op))
 
-    for i in r:
-        print(i)
+    #_debprint('sel:', _getquery(sel, objstr))
+
+    with objstr._ses() as session:
+        r = ( x[0] for x in session.execute(sel) )
+
+        if _type == 'file':
+            mapping = FileObject.prep_mapping(
+                objstr.get_hostmappings())
+            r = ( x.get_lcl_name(mapping) for x in r )
+        else:
+            raise ValueError('unhandled type: %s' % repr(_type))
+
+        for i in r:
+            print(i)
 
 def main():
     import argparse
@@ -1592,6 +1686,12 @@
     parser_interactive.set_defaults(func=cmd_interactive)
 
     parser_dump = subparsers.add_parser('dump', help='dump all the objects')
+    parser_dump.add_argument('--uuid', dest='dump_uuids', action='append',
+        default=[],
+        help='dump the object with the specified UUID')
+    parser_dump.add_argument('--hash', dest='dump_hashes', action='append',
+        default=[],
+        help='dump the object(s) associated with the specified hash')
     parser_dump.set_defaults(func=cmd_dump)
 
     parser_import = subparsers.add_parser('import',
@@ -1614,7 +1714,12 @@
 
     options = parser.parse_args()
 
-    fun = options.func
+    try:
+        fun = options.func
+    except AttributeError:
+        parser.print_help()
+        sys.exit(0)
+
     fun(options)
 
 if __name__ == '__main__': # pragma: no cover
@@ -1690,6 +1795,34 @@ class _TestMigrations(unittest.TestCase):
 #        for i in session.query(orm.MetaDataObject).all():
 #            _debprint('c:', repr(i))
 
+    def test_dff0d(self):
+        # an object store generated at revision dff0d9ed0be1
+        objstr = ObjectStore(self._engine, 'dff0d9ed0be1')
+
+        pers = Persona()
+        pers.generate_key()
+        objstr.loadobj(pers.get_identity())
+
+        obj = pers.MetaData({ 'other': 'baz'})
+
+        # that has a metadata object loaded
+        objstr.loadobj(obj)
+
+        # migrate the database forward
+        objstr._handle_migration('head')
+
+        with objstr._ses() as session:
+            # the string table has entries
+            other = session.execute(select(orm.StringTable.id)
+                .where(orm.StringTable.str == 'other')).first()[0]
+            baz = session.execute(select(orm.StringTable.id)
+                .where(orm.StringTable.str == 'baz')).first()[0]
+
+            # and propertymapping was populated
+            pm = { (x.obj, x.keyid, x.valueid) for (x,) in
+                session.execute(select(orm.PropertyMapping)) }
+            self.assertEqual(pm, { (obj.uuid, other, baz) })
+
 class _TestCases(unittest.TestCase):
     def setUp(self):
         self.fixtures = pathlib.Path('fixtures').resolve()
@@ -2332,7 +2465,16 @@
             stdin.seek(0)
 
             with self.assertRaises(SystemExit) as cm:
-                main()
+                try:
+                    main()
+                except AssertionError:
+                    # used to grab a copy of the
+                    # failed sqlite3 db
+                    if False:
+                        shutil.copyfile(
+                            storefname,
+                            '/tmp/failure.sqlite3')
+                    raise
 
                 # XXX - Minor hack till other tests fixed
                 sys.exit(0)
@@ -2404,6 +2546,18 @@
 
         self.assertEqual(dashhelp, subhelp)
 
+        with mock.patch('sys.stdout', io.StringIO()) as stdout, \
+                mock.patch('sys.argv', [ 'progname', ]) as argv:
+            with self.assertRaises(SystemExit) as cm:
+                main()
+
+                # XXX - Minor hack till other tests fixed
+                sys.exit(0)
+
+        subhelp = stdout.getvalue()
+
+        self.assertEqual(dashhelp, subhelp)
+
     #@unittest.skip('temp')
     def test_cmds(self):
         cmds = sorted(self.fixtures.glob('cmd.*.json'))
@@ -2553,5 +2707,6 @@
             'ERROR: file not found: \'foo\'\n')
 
 # Tests to add:
+# add tests for dump options uuid and hash
 # expand mappings to multiple mappings, that is a -> b, b -> c, implies a -> c
 # support host names in --create
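
The cmd_search rewrite above turns each '+key=value' / '-key=value' term into a correlated scalar COUNT subquery: '+' terms require a nonzero count, '-' terms a zero count, so filtering happens inside SQLite against the propmap index instead of decoding every object in Python. A self-contained sketch of that shape on a toy schema (all table and column names hypothetical; the real query additionally joins files to their metadata through the hash table, as shown above):

    import sqlalchemy as sa

    meta = sa.MetaData()
    objs = sa.Table('objs', meta,
        sa.Column('uuid', sa.String, primary_key=True),
        sa.Column('type', sa.String))
    props = sa.Table('props', meta,
        sa.Column('obj', sa.String),
        sa.Column('key', sa.String),
        sa.Column('value', sa.String))

    sel = sa.select(objs.c.uuid).where(objs.c.type == 'file')

    # '+key=value' -> require a matching property row
    # '-key=value' -> require its absence
    for op_, key, value in [('+', 'artist', 'x'), ('-', 'status', 'done')]:
        subq = (sa.select(sa.func.count())
            .where(props.c.obj == objs.c.uuid,
                props.c.key == key,
                props.c.value == value)
            .scalar_subquery())
        sel = sel.where(subq != 0 if op_ == '+' else subq == 0)

    engine = sa.create_engine('sqlite://')
    meta.create_all(engine)
    with engine.begin() as conn:
        conn.execute(objs.insert(), [
            {'uuid': 'u1', 'type': 'file'},
            {'uuid': 'u2', 'type': 'file'}])
        conn.execute(props.insert(), [
            {'obj': 'u1', 'key': 'artist', 'value': 'x'},
            {'obj': 'u2', 'key': 'status', 'value': 'done'}])

    with engine.connect() as conn:
        # only u1 matches: it has artist=x and lacks status=done
        print([r.uuid for r in conn.execute(sel)])
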
diff --git a/ui/medashare/orm.py b/ui/medashare/orm.py
index 9e3334d..09f6e9e 100644
--- a/ui/medashare/orm.py
+++ b/ui/medashare/orm.py
@@ -1,5 +1,6 @@
 import uuid
-from sqlalchemy import Table, Column, DateTime, String, Integer, LargeBinary
+from sqlalchemy import Table, Column, ForeignKey, UniqueConstraint, Index
+from sqlalchemy import DateTime, String, Integer, LargeBinary
 from sqlalchemy import types
 from sqlalchemy.orm import declarative_base
 from .cli import _debprint
@@ -61,6 +62,12 @@ class HashTable(Base):
     hash = Column(String, primary_key=True)
     uuid = Column(UUID, primary_key=True)
 
+class StringTable(Base):
+    __tablename__ = 'strings'
+
+    id = Column(Integer, primary_key=True)
+    str = Column(String, unique=True, nullable=False)
+
 class MetaDataObject(Base):
     __tablename__ = 'metadata_objects'
 
@@ -69,8 +76,24 @@
     type = Column(String)
     data = Column(MDBaseType)
 
+    Index("idx_type", type)
+
     def __repr__(self):
         return \
             'MetaDataObject(uuid=%s, type=%s, modified=%s,' \
             ' data=%s)' % (repr(self.uuid), repr(self.type),
             repr(self.modified), repr(self.data))
+
+class PropertyMapping(Base):
+    __tablename__ = 'propmap'
+
+    id = Column(Integer, primary_key=True)
+
+    obj = Column(UUID, ForeignKey(MetaDataObject.uuid), nullable=False)
+    keyid = Column(Integer, ForeignKey(StringTable.id), nullable=False)
+    valueid = Column(Integer, ForeignKey(StringTable.id))
+
+    UniqueConstraint(obj, keyid, valueid, sqlite_on_conflict='IGNORE')
+
+    # Do not add an index on keyid only.  In real-world testing, it
+    # slows down performance by about 50%.
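
One way to sanity-check indexing decisions like the keyid note above: SQLite's EXPLAIN QUERY PLAN shows whether a lookup already uses the implicit composite index created by the UNIQUE (obj, keyid, valueid) constraint. A minimal sketch with the schema reduced to the relevant columns:

    import sqlite3

    con = sqlite3.connect(':memory:')
    con.execute('''CREATE TABLE propmap (
        id INTEGER PRIMARY KEY,
        obj TEXT NOT NULL,
        keyid INTEGER NOT NULL,
        valueid INTEGER,
        UNIQUE (obj, keyid, valueid) ON CONFLICT IGNORE)''')

    # lookups that lead with obj can use the UNIQUE constraint's
    # composite index; a keyid-only index would mostly add write
    # and planner overhead
    for row in con.execute(
            'EXPLAIN QUERY PLAN SELECT id FROM propmap '
            'WHERE obj = ? AND keyid = ?', ('u', 1)):
        print(row)
    # prints a SEARCH ... USING COVERING INDEX sqlite_autoindex_propmap_1 row
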