
improve search performance; minor dump improvements (dump by uuid or hash)

the dump improvements still need tests
main
John-Mark Gurney committed 1 year ago
parent commit 6056bbbdc7
4 changed files with 324 additions and 46 deletions

  1. ui/medashare/alembic/versions/bb98c5a2e486_create_index_on_metadata_object_type.py (+38 −0)
  2. ui/medashare/alembic/versions/dff0d9ed0be1_add_property_index.py (+62 −0)
  3. ui/medashare/cli.py (+200 −45)
  4. ui/medashare/orm.py (+24 −1)

ui/medashare/alembic/versions/bb98c5a2e486_create_index_on_metadata_object_type.py (+38 −0)

@@ -0,0 +1,38 @@
"""create index on metadata object type

Revision ID: bb98c5a2e486
Revises: dff0d9ed0be1
Create Date: 2023-04-13 02:16:52.359947

"""
from alembic import op
import sqlalchemy as sa
import medashare
from medashare import mdb


# revision identifiers, used by Alembic.
revision = 'bb98c5a2e486'
down_revision = 'dff0d9ed0be1'
branch_labels = None
depends_on = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_index('idx_type', 'metadata_objects', ['type'], unique=False)
    # ### end Alembic commands ###

    connection = op.get_bind()

    mdo = sa.schema.MetaData()
    #mdotbl = sa.Table('metadata_objects', mdo, autoload_with=connection.engine)

    #stmt = sa.select(mdotbl.c.uuid, mdotbl.c.data)
    #newtypes = [ dict(olduuid=uuid, newtype=mdb.MDBase.decode(data).type) for
    #    uuid, data in connection.execute(stmt) ]

def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_index('idx_type', table_name='metadata_objects')
    # ### end Alembic commands ###

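The index above backs the type-filtered scans that the reworked cmd_search issues later in this commit (mdofile.type == 'file', mdomd.type == 'metadata'). A minimal sketch of checking that SQLite actually picks the index up, assuming the object store lives in a local store.sqlite3 file (that path is a stand-in, not something this commit defines):

import sqlite3

# EXPLAIN QUERY PLAN shows whether idx_type is used; expect a row like
# 'SEARCH metadata_objects USING INDEX idx_type (type=?)'
conn = sqlite3.connect('store.sqlite3')     # hypothetical store path
for row in conn.execute(
        "EXPLAIN QUERY PLAN "
        "SELECT uuid FROM metadata_objects WHERE type = 'file'"):
    print(row)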
ui/medashare/alembic/versions/dff0d9ed0be1_add_property_index.py (+62 −0)

@@ -0,0 +1,62 @@
"""add property index

Revision ID: dff0d9ed0be1
Revises: f2131e9ae4db
Create Date: 2023-04-12 11:45:53.995445

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.orm import Session
import medashare
from medashare import mdb
from medashare.cli import StringCache, ObjectStore


# revision identifiers, used by Alembic.
revision = 'dff0d9ed0be1'
down_revision = 'f2131e9ae4db'
branch_labels = None
depends_on = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('strings',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('str', sa.String(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('str')
    )
    op.create_table('propmap',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('obj', medashare.orm.UUID(length=32), nullable=False),
        sa.Column('keyid', sa.Integer(), nullable=False),
        sa.Column('valueid', sa.Integer(), nullable=True),
        sa.ForeignKeyConstraint(['keyid'], ['strings.id'], ),
        sa.ForeignKeyConstraint(['obj'], ['metadata_objects.uuid'], ),
        sa.ForeignKeyConstraint(['valueid'], ['strings.id'], ),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('obj', 'keyid', 'valueid')
    )
    # ### end Alembic commands ###

    connection = op.get_bind()

    mdo = sa.schema.MetaData()
    mdotbl = sa.Table('metadata_objects', mdo, autoload_with=connection.engine)

    stmt = sa.select(mdotbl.c.data).where(mdotbl.c.type == 'metadata')

    with Session(connection) as session:
        strcache = StringCache(session)

        for (data, ) in connection.execute(stmt):
            obj = mdb.MDBase.decode(data)
            ObjectStore._update_metadata_indexes(session, obj, strcache)

        # flush the still-pending PropertyMapping rows onto the
        # connection; closing the session without this would discard
        # the rows added for the last object
        session.flush()

def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table('propmap')
    op.drop_table('strings')
    # ### end Alembic commands ###

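After this migration the searchable properties live in two tables: strings interns every distinct key and value string once, and propmap ties a metadata object's UUID to (keyid, valueid) pairs. A small sketch, assuming a scratch copy of the store, of confirming the resulting schema with SQLAlchemy's inspector rather than reading the DDL:

from sqlalchemy import create_engine, inspect

engine = create_engine('sqlite:///store.sqlite3')   # stand-in store path
insp = inspect(engine)
print([ c['name'] for c in insp.get_columns('strings') ])
# ['id', 'str']
print([ c['name'] for c in insp.get_columns('propmap') ])
# ['id', 'obj', 'keyid', 'valueid']
print(insp.get_unique_constraints('propmap'))
# includes the ('obj', 'keyid', 'valueid') constraint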
ui/medashare/cli.py (+200 −45)

@@ -21,6 +21,10 @@ if False:

from .utils import _debprint

def _getquery(q, objstr):
    return repr(str(q.compile(objstr._engine,
        compile_kwargs={"literal_binds": True})).replace('\n', ' '))

#import pdb, sys; mypdb = pdb.Pdb(stdout=sys.stderr); mypdb.set_trace()

from edgold.ed448 import EDDSA448
@@ -54,7 +58,7 @@ import shutil
import socket
import sqlalchemy
from sqlalchemy import create_engine, select, insert, func, delete
-from sqlalchemy.orm import sessionmaker, aliased
+from sqlalchemy.orm import sessionmaker, aliased, load_only
import string
import subprocess
import sys
@@ -279,6 +283,7 @@ class ObjectStore(object):
    # looking up the UUIDv5 for FileObjects.

    def __init__(self, engine, version='head'):
        # Uncomment when working on the db schema
        #orm.Base.metadata.create_all(engine)

        self._engine = engine
@@ -407,6 +412,38 @@ class ObjectStore(object):
            d = orm.Dummy(id=1)
            session.add(d)

    @staticmethod
    def _update_metadata_indexes(session, obj, strcache):
        # sqlalchemy doesn't cache inserts, so don't insert dups
        # ourselves
        propmapcache = set()

        # clear out old data
        stmt = delete(orm.PropertyMapping).where(
            orm.PropertyMapping.obj == obj.uuid)
        session.execute(stmt)

        props = [ x for x in obj.items() if x[0] not in {
            'hashes',
            'sig',
            'parent_refs',
        } ]
        for k, vids in props:
            kid = strcache[k]

            if not isinstance(vids, list):
                vids = [ vids ]

            vids = [ strcache[sv] for sv in vids ]

            for v in vids:
                if (obj.uuid, kid, v) in propmapcache:
                    continue

                session.add(orm.PropertyMapping(obj=obj.uuid,
                    keyid=kid, valueid=v))
                propmapcache.add((obj.uuid, kid, v))

    def loadobj(self, obj):
        '''Load obj into the data store.'''

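To make the flattening concrete: every non-excluded key of a metadata object becomes one propmap row per value, with list values split into one row per element and exact duplicates suppressed by the propmapcache set. A self-contained toy run of the same logic, with plain strings standing in for obj.uuid and the StringCache ids:

props = { 'artist': 'Some Band', 'tags': [ 'live', 'live', 'bootleg' ],
    'sig': 'excluded', 'hashes': [ 'excluded' ] }

rows = set()
for k, vids in props.items():
    if k in { 'hashes', 'sig', 'parent_refs' }:
        continue
    if not isinstance(vids, list):
        vids = [ vids ]
    for v in vids:
        rows.add((k, v))    # mirrors propmapcache's dedup

print(sorted(rows))
# [('artist', 'Some Band'), ('tags', 'bootleg'), ('tags', 'live')]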
@@ -457,6 +494,10 @@ class ObjectStore(object):
                    hostid=uuid.UUID(a), objid=obj.uuid))(
                    *x.split(':', 1)) for x in obj.mapping ]
                session.add_all(maps)
            elif obj.type == 'metadata':
                self._update_metadata_indexes(session, obj,
                    StringCache(session))

            try:
                hashes = obj.hashes
            except AttributeError:
@@ -834,6 +875,32 @@ def _get_paths(options):

    return ( os.path.expanduser('~/' + x) for x in fnames )

class StringCache:
    def __init__(self, session):
        self._ses = session
        self._cache = {}

    def __getitem__(self, k):
        try:
            return self._cache[k]
        except KeyError:
            pass

        v = self._ses.execute(select(orm.StringTable.id).where(
            orm.StringTable.str == k)).first()
        if v is None:
            # not present, insert it (session.add() returns None;
            # autoflush makes the new row visible to the re-select)
            self._ses.add(orm.StringTable(str=k))

            v = self._ses.execute(select(orm.StringTable.id)
                .where(orm.StringTable.str == k)).first()

        v = v[0]

        self._cache[k] = v

        return v

def init_datastructs(f):
    @functools.wraps(f)
    def wrapper(options):
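StringCache is a get-or-insert intern table: a lookup first consults the in-memory dict, then SELECTs the id, and on a miss adds a StringTable row and re-SELECTs, relying on SQLAlchemy's autoflush to make the pending row visible to that second query. A minimal usage sketch against a throwaway in-memory database (the setup lines are assumptions, not part of this commit):

from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from medashare import orm
from medashare.cli import StringCache

engine = create_engine('sqlite://')         # throwaway in-memory db
orm.Base.metadata.create_all(engine)

with Session(engine) as session:
    cache = StringCache(session)
    a = cache['artist']     # miss: inserts the row, returns its id
    b = cache['artist']     # hit: answered from the dict, no SQL
    assert a == b
    session.commit()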
@@ -1266,6 +1333,16 @@ def cmd_interactive(options, persona, objstr, cache):

@init_datastructs
def cmd_dump(options, persona, objstr, cache):
    if options.dump_uuids or options.dump_hashes:
        for i in options.dump_uuids:
            print(objstr.by_id(i).encode('json'))

        for i in options.dump_hashes:
            for j in objstr.by_hash(i):
                print(j.encode('json'))

        return

    print(persona.get_identity().encode('json'))

    for i in objstr:
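With the two new options, dump can fetch single objects by UUID or by hash instead of walking the whole store. A hedged example of driving it the way this commit's tests drive main(), via a patched sys.argv; the program name and UUID are placeholders, and an already-initialized store is assumed:

from unittest import mock
from medashare.cli import main

with mock.patch('sys.argv', [ 'medashare', 'dump',
        '--uuid', 'b0551199-1fc9-4d42-b15a-e2d3e9edcdf9' ]):
    main()      # prints the matching object encoded as JSON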
@@ -1470,56 +1547,73 @@ def cmd_search(options, persona, objstr, cache):
    searches = [ (x[0], ) + tuple(x[1:].split('=', 1)) for x in args ]
    #print(repr(searches), file=_real_stderr)

-    def testfun(x, s=searches):
-        try:
-            x = objstr.by_hash(x['hashes'][0], ('metadata',))[0]
-        except IndexError:
-            # no metadata object
-
-            # if we need anything, it's not present
-            if any(x[0] == '+' for x in s):
-                return False
-
-            return True
-
-        try:
-            for i in s:
-                try:
-                    op, key, value = i
-                except ValueError:
-                    op, key = i
-                    value = None
-
-                if op == '+':
-                    if value is None:
-                        if key not in x:
-                            return False
-                    elif value not in x[key]:
-                        return False
-                elif op == '-':
-                    if value is None:
-                        if key in x:
-                            return False
-                    elif value in x[key]:
-                        return False
-                else:
-                    raise ValueError('unhandled op: %s' % repr(op))
-            else:
-                return True
-
-        except KeyError:
-            return False
-
-    r = ( x for x in objstr if x.type == 'file' and testfun(x) )
-
-    if _type == 'file':
-        mapping = FileObject.prep_mapping(objstr.get_hostmappings())
-        r = ( x.get_lcl_name(mapping) for x in r )
-    else:
-        raise ValueError('unhandled type: %s' % repr(_type))
-
-    for i in r:
-        print(i)
+    # base object
+    mdofile = aliased(orm.MetaDataObject)
+    # hashes of base object
+    htfile = aliased(orm.HashTable)
+
+    sel = select(mdofile.data).execution_options(yield_per=10).where(
+        # we are operating on files
+        mdofile.type == 'file',
+        # we get all the hashes for the files
+        mdofile.uuid == htfile.uuid)
+
+    for i in searches:
+        # hashes to metadata objects
+        htmd = aliased(orm.HashTable)
+        # metadata objects
+        mdomd = aliased(orm.MetaDataObject)
+
+        propmap = aliased(orm.PropertyMapping)
+        skeymap = aliased(orm.StringTable)
+        svaluemap = aliased(orm.StringTable)
+
+        try:
+            op, key, value = i
+        except ValueError:
+            op, key = i
+            value = None
+
+        subq = select(func.count(propmap.id)).where(
+            # match to metadata objects
+            htfile.hash == htmd.hash,
+            # we get all the metadata for those hashes
+            htmd.uuid == mdomd.uuid,
+            mdomd.type == 'metadata',
+
+            # find their properties
+            mdomd.uuid == propmap.obj,
+
+            # that match the key
+            propmap.keyid == skeymap.id, skeymap.str == key)
+
+        if value is not None:
+            subq = subq.where(propmap.valueid == svaluemap.id,
+                svaluemap.str == value)
+
+        subq = subq.scalar_subquery()
+
+        if op == '+':
+            sel = sel.where(subq != 0)
+        elif op == '-':
+            sel = sel.where(subq == 0)
+        else:
+            raise ValueError('unhandled op: %s' % repr(op))
+
+    #_debprint('sel:', _getquery(sel, objstr))
+
+    with objstr._ses() as session:
+        r = ( x[0] for x in session.execute(sel) )
+
+        if _type == 'file':
+            mapping = FileObject.prep_mapping(
+                objstr.get_hostmappings())
+            r = ( x.get_lcl_name(mapping) for x in r )
+        else:
+            raise ValueError('unhandled type: %s' % repr(_type))
+
+        for i in r:
+            print(i)

def main():
    import argparse
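The rewrite replaces the per-object Python testfun with a single SQL statement: each +key[=value] or -key[=value] term contributes a correlated scalar subquery that counts matching propmap rows, and the outer file query keeps rows where that count is nonzero (+) or zero (-), so negation costs no extra pass. The same pattern in miniature on a toy table (all names below are illustrative, not from the codebase):

from sqlalchemy import Column, Integer, String, create_engine, func, select
from sqlalchemy.orm import aliased, declarative_base

Base = declarative_base()

class Tag(Base):    # toy stand-in for PropertyMapping
    __tablename__ = 'tags'
    id = Column(Integer, primary_key=True)
    obj = Column(Integer)
    key = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(Tag.__table__.insert(), [
        dict(obj=1, key='live'), dict(obj=1, key='flac'),
        dict(obj=2, key='studio') ])

    outer, inner = aliased(Tag), aliased(Tag)

    # correlated count of matching rows, one subquery per search term
    has_live = select(func.count(inner.id)).where(
        inner.obj == outer.obj, inner.key == 'live').scalar_subquery()

    # '+live' keeps objects with a nonzero count; '-live' would test == 0
    sel = select(outer.obj).distinct().where(has_live != 0)
    print(conn.execute(sel).all())      # [(1,)]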
@@ -1592,6 +1686,12 @@ def main():
    parser_interactive.set_defaults(func=cmd_interactive)

    parser_dump = subparsers.add_parser('dump', help='dump all the objects')
    parser_dump.add_argument('--uuid', dest='dump_uuids', action='append',
        default=[],
        help='dump the object with the specified UUID')
    parser_dump.add_argument('--hash', dest='dump_hashes', action='append',
        default=[],
        help='dump the object(s) associated w/ the specified hash')
    parser_dump.set_defaults(func=cmd_dump)

    parser_import = subparsers.add_parser('import',
@@ -1614,7 +1714,12 @@ def main():

    options = parser.parse_args()

-    fun = options.func
+    try:
+        fun = options.func
+    except AttributeError:
+        parser.print_help()
+        sys.exit(0)
+
    fun(options)

if __name__ == '__main__': # pragma: no cover
@@ -1690,6 +1795,34 @@ class _TestMigrations(unittest.TestCase):
#        for i in session.query(orm.MetaDataObject).all():
#            _debprint('c:', repr(i))

    def test_dff0d(self):
        # an object store generated at the dff0d revision
        objstr = ObjectStore(self._engine, 'dff0d9ed0be1')

        pers = Persona()
        pers.generate_key()
        objstr.loadobj(pers.get_identity())

        obj = pers.MetaData({ 'other': 'baz' })

        # that has a metadata object
        objstr.loadobj(obj)

        # migrate the database forward
        objstr._handle_migration('head')

        with objstr._ses() as session:
            # that the string table has entries
            other = session.execute(select(orm.StringTable.id)
                .where(orm.StringTable.str == 'other')).first()[0]
            baz = session.execute(select(orm.StringTable.id)
                .where(orm.StringTable.str == 'baz')).first()[0]

            # that the property mapping was populated
            pm = { (x.obj, x.keyid, x.valueid) for (x,) in
                session.execute(select(orm.PropertyMapping)) }
            self.assertEqual(pm, { (obj.uuid, other, baz) })

class _TestCases(unittest.TestCase):
    def setUp(self):
        self.fixtures = pathlib.Path('fixtures').resolve()
@@ -2332,7 +2465,16 @@ class _TestCases(unittest.TestCase):
        stdin.seek(0)

        with self.assertRaises(SystemExit) as cm:
-            main()
+            try:
+                main()
+            except AssertionError:
+                # used to nab a copy of the
+                # failed sqlite3 db
+                if False:
+                    shutil.copyfile(
+                        storefname,
+                        '/tmp/failure.sqlite3')
+                raise

            # XXX - Minor hack till other tests fixed
            sys.exit(0)
@@ -2404,6 +2546,18 @@ class _TestCases(unittest.TestCase):

        self.assertEqual(dashhelp, subhelp)

        with mock.patch('sys.stdout', io.StringIO()) as stdout, \
            mock.patch('sys.argv', [ 'progname', ]) as argv:
            with self.assertRaises(SystemExit) as cm:
                main()

                # XXX - Minor hack till other tests fixed
                sys.exit(0)

            subhelp = stdout.getvalue()

            self.assertEqual(dashhelp, subhelp)

    #@unittest.skip('temp')
    def test_cmds(self):
        cmds = sorted(self.fixtures.glob('cmd.*.json'))
@@ -2553,5 +2707,6 @@ class _TestCases(unittest.TestCase):
            'ERROR: file not found: \'foo\'\n')

# Tests to add:
# add tests for dump options uuid and hash
# expand mappings to multiple mappings, that is a -> b, b -> c, implies a -> c
# support host names in --create

ui/medashare/orm.py (+24 −1)

@@ -1,5 +1,6 @@
import uuid
-from sqlalchemy import Table, Column, DateTime, String, Integer, LargeBinary
+from sqlalchemy import Table, Column, ForeignKey, UniqueConstraint, Index
+from sqlalchemy import DateTime, String, Integer, LargeBinary
from sqlalchemy import types
from sqlalchemy.orm import declarative_base
from .cli import _debprint
@@ -61,6 +62,12 @@ class HashTable(Base):
    hash = Column(String, primary_key=True)
    uuid = Column(UUID, primary_key=True)

class StringTable(Base):
    __tablename__ = 'strings'

    id = Column(Integer, primary_key=True)
    str = Column(String, unique=True, nullable=False)

class MetaDataObject(Base):
    __tablename__ = 'metadata_objects'
@@ -69,8 +76,24 @@ class MetaDataObject(Base):
    type = Column(String)
    data = Column(MDBaseType)

    Index("idx_type", type)

    def __repr__(self):
        return \
            'MetaDataObject(uuid=%s, type=%s, modified=%s,' \
            ' data=%s)' % (repr(self.uuid), repr(self.type),
            repr(self.modified), repr(self.data))

class PropertyMapping(Base):
    __tablename__ = 'propmap'

    id = Column(Integer, primary_key=True)

    obj = Column(UUID, ForeignKey(MetaDataObject.uuid), nullable=False)
    keyid = Column(Integer, ForeignKey(StringTable.id), nullable=False)
    valueid = Column(Integer, ForeignKey(StringTable.id))

    UniqueConstraint(obj, keyid, valueid, sqlite_on_conflict='IGNORE')

    # Do not add an index on keyid only.  In real world testing, it
    # slows down performance by about 50%.
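sqlite_on_conflict='IGNORE' on the unique constraint means SQLite silently drops a duplicate (obj, keyid, valueid) insert instead of raising an integrity error, which pairs with the in-memory dedup set in _update_metadata_indexes. A plain-sqlite3 sketch of that behavior, with the table trimmed to the relevant columns:

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('''CREATE TABLE propmap (
    id INTEGER PRIMARY KEY,
    obj BLOB NOT NULL,
    keyid INTEGER NOT NULL,
    valueid INTEGER,
    UNIQUE (obj, keyid, valueid) ON CONFLICT IGNORE)''')
conn.execute("INSERT INTO propmap (obj, keyid, valueid) VALUES (X'00', 1, 2)")
conn.execute("INSERT INTO propmap (obj, keyid, valueid) VALUES (X'00', 1, 2)")
print(conn.execute('SELECT COUNT(*) FROM propmap').fetchone())  # (1,)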
