
Improve search performance; minor dump improvements (dump objects by UUID or hash).

The dump improvements still need tests.
main
John-Mark Gurney 2 years ago
parent
commit
6056bbbdc7
4 changed files with 324 additions and 46 deletions:

  1. ui/medashare/alembic/versions/bb98c5a2e486_create_index_on_metadata_object_type.py (+38, -0)
  2. ui/medashare/alembic/versions/dff0d9ed0be1_add_property_index.py (+62, -0)
  3. ui/medashare/cli.py (+200, -45)
  4. ui/medashare/orm.py (+24, -1)
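
A minimal usage sketch of the new dump options (added to cli.py below). The --uuid and --hash flags and their append semantics come from this commit; the 'medashare' program name and the UUID/hash values are placeholders, not real objects.

# Hypothetical invocation sketch; patching sys.argv and calling main()
# mirrors how the test suite drives the CLI.
from unittest import mock

from medashare import cli

argv = [ 'medashare', 'dump',
    # --uuid and --hash may each be repeated; every matching object
    # is printed as JSON, one per line
    '--uuid', '00000000-0000-0000-0000-000000000000',
    '--hash', 'sha512:<hex digest placeholder>' ]

with mock.patch('sys.argv', argv):
    cli.main()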

ui/medashare/alembic/versions/bb98c5a2e486_create_index_on_metadata_object_type.py (+38, -0)

@@ -0,0 +1,38 @@
"""create index on metadata object type

Revision ID: bb98c5a2e486
Revises: dff0d9ed0be1
Create Date: 2023-04-13 02:16:52.359947

"""
from alembic import op
import sqlalchemy as sa
import medashare
from medashare import mdb


# revision identifiers, used by Alembic.
revision = 'bb98c5a2e486'
down_revision = 'dff0d9ed0be1'
branch_labels = None
depends_on = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_index('idx_type', 'metadata_objects', ['type'], unique=False)
    # ### end Alembic commands ###

    connection = op.get_bind()

    mdo = sa.schema.MetaData()
    #mdotbl = sa.Table('metadata_objects', mdo, autoload_with=connection.engine)

    #stmt = sa.select(mdotbl.c.uuid, mdotbl.c.data)
    #newtypes = [ dict(olduuid=uuid, newtype=mdb.MDBase.decode(data).type) for
    #    uuid, data in connection.execute(stmt) ]

def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_index('idx_type', table_name='metadata_objects')
    # ### end Alembic commands ###

ui/medashare/alembic/versions/dff0d9ed0be1_add_property_index.py (+62, -0)

@@ -0,0 +1,62 @@
"""add property index

Revision ID: dff0d9ed0be1
Revises: f2131e9ae4db
Create Date: 2023-04-12 11:45:53.995445

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.orm import Session
import medashare
from medashare import mdb
from medashare.cli import StringCache, ObjectStore


# revision identifiers, used by Alembic.
revision = 'dff0d9ed0be1'
down_revision = 'f2131e9ae4db'
branch_labels = None
depends_on = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('strings',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('str', sa.String(), nullable=False),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('str')
    )
    op.create_table('propmap',
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('obj', medashare.orm.UUID(length=32), nullable=False),
        sa.Column('keyid', sa.Integer(), nullable=False),
        sa.Column('valueid', sa.Integer(), nullable=True),
        sa.ForeignKeyConstraint(['keyid'], ['strings.id'], ),
        sa.ForeignKeyConstraint(['obj'], ['metadata_objects.uuid'], ),
        sa.ForeignKeyConstraint(['valueid'], ['strings.id'], ),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('obj', 'keyid', 'valueid')
    )
    # ### end Alembic commands ###

    connection = op.get_bind()

    mdo = sa.schema.MetaData()
    mdotbl = sa.Table('metadata_objects', mdo, autoload_with=connection.engine)

    stmt = sa.select(mdotbl.c.data).where(mdotbl.c.type == 'metadata')

    with Session(connection) as session:
        strcache = StringCache(session)

        for (data, ) in connection.execute(stmt):
            obj = mdb.MDBase.decode(data)
            ObjectStore._update_metadata_indexes(session, obj, strcache)

def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_table('propmap')
    op.drop_table('strings')
    # ### end Alembic commands ###

ui/medashare/cli.py (+200, -45)

@@ -21,6 +21,10 @@ if False:


from .utils import _debprint

+def _getquery(q, objstr):
+	return repr(str(q.compile(objstr._engine,
+	    compile_kwargs={"literal_binds": True})).replace('\n', ' '))
+
#import pdb, sys; mypdb = pdb.Pdb(stdout=sys.stderr); mypdb.set_trace()

from edgold.ed448 import EDDSA448
@@ -54,7 +58,7 @@ import shutil
import socket
import sqlalchemy
from sqlalchemy import create_engine, select, insert, func, delete
-from sqlalchemy.orm import sessionmaker, aliased
+from sqlalchemy.orm import sessionmaker, aliased, load_only
import string
import subprocess
import sys
@@ -279,6 +283,7 @@ class ObjectStore(object):
	# looking up the UUIDv5 for FileObjects.

	def __init__(self, engine, version='head'):
+		# Uncomment when working on the db schema
		#orm.Base.metadata.create_all(engine)

		self._engine = engine
@@ -407,6 +412,38 @@ class ObjectStore(object):
			d = orm.Dummy(id=1)
			session.add(d)

+	@staticmethod
+	def _update_metadata_indexes(session, obj, strcache):
+		# sqlalchemy doesn't cache inserts, so don't insert dups
+		# ourselves
+		propmapcache = set()
+
+		# clear out old data
+		stmt = delete(orm.PropertyMapping).where(
+		    orm.PropertyMapping.obj == obj.uuid)
+		session.execute(stmt)
+
+		props = [ x for x in obj.items() if x[0] not in {
+		    'hashes',
+		    'sig',
+		    'parent_refs',
+		} ]
+		for k, vids in props:
+			kid = strcache[k]
+
+			if not isinstance(vids, list):
+				vids = [ vids ]
+
+			vids = [ strcache[sv] for sv in vids ]
+
+			for v in vids:
+				if (obj.uuid, kid, v) in propmapcache:
+					continue
+
+				session.add(orm.PropertyMapping(obj=obj.uuid,
+				    keyid=kid, valueid=v))
+				propmapcache.add((obj.uuid, kid, v))
+
	def loadobj(self, obj):
		'''Load obj into the data store.'''

@@ -457,6 +494,10 @@ class ObjectStore(object):
				    hostid=uuid.UUID(a), objid=obj.uuid))(
				    *x.split(':', 1)) for x in obj.mapping ]
				session.add_all(maps)
+			elif obj.type == 'metadata':
+				self._update_metadata_indexes(session, obj,
+				    StringCache(session))

			try:
				hashes = obj.hashes
			except AttributeError:
@@ -834,6 +875,32 @@ def _get_paths(options):


	return ( os.path.expanduser('~/' + x) for x in fnames )

+class StringCache:
+	def __init__(self, session):
+		self._ses = session
+		self._cache = {}
+
+	def __getitem__(self, k):
+		try:
+			return self._cache[k]
+		except KeyError:
+			pass
+
+		v = self._ses.execute(select(orm.StringTable.id).where(
+		    orm.StringTable.str == k)).first()
+		if v is None:
+			# not present, insert it
+			st = self._ses.add(orm.StringTable(str=k))
+
+			v = self._ses.execute(select(orm.StringTable.id)
+			    .where(orm.StringTable.str == k)).first()
+
+		v = v[0]
+
+		self._cache[k] = v
+
+		return v
+
def init_datastructs(f):
	@functools.wraps(f)
	def wrapper(options):
@@ -1266,6 +1333,16 @@ def cmd_interactive(options, persona, objstr, cache):


@init_datastructs
def cmd_dump(options, persona, objstr, cache):
+	if options.dump_uuids or options.dump_hashes:
+		for i in options.dump_uuids:
+			print(objstr.by_id(i).encode('json'))
+
+		for i in options.dump_hashes:
+			for j in objstr.by_hash(i):
+				print(j.encode('json'))
+
+		return
+
	print(persona.get_identity().encode('json'))

	for i in objstr:
@@ -1470,56 +1547,73 @@ def cmd_search(options, persona, objstr, cache):
	searches = [ (x[0], ) + tuple(x[1:].split('=', 1)) for x in args ]
	#print(repr(searches), file=_real_stderr)

-	def testfun(x, s=searches):
-		try:
-			x = objstr.by_hash(x['hashes'][0], ('metadata',))[0]
-		except IndexError:
-			# no metadata object
-
-			# if we need anything, it's not present
-			if any(x[0] == '+' for x in s):
-				return False
-
-			return True
-
-		try:
-			for i in s:
-				try:
-					op, key, value = i
-				except ValueError:
-					op, key = i
-					value = None
-
-				if op == '+':
-					if value is None:
-						if key not in x:
-							return False
-					elif value not in x[key]:
-						return False
-				elif op == '-':
-					if value is None:
-						if key in x:
-							return False
-					elif value in x[key]:
-						return False
-				else:
-					raise ValueError('unhandled op: %s' % repr(op))
-			else:
-				return True
-		except KeyError:
-			return False
-
-	r = ( x for x in objstr if x.type == 'file' and testfun(x) )
-
-	if _type == 'file':
-		mapping = FileObject.prep_mapping(objstr.get_hostmappings())
-		r = ( x.get_lcl_name(mapping) for x in r )
-	else:
-		raise ValueError('unhandled type: %s' % repr(_type))
-
-	for i in r:
-		print(i)
+	# base object
+	mdofile = aliased(orm.MetaDataObject)
+	# hashes of base object
+	htfile = aliased(orm.HashTable)
+
+	sel = select(mdofile.data).execution_options(yield_per=10).where(
+	    # we are operating on files
+	    mdofile.type == 'file',
+	    # we get all the hashes for the files
+	    mdofile.uuid == htfile.uuid)
+
+	for i in searches:
+		# hashes to metadata objects
+		htmd = aliased(orm.HashTable)
+		# metadataobjects
+		mdomd = aliased(orm.MetaDataObject)
+
+		propmap = aliased(orm.PropertyMapping)
+		skeymap = aliased(orm.StringTable)
+		svaluemap = aliased(orm.StringTable)
+
+		try:
+			op, key, value = i
+		except ValueError:
+			op, key = i
+			value = None
+
+		subq = select(func.count(propmap.id)).where(
+		    # match to metadata objects
+		    htfile.hash == htmd.hash,
+		    # we get all the metadata for those hashes
+		    htmd.uuid == mdomd.uuid,
+		    mdomd.type == 'metadata',
+		    # find their properties
+		    mdomd.uuid == propmap.obj,
+		    # that match the key
+		    propmap.keyid == skeymap.id, skeymap.str == key)
+
+		if value is not None:
+			subq = subq.where(propmap.valueid == svaluemap.id,
+			    svaluemap.str == value)
+
+		subq = subq.scalar_subquery()
+
+		if op == '+':
+			sel = sel.where(subq != 0)
+		elif op == '-':
+			sel = sel.where(subq == 0)
+		else:
+			raise ValueError('unhandled op: %s' % repr(op))
+
+	#_debprint('sel:', _getquery(sel, objstr))
+
+	with objstr._ses() as session:
+		r = ( x[0] for x in session.execute(sel) )
+
+		if _type == 'file':
+			mapping = FileObject.prep_mapping(
+			    objstr.get_hostmappings())
+			r = ( x.get_lcl_name(mapping) for x in r )
+		else:
+			raise ValueError('unhandled type: %s' % repr(_type))
+
+		for i in r:
+			print(i)

def main():
	import argparse
@@ -1592,6 +1686,12 @@ def main():
	parser_interactive.set_defaults(func=cmd_interactive)

	parser_dump = subparsers.add_parser('dump', help='dump all the objects')
+	parser_dump.add_argument('--uuid', dest='dump_uuids', action='append',
+	    default=[],
+	    help='dump the object with the specified UUID')
+	parser_dump.add_argument('--hash', dest='dump_hashes', action='append',
+	    default=[],
+	    help='dump the object(s) associated w/ the specified hash')
	parser_dump.set_defaults(func=cmd_dump)

	parser_import = subparsers.add_parser('import',
@@ -1614,7 +1714,12 @@ def main():


	options = parser.parse_args()

-	fun = options.func
+	try:
+		fun = options.func
+	except AttributeError:
+		parser.print_help()
+		sys.exit(0)
+
	fun(options)


if __name__ == '__main__': # pragma: no cover
@@ -1690,6 +1795,34 @@ class _TestMigrations(unittest.TestCase):
#		for i in session.query(orm.MetaDataObject).all():
#			_debprint('c:', repr(i))

+	def test_dff0d(self):
+		# That an object store generated at the start
+		objstr = ObjectStore(self._engine, 'dff0d9ed0be1')
+
+		pers = Persona()
+		pers.generate_key()
+		objstr.loadobj(pers.get_identity())
+
+		obj = pers.MetaData({ 'other': 'baz'})
+
+		# That has a metadata object
+		objstr.loadobj(obj)
+
+		# migrate the database forward
+		objstr._handle_migration('head')
+
+		with objstr._ses() as session:
+			# that string table has entries
+			other = session.execute(select(orm.StringTable.id)
+			    .where(orm.StringTable.str == 'other')).first()[0]
+			baz = session.execute(select(orm.StringTable.id)
+			    .where(orm.StringTable.str == 'baz')).first()[0]
+
+			# that propertymapping was populated
+			pm = { (x.obj, x.keyid, x.valueid) for (x,) in
+			    session.execute(select(orm.PropertyMapping)) }
+			self.assertEqual(pm, { (obj.uuid, other, baz) })
+
class _TestCases(unittest.TestCase):
	def setUp(self):
		self.fixtures = pathlib.Path('fixtures').resolve()
@@ -2332,7 +2465,16 @@ class _TestCases(unittest.TestCase):
			stdin.seek(0)

			with self.assertRaises(SystemExit) as cm:
-				main()
+				try:
+					main()
+				except AssertionError:
+					# used to nab a copy of the
+					# failed sqlite3 db
+					if False:
+						shutil.copyfile(
+						    storefname,
+						    '/tmp/failure.sqlite3')
+					raise

				# XXX - Minor hack till other tests fixed
				sys.exit(0)
@@ -2404,6 +2546,18 @@ class _TestCases(unittest.TestCase):


		self.assertEqual(dashhelp, subhelp)

+		with mock.patch('sys.stdout', io.StringIO()) as stdout, \
+		    mock.patch('sys.argv', [ 'progname', ]) as argv:
+			with self.assertRaises(SystemExit) as cm:
+				main()
+
+				# XXX - Minor hack till other tests fixed
+				sys.exit(0)
+
+			subhelp = stdout.getvalue()
+
+			self.assertEqual(dashhelp, subhelp)
+
	#@unittest.skip('temp')
	def test_cmds(self):
		cmds = sorted(self.fixtures.glob('cmd.*.json'))
@@ -2553,5 +2707,6 @@ class _TestCases(unittest.TestCase):
		    'ERROR: file not found: \'foo\'\n')

# Tests to add:
+# add tests for dump options uuid and hash
# expand mappings to multiple mappings, that is a -> b, b -> c, implies a -> c
# support host names in --create
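
To make the rewritten cmd_search above concrete, a hedged invocation sketch. The 'search' subcommand spelling and the leading 'file' type argument are assumptions inferred from the _type handling and the '+'/'-' operator parsing; the property names and values are placeholders.

# Hypothetical search sketch.  '+key' requires the property, '-key'
# excludes it, and 'key=value' restricts the match to an exact value;
# each term becomes a correlated COUNT() subquery against the new
# propmap/strings tables instead of decoding every object in Python.
from unittest import mock

from medashare import cli

argv = [ 'medashare', 'search', 'file',
    '+artist=Example Artist',   # property must be present with this value
    '+year' ]                   # property must be present (any value)

with mock.patch('sys.argv', argv):
    cli.main()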

ui/medashare/orm.py (+24, -1)

@@ -1,5 +1,6 @@
import uuid
-from sqlalchemy import Table, Column, DateTime, String, Integer, LargeBinary
+from sqlalchemy import Table, Column, ForeignKey, UniqueConstraint, Index
+from sqlalchemy import DateTime, String, Integer, LargeBinary
from sqlalchemy import types
from sqlalchemy.orm import declarative_base
from .cli import _debprint
@@ -61,6 +62,12 @@ class HashTable(Base):
	hash = Column(String, primary_key=True)
	uuid = Column(UUID, primary_key=True)

+class StringTable(Base):
+	__tablename__ = 'strings'
+
+	id = Column(Integer, primary_key=True)
+	str = Column(String, unique=True, nullable=False)
+
class MetaDataObject(Base):
	__tablename__ = 'metadata_objects'

@@ -69,8 +76,24 @@ class MetaDataObject(Base):
	type = Column(String)
	data = Column(MDBaseType)

+	Index("idx_type", type)
+
	def __repr__(self):
		return \
		    'MetaDataObject(uuid=%s, type=%s, modified=%s,' \
		    ' data=%s)' % (repr(self.uuid), repr(self.type),
		    repr(self.modified), repr(self.data))

+class PropertyMapping(Base):
+	__tablename__ = 'propmap'
+
+	id = Column(Integer, primary_key=True)
+
+	obj = Column(UUID, ForeignKey(MetaDataObject.uuid), nullable=False)
+	keyid = Column(Integer, ForeignKey(StringTable.id), nullable=False)
+	valueid = Column(Integer, ForeignKey(StringTable.id))
+
+	UniqueConstraint(obj, keyid, valueid, sqlite_on_conflict='IGNORE')
+
+	# Do not add an index on keyid only. In real world testing, it
+	# slows down performance by about 50%.
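
A small debugging sketch using the _getquery() helper added in cli.py to inspect the SQL a property lookup compiles to. Building an ObjectStore over an in-memory SQLite engine, and the 'artist' key, are assumptions for illustration only.

# Hypothetical sketch: print the literal SQL for a simple propmap/strings join.
from sqlalchemy import create_engine, select
from sqlalchemy.orm import aliased

from medashare import orm
from medashare.cli import ObjectStore, _getquery

engine = create_engine('sqlite://')     # throwaway in-memory store (assumption)
objstr = ObjectStore(engine)            # assumed to migrate the schema to 'head'

propmap = aliased(orm.PropertyMapping)
skeymap = aliased(orm.StringTable)

# uuids of metadata objects that carry an 'artist' property
q = select(propmap.obj).where(
    propmap.keyid == skeymap.id, skeymap.str == 'artist')

print(_getquery(q, objstr))             # compiled SQL with literal binds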
