From 9279a559f5bbddf09fbff62bfdc80d9e7cfe24ee Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Fri, 14 Apr 2023 13:50:04 -0700 Subject: [PATCH] significantly improve search results, especially in the exclusion case.. The original query applied a complicated test, which sqlite couldn't tell if it applied to all.. In the case of any inclusion, it's easy, only search metadata, and match to files. If all exclusion, make two parts, the part w/ a metadata object that doesn't have the exclusions, and the part w/o any metadata objects.. Both of these latter two queries can be satisfied more simply and with proper indices.. The old query might have worked fine on a more advanced DB, but this change was necessary for decent performance.. --- ui/medashare/cli.py | 116 ++++++++++++++++++++++++++++++++------------ 1 file changed, 84 insertions(+), 32 deletions(-) diff --git a/ui/medashare/cli.py b/ui/medashare/cli.py index b593e45..b4b7993 100644 --- a/ui/medashare/cli.py +++ b/ui/medashare/cli.py @@ -1552,26 +1552,17 @@ def cmd_search(options, persona, objstr, cache): _type = args.pop(0) searches = [ (x[0], ) + tuple(x[1:].split('=', 1)) for x in args ] - #print(repr(searches), file=_real_stderr) + #_debprint(repr(searches)) - # base object - mdofile = aliased(orm.MetaDataObject) - # hashes of base object - htfile = aliased(orm.HashTable) + propmap = aliased(orm.PropertyMapping) - sel = select(mdofile.data).execution_options(yield_per=10).where( - # we are operating on files - mdofile.type == 'file', - # we get all the hashes for the files - mdofile.uuid == htfile.uuid) + # propobj only returns what can match query + propobj = select(propmap.obj) - for i in searches: - # hashes to metadata objects - htmd = aliased(orm.HashTable) - # metadataobjects - mdomd = aliased(orm.MetaDataObject) + onlyexclusions = True - propmap = aliased(orm.PropertyMapping) + for i in searches: + propmapsub = aliased(orm.PropertyMapping) skeymap = aliased(orm.StringTable) svaluemap = aliased(orm.StringTable) 
@@ -1581,32 +1572,79 @@ def cmd_search(options, persona, objstr, cache): op, key = i value = None - subq = select(func.count(propmap.id)).where( - # match to metadata objects - htfile.hash == htmd.hash, - # we get all the metadata for those hashes - htmd.uuid == mdomd.uuid, - mdomd.type == 'metadata', - - # find their properties - mdomd.uuid == propmap.obj, - + subq = select(propmapsub.obj).where( # that match the key - propmap.keyid == skeymap.id, skeymap.str == key) + propmapsub.keyid == skeymap.id, skeymap.str == key) if value is not None: - subq = subq.where(propmap.valueid == svaluemap.id, + subq = subq.where(propmapsub.valueid == svaluemap.id, svaluemap.str == value) - subq = subq.scalar_subquery() + #subq = subq.subquery() if op == '+': - sel = sel.where(subq != 0) + onlyexclusions = False + propobj = propobj.where(propmap.obj.in_(subq)) elif op == '-': - sel = sel.where(subq == 0) + propobj = propobj.where(propmap.obj.notin_(subq)) else: raise ValueError('unhandled op: %s' % repr(op)) + # propobj should have all the ones we need selected, map back to + # the object we need + + # base object (file) + mdofile = aliased(orm.MetaDataObject) + # hashes of base object + htfile = aliased(orm.HashTable) + + # hashes to metadata objects + htmd = aliased(orm.HashTable) + # metadataobjects + mdomd = aliased(orm.MetaDataObject) + + sel = select(mdofile.data).where( + # we are operating on files + mdofile.type == 'file', + # we get all the hashes for the files + mdofile.uuid == htfile.uuid, + htfile.hash == htmd.hash, + mdomd.uuid == htmd.uuid, + mdomd.type == 'metadata', + ) + + if onlyexclusions: + sel = sel.where(mdomd.uuid.in_(propobj)) + + # base object (file) + mdofile = aliased(orm.MetaDataObject) + # hashes of base object + htfile = aliased(orm.HashTable) + + # hashes to metadata objects + htmd = aliased(orm.HashTable) + # metadataobjects + mdomd = aliased(orm.MetaDataObject) + + # query for hashes w/o metadata + selwomd = select(mdofile.data).where( + # we are 
operating on files + mdofile.type == 'file', + # we get all the hashes for the files + mdofile.uuid == htfile.uuid, + htfile.hash.notin_( + select(htmd.hash).where( + htmd.uuid == mdomd.uuid, mdomd.type == 'metadata' + ) + ) + ) + + sel = sel.union(selwomd) + else: + sel = sel.where(mdomd.uuid.in_(propobj)) + + sel = sel.execution_options(yield_per=10) + #_debprint('sel:', _getquery(sel, objstr)) with objstr._ses() as session: @@ -2348,12 +2386,26 @@ class _TestCases(unittest.TestCase): _debprint('remaining objs:', repr(fullobjs)) self.fail('Unable to find objects %s in dump' % missing) + def save_db(f): # pragma: no cover + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + try: + return f(self, *args, **kwargs) + except: + shutil.copyfile(self.storefname, + '/tmp/failure.sqlite3') + raise + + return wrapper + + #@save_db def run_command_file(self, f): with open(f) as fp: cmds = json.load(fp) # setup object store storefname = self.tempdir / 'storefname' + self.storefname = storefname identfname = self.tempdir / 'identfname' cachefname = self.tempdir / 'cachefname' @@ -2474,7 +2526,7 @@ class _TestCases(unittest.TestCase): with self.assertRaises(SystemExit) as cm: try: main() - except AssertionError: + except AssertionError: # pragma: no cover # used to nab a copy of the # failed sqlite3 db if False: