From 0865595d3ae811c201cd0cecc523c3a5f1eedff0 Mon Sep 17 00:00:00 2001 From: John-Mark Gurney Date: Tue, 23 Aug 2022 16:51:36 -0700 Subject: [PATCH] convert to iterating via files instead of pieces. This uses an index to quickly look up which pieces are part of a file, and then checks that they are all valid. This should be faster, as it is likely that the torrent has more pieces than files (few large files, vs many, many small files). --- __init__.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/__init__.py b/__init__.py index 2170be4..e646c0d 100644 --- a/__init__.py +++ b/__init__.py @@ -21,8 +21,16 @@ class Storage: self._buildindex() def _filepaths(self): + '''Iterates over all the files in the torrent. + + Each item is a tuple of: + array of file path components (undecoded) + a pathlib.PurePath for the file + a pathlib.Path for file on disk + ''' + for curfile in self._files: - fname = pathlib.Path( + fname = pathlib.PurePath( *(x.decode(self._encoding) for x in curfile['path'])) curfilepath = self._rootpath / fname @@ -34,7 +42,8 @@ class Storage: yield curfilepath def _buildindex(self): - self._index = [] + self._pieceindex = [] + self._fileindex = {} files = self._filepaths() left = 0 curfile = None @@ -50,7 +59,9 @@ class Storage: if left == 0: current = [] - self._index.append(current) + self._fileindex.setdefault(fname, + []).append(len(self._pieceindex)) + self._pieceindex.append(current) left = self._piecelen sz = min(curfile['length'] - curfileoff, left) @@ -61,12 +72,15 @@ class Storage: curfileoff += sz left -= sz + def filepieces(self): + return self._fileindex.items() + def filesforpiece(self, idx): - for x in self._index[idx]: + for x in self._pieceindex[idx]: yield x['file'] def apply_piece(self, idx, fun): - for i in self._index[idx]: + for i in self._pieceindex[idx]: with open(i['file'], 'rb') as fp: fp.seek(i['offset']) fun(fp.read(i['size'])) @@ -83,7 +97,8 @@ def 
validate(torrent, basedir): torrentdir = basedir / info['name'].decode(encoding) - stor = Storage(torrentdir, info['files'], info['piece length'], encoding) + stor = Storage(torrentdir, info['files'], info['piece length'], + encoding) pieces = info['pieces'] piecescnt = len(pieces) // 20 @@ -102,10 +117,8 @@ def validate(torrent, basedir): # if any piece of a file is bad, it's bad allfiles = set(stor.allfiles()) - badpieces = [ x for x, v in enumerate(valid) if not v ] - - badfiles = reduce(set.__or__, (set(stor.filesforpiece(x)) for x in - badpieces), set()) + badfiles = { torrentdir / x for x, y in stor.filepieces() if + not all(valid[i] for i in y) } return allfiles - badfiles, badfiles @@ -198,5 +211,7 @@ class _TestCases(unittest.TestCase): val, inval = validate(self.torrent, self.basetempdir) - self.assertEqual(set(val), { sd / x for x in missingfiles.keys() if x not in badfiles }) - self.assertEqual(set(inval), { sd / x for x in badfiles.keys() }) + self.assertEqual(set(val), { sd / x for x in + missingfiles.keys() if x not in badfiles }) + self.assertEqual(set(inval), { sd / x for x in + badfiles.keys() })