From 92928497737af6115579f66afaa7924b350f8cdc Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Tue, 5 Oct 2021 13:55:38 +0100 Subject: [PATCH 01/11] initial commit of indexmgr --- pyterrier_colbert/indexing.py | 42 ++++++++++++++++++++++++++++++----- pyterrier_colbert/ranking.py | 35 +++++++++++++++++++++++++++++ tests/test_indexing.py | 24 +++++++++++--------- 3 files changed, 85 insertions(+), 16 deletions(-) diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py index 46bac41..15841a9 100644 --- a/pyterrier_colbert/indexing.py +++ b/pyterrier_colbert/indexing.py @@ -41,8 +41,31 @@ DEBUG=False +class TorchStorageIndexManager(IndexManager): + """ + A ColBERT IndexManager for torch.HalfStorage, which support mmap + """ + + def save(self, tensor, output_file): + output_file = output_file.replace(".pt", ".store") + size = tensor.shape[0] * tensor.shape[1] + out_tensor = torch.HalfStorage.from_file(output_file, True, size) + torch.HalfTensor(out_tensor).copy_(tensor.view(-1)) + +class NumpyIndexManager(IndexManager): + """ + A ColBERT IndexManager for numpy files, which support both mmap and direct loading + """ + def save(self, tensor, output_file): + import numpy as np + output_file = output_file.replace(".pt", ".npm") + memmap = np.memmap(output_file, dtype=np.uint64, mode='w+', shape=tensor.shape) + memmap[ : ] = tensor[ : ] + memmap.flush() + del(memmap) + class CollectionEncoder(): - def __init__(self, args, process_idx, num_processes): + def __init__(self, args, process_idx, num_processes, indexmgr=None): self.args = args self.collection = args.collection self.process_idx = process_idx @@ -68,7 +91,13 @@ def __init__(self, args, process_idx, num_processes): self.print_main(f"#> self.possible_subset_sizes = {self.possible_subset_sizes}") self._load_model() - self.indexmgr = IndexManager(args.dim) + + if indexmgr == 'numpy': + self.indexmgr = NumpyIndexManager(args.dim) + elif indexmgr == 'half': + self.indexmgr = TorchStorageIndexManager(args.dim) + else: + self.indexmgr = IndexManager(args.dim) def _initialize_iterator(self): return open(self.collection) @@ -449,7 +478,7 @@ def merge_colbert_files(src_dirs, dst_dir): """Re-count and sym-link ColBERT index files in src_dirs folders into a unified ColBERT index in dst_dir folder""" - FILE_PATTERNS = ["%d.pt", "%d.sample", "%d.tokenids", "doclens.%d.json"] + FILE_PATTERNS = ["%d.pt", "%d.store", "%d.np", "%d.sample", "%d.tokenids", "doclens.%d.json"] src_sizes = [count_parts(d) for d in src_dirs] @@ -457,9 +486,10 @@ def merge_colbert_files(src_dirs, dst_dir): for src_size, src_dir in zip(src_sizes, src_dirs): for i in range(src_size): for file in FILE_PATTERNS: - src_file = os.path.join(src_dir, file % i) - dst_file = os.path.join(dst_dir, file % (offset + i)) - os.symlink(src_file, dst_file) + if os.path.exists(src_file): + src_file = os.path.join(src_dir, file % i) + dst_file = os.path.join(dst_dir, file % (offset + i)) + os.symlink(src_file, dst_file) offset += src_size def make_new_faiss(index_root, index_name, **kwargs): diff --git a/pyterrier_colbert/ranking.py b/pyterrier_colbert/ranking.py index 072e61a..905d0a2 100644 --- a/pyterrier_colbert/ranking.py +++ b/pyterrier_colbert/ranking.py @@ -55,6 +55,37 @@ def get_embedding(self, pid): endpos = self.endpos[pid] return self.mmap[startpos:endpos,:] +class numpy_file_part_mmap: + def __init__(self, file_path, file_doclens): + self.dim = 128 # TODO + file_path = file_path.replace(".pt", ".np") + self.doclens = file_doclens + self.endpos = np.cumsum(self.doclens) + self.startpos = self.endpos - self.doclens + import numpy as np + self.mmap = torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r')) + print(self.mmap.shape) + + def get_embedding(self, pid): + startpos = self.startpos[pid] + endpos = self.endpos[pid] + return self.mmap[startpos:endpos,:] + +class numpy_file_part_mem: + def __init__(self, file_path, file_doclens): + self.dim = 128 # TODO + file_path = file_path.replace(".pt", ".np") + self.doclens = file_doclens + self.endpos = np.cumsum(self.doclens) + self.startpos = self.endpos - self.doclens + import numpy as np + self.mmap = torch.from_numpy(np.load(file_path, dtype=np.uint64, mode='r')) + print(self.mmap.shape) + + def get_embedding(self, pid): + startpos = self.startpos[pid] + endpos = self.endpos[pid] + return self.mmap[startpos:endpos,:] class Object(object): pass @@ -101,6 +132,10 @@ def _load_parts(index_path, part_doclens, memtype="mmap"): mmaps = [file_part_mmap(path, doclens) for path, doclens in zip(all_parts_paths, part_doclens)] elif memtype == "mem": mmaps = [file_part_mem(path, doclens) for path, doclens in tqdm(zip(all_parts_paths, part_doclens), total=len(all_parts_paths), desc="Loading index shards to memory", unit="shard")] + elif memtype == "numpy": + mmaps = [numpy_file_part_mem(path, doclens) for path, doclens in tqdm(zip(all_parts_paths, part_doclens), total=len(all_parts_paths), desc="Loading index shards to memory", unit="shard")] + elif memtype == "numpy_mmap": + mmaps = [numpy_file_part_mmap(path, doclens) for path, doclens in tqdm(zip(all_parts_paths, part_doclens), total=len(all_parts_paths), desc="Loading index shards to memory", unit="shard")] else: assert False, "Unknown memtype %s" % memtype return mmaps diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 5211050..2bde6dd 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -4,7 +4,7 @@ CHECKPOINT="http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip" class TestIndexing(unittest.TestCase): - def _indexing_1doc(self, indexmgr): + def _indexing_1doc(self, indexmgr, indexread): #minimum test case size is 100 docs, 40 Wordpiece tokens, and nx > k. we found 200 worked import pyterrier as pt from pyterrier_colbert.indexing import ColBERTIndexer @@ -13,7 +13,7 @@ def _indexing_1doc(self, indexmgr): CHECKPOINT, os.path.dirname(self.test_dir),os.path.basename(self.test_dir), chunksize=3, - #indexmgr=indexmgr, + indexmgr=indexmgr, gpu=False) iter = pt.get_dataset("vaswani").get_corpus_iter() @@ -22,6 +22,7 @@ def _indexing_1doc(self, indexmgr): import pyterrier_colbert.pruning as pruning for factory in [indexer.ranking_factory()]: + factory.memtype = indexread for pipe, has_score, name in [ (factory.end_to_end(), True, "E2E"), @@ -58,12 +59,6 @@ def _indexing_1doc(self, indexmgr): else: self.assertFalse("score" in dfOut.columns) - # def test_indexing_1doc_numpy(self): - # self._indexing_1doc('numpy') - - # def test_indexing_1doc_half(self): - # self._indexing_1doc('half') - def indexing_empty(self): #minimum test case size is 100 docs, 40 Wordpiece tokens, and nx > k. we found 200 worked import pyterrier as pt @@ -108,8 +103,17 @@ def indexing_merged(self): factory = ColBERTFactory(CHECKPOINT, index_root, "index_part", faiss_partitions=100, gpu=False) self.assertEqual(400, len(factory.docid2docno)) - def test_indexing_1doc_torch(self): - self._indexing_1doc('torch') + def test_indexing_1doc_torch_mem(self): + self._indexing_1doc('torch', "mem") + + def test_indexing_1doc_torch_mem(self): + self._indexing_1doc('half', "mmap") + + def test_indexing_1doc_numpy(self): + self._indexing_1doc('numpy', 'numpy') + + def test_indexing_1doc_numpy_mmap(self): + self._indexing_1doc('numpy', 'numpy_mmap') def setUp(self): import pyterrier as pt From a6a8a15ee03d86e2e78103355317865b77671da4 Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Tue, 5 Oct 2021 14:04:58 +0100 Subject: [PATCH 02/11] pass through indexmgr --- pyterrier_colbert/indexing.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py index 15841a9..d50594e 100644 --- a/pyterrier_colbert/indexing.py +++ b/pyterrier_colbert/indexing.py @@ -254,8 +254,8 @@ class Object(object): class CollectionEncoder_Generator(CollectionEncoder): - def __init__(self, *args, prepend_title=False): - super().__init__(*args) + def __init__(self, *args, prepend_title=False, **kwargs): + super().__init__(*args, **kwargs) self.prepend_title = prepend_title def _initialize_iterator(self): @@ -282,7 +282,7 @@ def _preprocess_batch(self, offset, lines): class ColBERTIndexer(IterDictIndexerBase): - def __init__(self, checkpoint, index_root, index_name, chunksize, prepend_title=False, num_docs=None, ids=True, gpu=True): + def __init__(self, checkpoint, index_root, index_name, chunksize, prepend_title=False, num_docs=None, ids=True, gpu=True, indexmgr='None'): args = Object() args.similarity = 'cosine' args.dim = 128 @@ -310,6 +310,7 @@ def __init__(self, checkpoint, index_root, index_name, chunksize, prepend_title= self.prepend_title = prepend_title self.num_docs = num_docs self.gpu = gpu + self.indexmgr = indexmgr if not gpu: warn("Gpu disabled, YMMV") import colbert.parameters @@ -348,7 +349,7 @@ def convert_gen(iterator): docid+=1 yield l self.args.generator = convert_gen(iterator) - ceg = CollectionEncoderIds(self.args,0,1) if self.ids else CollectionEncoder_Generator(self.args,0,1) + ceg = CollectionEncoderIds(self.args,0,1, indexmgr=self.indexmgr) if self.ids else CollectionEncoder_Generator(self.args,0,1, indexmgr=self.indexmgr) create_directory(self.args.index_root) create_directory(self.args.index_path) From a5391dfdb0b1bd4a465e33233aeb5ff419da828c Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Tue, 5 Oct 2021 14:50:03 +0100 Subject: [PATCH 03/11] monkey patching for numpy --- pyterrier_colbert/indexing.py | 14 ++++++++++++++ tests/test_indexing.py | 4 ++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py index d50594e..cbac1ce 100644 --- a/pyterrier_colbert/indexing.py +++ b/pyterrier_colbert/indexing.py @@ -41,6 +41,18 @@ DEBUG=False +def load_index_part_torch(filename, verbose=True): + mmap_storage = torch.HalfStorage.from_file(file_path, False, sum(self.doclens) * self.dim) + return torch.HalfTensor(mmap_storage).view(sum(self.doclens), self.dim) + +def load_index_part_torchhalf(filename, verbose=True): + return torch.load(filename) + +def load_index_part_numpy(filename): + filename = filename.replace(".pt", ".np") + #torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r')) + return torch.from_numpy(np.load(filename, dtype=np.uint64, mode='r')) + class TorchStorageIndexManager(IndexManager): """ A ColBERT IndexManager for torch.HalfStorage, which support mmap @@ -94,6 +106,8 @@ def __init__(self, args, process_idx, num_processes, indexmgr=None): if indexmgr == 'numpy': self.indexmgr = NumpyIndexManager(args.dim) + import colbert.indexing.index_manager + colbert.indexing.index_manager.load_index_part = load_index_part_numpy elif indexmgr == 'half': self.indexmgr = TorchStorageIndexManager(args.dim) else: diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 2bde6dd..a6802a9 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -106,8 +106,8 @@ def indexing_merged(self): def test_indexing_1doc_torch_mem(self): self._indexing_1doc('torch', "mem") - def test_indexing_1doc_torch_mem(self): - self._indexing_1doc('half', "mmap") + # def test_indexing_1doc_torch_mem(self): + # self._indexing_1doc('half', "mmap") def test_indexing_1doc_numpy(self): self._indexing_1doc('numpy', 'numpy') From 495d6ef483ccdc0cf862f26973b6d3ef3ae7aa07 Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Tue, 5 Oct 2021 15:10:23 +0100 Subject: [PATCH 04/11] change dtype for numpy --- pyterrier_colbert/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py index cbac1ce..f50a997 100644 --- a/pyterrier_colbert/indexing.py +++ b/pyterrier_colbert/indexing.py @@ -51,7 +51,7 @@ def load_index_part_torchhalf(filename, verbose=True): def load_index_part_numpy(filename): filename = filename.replace(".pt", ".np") #torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r')) - return torch.from_numpy(np.load(filename, dtype=np.uint64, mode='r')) + return torch.from_numpy(np.load(filename, dtype=np.float16, mode='r')) class TorchStorageIndexManager(IndexManager): """ @@ -71,7 +71,7 @@ class NumpyIndexManager(IndexManager): def save(self, tensor, output_file): import numpy as np output_file = output_file.replace(".pt", ".npm") - memmap = np.memmap(output_file, dtype=np.uint64, mode='w+', shape=tensor.shape) + memmap = np.memmap(output_file, dtype=np.float16, mode='w+', shape=tensor.shape) memmap[ : ] = tensor[ : ] memmap.flush() del(memmap) From 1cdf6bcd8aba96efdff68f5caa9057c4354246fc Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Tue, 5 Oct 2021 16:30:23 +0100 Subject: [PATCH 05/11] check for other exts --- pyterrier_colbert/indexing.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py index f50a997..392efdd 100644 --- a/pyterrier_colbert/indexing.py +++ b/pyterrier_colbert/indexing.py @@ -41,6 +41,28 @@ DEBUG=False +def get_parts_ext(directory): + #extension of get_parts to check for other file types + extensions = ['.pt', '.np', '.store'] + + parts=[] + for extension in extensions: + parts = sorted([int(filename[: -1 * len(extension)]) for filename in os.listdir(directory) + if filename.endswith(extension)]) + if len(parts) > 0: + print("Found index files with ext %s" % extension) + break + if len(parts) == 0: + raise ValueError("found no index embeddings files") + + assert list(range(len(parts))) == parts, parts + + # Integer-sortedness matters. + parts_paths = [os.path.join(directory, '{}{}'.format(filename, extension)) for filename in parts] + samples_paths = [os.path.join(directory, '{}.sample'.format(filename)) for filename in parts] + + return parts, parts_paths, samples_paths + def load_index_part_torch(filename, verbose=True): mmap_storage = torch.HalfStorage.from_file(file_path, False, sum(self.doclens) * self.dim) return torch.HalfTensor(mmap_storage).view(sum(self.doclens), self.dim) @@ -108,6 +130,9 @@ def __init__(self, args, process_idx, num_processes, indexmgr=None): self.indexmgr = NumpyIndexManager(args.dim) import colbert.indexing.index_manager colbert.indexing.index_manager.load_index_part = load_index_part_numpy + import colbert.indexing.loaders + colbert.indexing.loaders.get_parts = get_parts_ext + elif indexmgr == 'half': self.indexmgr = TorchStorageIndexManager(args.dim) else: From 5bd40ea151cb50babf89eff975d0d49f404d1c81 Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Tue, 5 Oct 2021 18:35:04 +0100 Subject: [PATCH 06/11] more work on numpy index format --- pyterrier_colbert/indexing.py | 32 ++++++++++++++++---------------- pyterrier_colbert/ranking.py | 4 ++-- tests/test_indexing.py | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py index 392efdd..5d53335 100644 --- a/pyterrier_colbert/indexing.py +++ b/pyterrier_colbert/indexing.py @@ -46,11 +46,12 @@ def get_parts_ext(directory): extensions = ['.pt', '.np', '.store'] parts=[] - for extension in extensions: - parts = sorted([int(filename[: -1 * len(extension)]) for filename in os.listdir(directory) - if filename.endswith(extension)]) + for ext in extensions: + parts = sorted([int(filename[: -1 * len(ext)]) for filename in os.listdir(directory) + if filename.endswith(ext)]) if len(parts) > 0: - print("Found index files with ext %s" % extension) + extension = ext + print("Found %d index files with ext %s" % (len(parts), extension)) break if len(parts) == 0: raise ValueError("found no index embeddings files") @@ -60,7 +61,6 @@ def get_parts_ext(directory): # Integer-sortedness matters. parts_paths = [os.path.join(directory, '{}{}'.format(filename, extension)) for filename in parts] samples_paths = [os.path.join(directory, '{}.sample'.format(filename)) for filename in parts] - return parts, parts_paths, samples_paths def load_index_part_torch(filename, verbose=True): @@ -73,7 +73,7 @@ def load_index_part_torchhalf(filename, verbose=True): def load_index_part_numpy(filename): filename = filename.replace(".pt", ".np") #torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r')) - return torch.from_numpy(np.load(filename, dtype=np.float16, mode='r')) + return torch.from_numpy(np.load(filename, mode='r')) class TorchStorageIndexManager(IndexManager): """ @@ -92,11 +92,12 @@ class NumpyIndexManager(IndexManager): """ def save(self, tensor, output_file): import numpy as np - output_file = output_file.replace(".pt", ".npm") - memmap = np.memmap(output_file, dtype=np.float16, mode='w+', shape=tensor.shape) - memmap[ : ] = tensor[ : ] - memmap.flush() - del(memmap) + output_file = output_file.replace(".pt", ".np") + np.save(output_file, tensor.detach().numpy()) + #memmap = np.memmap(output_file, dtype=np.float16, mode='w+', shape=tensor.shape) + #memmap[ : ] = tensor[ : ] + #memmap.flush() + #del(memmap) class CollectionEncoder(): def __init__(self, args, process_idx, num_processes, indexmgr=None): @@ -128,11 +129,10 @@ def __init__(self, args, process_idx, num_processes, indexmgr=None): if indexmgr == 'numpy': self.indexmgr = NumpyIndexManager(args.dim) - import colbert.indexing.index_manager - colbert.indexing.index_manager.load_index_part = load_index_part_numpy - import colbert.indexing.loaders - colbert.indexing.loaders.get_parts = get_parts_ext - + import colbert.indexing.index_manager, colbert.indexing.loaders, colbert.indexing.faiss + colbert.indexing.faiss.load_index_part = colbert.indexing.index_manager.load_index_part = load_index_part_numpy + colbert.indexing.faiss.get_parts = colbert.indexing.loaders.get_parts = get_parts_ext + elif indexmgr == 'half': self.indexmgr = TorchStorageIndexManager(args.dim) else: diff --git a/pyterrier_colbert/ranking.py b/pyterrier_colbert/ranking.py index 905d0a2..eb402e4 100644 --- a/pyterrier_colbert/ranking.py +++ b/pyterrier_colbert/ranking.py @@ -63,7 +63,7 @@ def __init__(self, file_path, file_doclens): self.endpos = np.cumsum(self.doclens) self.startpos = self.endpos - self.doclens import numpy as np - self.mmap = torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r')) + self.mmap = torch.from_numpy(np.load(file_path, mmap_mode='r')) print(self.mmap.shape) def get_embedding(self, pid): @@ -79,7 +79,7 @@ def __init__(self, file_path, file_doclens): self.endpos = np.cumsum(self.doclens) self.startpos = self.endpos - self.doclens import numpy as np - self.mmap = torch.from_numpy(np.load(file_path, dtype=np.uint64, mode='r')) + self.mmap = torch.from_numpy(np.load(file_path)) print(self.mmap.shape) def get_embedding(self, pid): diff --git a/tests/test_indexing.py b/tests/test_indexing.py index a6802a9..93a2d6a 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -109,7 +109,7 @@ def test_indexing_1doc_torch_mem(self): # def test_indexing_1doc_torch_mem(self): # self._indexing_1doc('half', "mmap") - def test_indexing_1doc_numpy(self): + def test_indexing_1doc_numpy_mem(self): self._indexing_1doc('numpy', 'numpy') def test_indexing_1doc_numpy_mmap(self): From 671d0daa0ad8eed20c11ca862445e18a9853f865 Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Tue, 5 Oct 2021 20:41:42 +0100 Subject: [PATCH 07/11] fix for numpy.load --- pyterrier_colbert/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py index 5d53335..012bf61 100644 --- a/pyterrier_colbert/indexing.py +++ b/pyterrier_colbert/indexing.py @@ -73,7 +73,7 @@ def load_index_part_torchhalf(filename, verbose=True): def load_index_part_numpy(filename): filename = filename.replace(".pt", ".np") #torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r')) - return torch.from_numpy(np.load(filename, mode='r')) + return torch.from_numpy(np.load(filename)) class TorchStorageIndexManager(IndexManager): """ From 05fc0e67a847f1f019a852e094be527a3b0c9ed1 Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Tue, 5 Oct 2021 22:04:01 +0100 Subject: [PATCH 08/11] fixes for numpy --- pyterrier_colbert/indexing.py | 13 +++++++------ pyterrier_colbert/ranking.py | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py index 012bf61..1eb8259 100644 --- a/pyterrier_colbert/indexing.py +++ b/pyterrier_colbert/indexing.py @@ -43,7 +43,7 @@ def get_parts_ext(directory): #extension of get_parts to check for other file types - extensions = ['.pt', '.np', '.store'] + extensions = ['.pt', '.npy', '.store'] parts=[] for ext in extensions: @@ -71,7 +71,7 @@ def load_index_part_torchhalf(filename, verbose=True): return torch.load(filename) def load_index_part_numpy(filename): - filename = filename.replace(".pt", ".np") + filename = filename.replace(".pt", ".npy") #torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r')) return torch.from_numpy(np.load(filename)) @@ -92,7 +92,7 @@ class NumpyIndexManager(IndexManager): """ def save(self, tensor, output_file): import numpy as np - output_file = output_file.replace(".pt", ".np") + output_file = output_file.replace(".pt", ".npy") np.save(output_file, tensor.detach().numpy()) #memmap = np.memmap(output_file, dtype=np.float16, mode='w+', shape=tensor.shape) #memmap[ : ] = tensor[ : ] @@ -130,12 +130,13 @@ def __init__(self, args, process_idx, num_processes, indexmgr=None): if indexmgr == 'numpy': self.indexmgr = NumpyIndexManager(args.dim) import colbert.indexing.index_manager, colbert.indexing.loaders, colbert.indexing.faiss - colbert.indexing.faiss.load_index_part = colbert.indexing.index_manager.load_index_part = load_index_part_numpy - colbert.indexing.faiss.get_parts = colbert.indexing.loaders.get_parts = get_parts_ext - + colbert.indexing.faiss.load_index_part = load_index_part_numpy + colbert.indexing.faiss.get_parts = colbert.indexing.loaders.get_parts = get_parts_ext elif indexmgr == 'half': + assert False self.indexmgr = TorchStorageIndexManager(args.dim) else: + colbert.indexing.faiss.load_index_part = colbert.indexing.index_manager.load_index_part self.indexmgr = IndexManager(args.dim) def _initialize_iterator(self): diff --git a/pyterrier_colbert/ranking.py b/pyterrier_colbert/ranking.py index eb402e4..c2baa88 100644 --- a/pyterrier_colbert/ranking.py +++ b/pyterrier_colbert/ranking.py @@ -63,7 +63,7 @@ def __init__(self, file_path, file_doclens): self.endpos = np.cumsum(self.doclens) self.startpos = self.endpos - self.doclens import numpy as np - self.mmap = torch.from_numpy(np.load(file_path, mmap_mode='r')) + self.mmap = torch.from_numpy(np.load(file_path, mmap_mode='r+')) print(self.mmap.shape) def get_embedding(self, pid): @@ -74,7 +74,7 @@ def get_embedding(self, pid): class numpy_file_part_mem: def __init__(self, file_path, file_doclens): self.dim = 128 # TODO - file_path = file_path.replace(".pt", ".np") + file_path = file_path.replace(".pt", ".npy") self.doclens = file_doclens self.endpos = np.cumsum(self.doclens) self.startpos = self.endpos - self.doclens From 72dcf739baf38a173ccb2c78831811dde50ad1b9 Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Tue, 5 Oct 2021 22:16:44 +0100 Subject: [PATCH 09/11] more fixes --- pyterrier_colbert/indexing.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py index 1eb8259..d849866 100644 --- a/pyterrier_colbert/indexing.py +++ b/pyterrier_colbert/indexing.py @@ -47,6 +47,8 @@ def get_parts_ext(directory): parts=[] for ext in extensions: + print([filename for filename in os.listdir(directory)]) + print([filename for filename in os.listdir(directory) if filename.endswith(ext)]) parts = sorted([int(filename[: -1 * len(ext)]) for filename in os.listdir(directory) if filename.endswith(ext)]) if len(parts) > 0: @@ -54,7 +56,7 @@ def get_parts_ext(directory): print("Found %d index files with ext %s" % (len(parts), extension)) break if len(parts) == 0: - raise ValueError("found no index embeddings files") + raise ValueError("found no index embedding files") assert list(range(len(parts))) == parts, parts @@ -127,15 +129,16 @@ def __init__(self, args, process_idx, num_processes, indexmgr=None): self._load_model() + import colbert.indexing.index_manager, colbert.indexing.loaders, colbert.indexing.faiss if indexmgr == 'numpy': self.indexmgr = NumpyIndexManager(args.dim) - import colbert.indexing.index_manager, colbert.indexing.loaders, colbert.indexing.faiss colbert.indexing.faiss.load_index_part = load_index_part_numpy - colbert.indexing.faiss.get_parts = colbert.indexing.loaders.get_parts = get_parts_ext + colbert.indexing.faiss.get_parts = colbert.indexing.loaders.get_parts = get_parts_ext elif indexmgr == 'half': assert False self.indexmgr = TorchStorageIndexManager(args.dim) else: + colbert.indexing.faiss.get_parts = colbert.indexing.loaders.get_parts = get_parts_ext colbert.indexing.faiss.load_index_part = colbert.indexing.index_manager.load_index_part self.indexmgr = IndexManager(args.dim) From 196775d470fb026b21c2e9896d06edd7134a7a17 Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Tue, 5 Oct 2021 22:43:00 +0100 Subject: [PATCH 10/11] resort to super for other file types --- pyterrier_colbert/indexing.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py index d849866..3782852 100644 --- a/pyterrier_colbert/indexing.py +++ b/pyterrier_colbert/indexing.py @@ -83,6 +83,9 @@ class TorchStorageIndexManager(IndexManager): """ def save(self, tensor, output_file): + if not output_file.endswith(".pt"): + # for .ids, .sample etc, resort to torch.save + return super().save(tensor, output_file) output_file = output_file.replace(".pt", ".store") size = tensor.shape[0] * tensor.shape[1] out_tensor = torch.HalfStorage.from_file(output_file, True, size) @@ -93,6 +96,9 @@ class NumpyIndexManager(IndexManager): A ColBERT IndexManager for numpy files, which support both mmap and direct loading """ def save(self, tensor, output_file): + if not output_file.endswith(".pt"): + # for .ids, .sample etc, resort to torch.save + return super().save(tensor, output_file) import numpy as np output_file = output_file.replace(".pt", ".npy") np.save(output_file, tensor.detach().numpy()) From 53369182b8179715634626137cd6a004ae4643c5 Mon Sep 17 00:00:00 2001 From: Craig Macdonald Date: Tue, 5 Oct 2021 22:55:37 +0100 Subject: [PATCH 11/11] more numpy work - resort to torch for sample, tokenids etc --- pyterrier_colbert/indexing.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py index 3782852..6962cf8 100644 --- a/pyterrier_colbert/indexing.py +++ b/pyterrier_colbert/indexing.py @@ -73,9 +73,12 @@ def load_index_part_torchhalf(filename, verbose=True): return torch.load(filename) def load_index_part_numpy(filename): - filename = filename.replace(".pt", ".npy") - #torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r')) - return torch.from_numpy(np.load(filename)) + if filename.endswith(".pt"): + filename = filename.replace(".pt", ".npy") + return torch.from_numpy(np.load(filename)) + else: + #resort to torch for sample, etc + return torch.load(filename) class TorchStorageIndexManager(IndexManager): """