From 92928497737af6115579f66afaa7924b350f8cdc Mon Sep 17 00:00:00 2001
From: Craig Macdonald <craig.macdonald@glasgow.ac.uk>
Date: Tue, 5 Oct 2021 13:55:38 +0100
Subject: [PATCH 01/11] initial commit of indexmgr

---
 pyterrier_colbert/indexing.py | 42 ++++++++++++++++++++++++++++++-----
 pyterrier_colbert/ranking.py  | 35 +++++++++++++++++++++++++++++
 tests/test_indexing.py        | 24 +++++++++++---------
 3 files changed, 85 insertions(+), 16 deletions(-)

diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py
index 46bac41..15841a9 100644
--- a/pyterrier_colbert/indexing.py
+++ b/pyterrier_colbert/indexing.py
@@ -41,8 +41,31 @@
 
 DEBUG=False
 
+class TorchStorageIndexManager(IndexManager):
+    """
+    A ColBERT IndexManager for torch.HalfStorage, which support mmap
+    """
+
+    def save(self, tensor, output_file):
+        output_file = output_file.replace(".pt", ".store")
+        size = tensor.shape[0] * tensor.shape[1]
+        out_tensor = torch.HalfStorage.from_file(output_file, True, size)
+        torch.HalfTensor(out_tensor).copy_(tensor.view(-1))
+
+class NumpyIndexManager(IndexManager):
+    """
+    A ColBERT IndexManager for numpy files, which support both mmap and direct loading
+    """
+    def save(self, tensor, output_file):
+        import numpy as np
+        output_file = output_file.replace(".pt", ".npm")
+        memmap = np.memmap(output_file, dtype=np.uint64, mode='w+', shape=tensor.shape)
+        memmap[ : ] = tensor[ : ]
+        memmap.flush()
+        del(memmap)
+
 class CollectionEncoder():
-    def __init__(self, args, process_idx, num_processes):
+    def __init__(self, args, process_idx, num_processes, indexmgr=None):
         self.args = args
         self.collection = args.collection
         self.process_idx = process_idx
@@ -68,7 +91,13 @@ def __init__(self, args, process_idx, num_processes):
         self.print_main(f"#> self.possible_subset_sizes = {self.possible_subset_sizes}")
 
         self._load_model()
-        self.indexmgr = IndexManager(args.dim)
+    
+        if indexmgr == 'numpy':
+            self.indexmgr = NumpyIndexManager(args.dim)
+        elif indexmgr == 'half':
+            self.indexmgr = TorchStorageIndexManager(args.dim)
+        else:
+            self.indexmgr = IndexManager(args.dim)
 
     def _initialize_iterator(self):
         return open(self.collection)
@@ -449,7 +478,7 @@ def merge_colbert_files(src_dirs, dst_dir):
         """Re-count and sym-link ColBERT index files in src_dirs folders into
         a unified ColBERT index in dst_dir folder"""
         
-        FILE_PATTERNS = ["%d.pt", "%d.sample", "%d.tokenids", "doclens.%d.json"]
+        FILE_PATTERNS = ["%d.pt", "%d.store", "%d.npy", "%d.sample", "%d.tokenids", "doclens.%d.json"]  # one embedding ext per index format
         
         src_sizes = [count_parts(d) for d in src_dirs]
         
@@ -457,9 +486,10 @@ def merge_colbert_files(src_dirs, dst_dir):
         for src_size, src_dir in zip(src_sizes, src_dirs):
             for i in range(src_size):
                 for file in FILE_PATTERNS:
-                    src_file = os.path.join(src_dir, file % i)
-                    dst_file = os.path.join(dst_dir, file % (offset + i))
-                    os.symlink(src_file, dst_file)
+                    src_file = os.path.join(src_dir, file % i)
+                    if os.path.exists(src_file):  # each index format only writes a subset of FILE_PATTERNS
+                        dst_file = os.path.join(dst_dir, file % (offset + i))
+                        os.symlink(src_file, dst_file)
             offset += src_size
 
     def make_new_faiss(index_root, index_name, **kwargs):
diff --git a/pyterrier_colbert/ranking.py b/pyterrier_colbert/ranking.py
index 072e61a..905d0a2 100644
--- a/pyterrier_colbert/ranking.py
+++ b/pyterrier_colbert/ranking.py
@@ -55,6 +55,37 @@ def get_embedding(self, pid):
         endpos = self.endpos[pid]
         return self.mmap[startpos:endpos,:]
 
+class numpy_file_part_mmap:
+    def __init__(self, file_path, file_doclens):
+        self.dim = 128 # TODO        
+        file_path = file_path.replace(".pt", ".npy")  # numpy shards are stored with the .npy extension
+        self.doclens = file_doclens
+        self.endpos = np.cumsum(self.doclens)
+        self.startpos = self.endpos - self.doclens
+        import numpy as np
+        self.mmap = torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r'))
+        print(self.mmap.shape)
+ 
+    def get_embedding(self, pid):
+        startpos = self.startpos[pid]
+        endpos = self.endpos[pid]
+        return self.mmap[startpos:endpos,:]
+
+class numpy_file_part_mem:
+    def __init__(self, file_path, file_doclens):
+        self.dim = 128 # TODO        
+        file_path = file_path.replace(".pt", ".np")
+        self.doclens = file_doclens
+        self.endpos = np.cumsum(self.doclens)
+        self.startpos = self.endpos - self.doclens
+        import numpy as np
+        self.mmap = torch.from_numpy(np.load(file_path, dtype=np.uint64, mode='r'))
+        print(self.mmap.shape)
+ 
+    def get_embedding(self, pid):
+        startpos = self.startpos[pid]
+        endpos = self.endpos[pid]
+        return self.mmap[startpos:endpos,:]
 
 class Object(object):
     pass
@@ -101,6 +132,10 @@ def _load_parts(index_path, part_doclens, memtype="mmap"):
             mmaps = [file_part_mmap(path, doclens) for path, doclens in zip(all_parts_paths, part_doclens)]
         elif memtype == "mem":
             mmaps = [file_part_mem(path, doclens) for path, doclens in tqdm(zip(all_parts_paths, part_doclens), total=len(all_parts_paths), desc="Loading index shards to memory", unit="shard")]
+        elif memtype == "numpy":
+            mmaps = [numpy_file_part_mem(path, doclens) for path, doclens in tqdm(zip(all_parts_paths, part_doclens), total=len(all_parts_paths), desc="Loading index shards to memory", unit="shard")]
+        elif memtype == "numpy_mmap":
+            mmaps = [numpy_file_part_mmap(path, doclens) for path, doclens in tqdm(zip(all_parts_paths, part_doclens), total=len(all_parts_paths), desc="Loading index shards to memory", unit="shard")]
         else:
             assert False, "Unknown memtype %s" % memtype
         return mmaps
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
index 5211050..2bde6dd 100644
--- a/tests/test_indexing.py
+++ b/tests/test_indexing.py
@@ -4,7 +4,7 @@
 CHECKPOINT="http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip"
 class TestIndexing(unittest.TestCase):
 
-    def _indexing_1doc(self, indexmgr):
+    def _indexing_1doc(self, indexmgr, indexread):
         #minimum test case size is 100 docs, 40 Wordpiece tokens, and nx > k. we found 200 worked
         import pyterrier as pt
         from pyterrier_colbert.indexing import ColBERTIndexer
@@ -13,7 +13,7 @@ def _indexing_1doc(self, indexmgr):
             CHECKPOINT, 
             os.path.dirname(self.test_dir),os.path.basename(self.test_dir), 
             chunksize=3,
-            #indexmgr=indexmgr,
+            indexmgr=indexmgr,
             gpu=False)
 
         iter = pt.get_dataset("vaswani").get_corpus_iter()
@@ -22,6 +22,7 @@ def _indexing_1doc(self, indexmgr):
         import pyterrier_colbert.pruning as pruning
             
         for factory in [indexer.ranking_factory()]:
+            factory.memtype = indexread
 
             for pipe, has_score, name in [
                 (factory.end_to_end(), True, "E2E"),
@@ -58,12 +59,6 @@ def _indexing_1doc(self, indexmgr):
                     else:
                         self.assertFalse("score" in dfOut.columns)
 
-    # def test_indexing_1doc_numpy(self):
-    #     self._indexing_1doc('numpy')
-    
-    # def test_indexing_1doc_half(self):
-    #     self._indexing_1doc('half')
-
     def indexing_empty(self):
         #minimum test case size is 100 docs, 40 Wordpiece tokens, and nx > k. we found 200 worked
         import pyterrier as pt
@@ -108,8 +103,17 @@ def indexing_merged(self):
         factory = ColBERTFactory(CHECKPOINT, index_root, "index_part", faiss_partitions=100, gpu=False)
         self.assertEqual(400, len(factory.docid2docno))
     
-    def test_indexing_1doc_torch(self):
-        self._indexing_1doc('torch')
+    def test_indexing_1doc_torch_mem(self):
+        self._indexing_1doc('torch', "mem")
+
+    def test_indexing_1doc_torch_mem(self):
+        self._indexing_1doc('half', "mmap")
+
+    def test_indexing_1doc_numpy(self):
+        self._indexing_1doc('numpy', 'numpy')
+    
+    def test_indexing_1doc_numpy_mmap(self):
+        self._indexing_1doc('numpy', 'numpy_mmap')
 
     def setUp(self):
         import pyterrier as pt

From a6a8a15ee03d86e2e78103355317865b77671da4 Mon Sep 17 00:00:00 2001
From: Craig Macdonald <craig.macdonald@glasgow.ac.uk>
Date: Tue, 5 Oct 2021 14:04:58 +0100
Subject: [PATCH 02/11] pass through indexmgr

---
 pyterrier_colbert/indexing.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py
index 15841a9..d50594e 100644
--- a/pyterrier_colbert/indexing.py
+++ b/pyterrier_colbert/indexing.py
@@ -254,8 +254,8 @@ class Object(object):
 
 class CollectionEncoder_Generator(CollectionEncoder):
 
-    def __init__(self, *args, prepend_title=False):
-        super().__init__(*args)
+    def __init__(self, *args, prepend_title=False, **kwargs):
+        super().__init__(*args, **kwargs)
         self.prepend_title = prepend_title
 
     def _initialize_iterator(self):
@@ -282,7 +282,7 @@ def _preprocess_batch(self, offset, lines):
 
 
 class ColBERTIndexer(IterDictIndexerBase):
-    def __init__(self, checkpoint, index_root, index_name, chunksize, prepend_title=False, num_docs=None, ids=True, gpu=True):
+    def __init__(self, checkpoint, index_root, index_name, chunksize, prepend_title=False, num_docs=None, ids=True, gpu=True, indexmgr=None):
         args = Object()
         args.similarity = 'cosine'
         args.dim = 128
@@ -310,6 +310,7 @@ def __init__(self, checkpoint, index_root, index_name, chunksize, prepend_title=
         self.prepend_title = prepend_title
         self.num_docs = num_docs
         self.gpu = gpu
+        self.indexmgr = indexmgr
         if not gpu:
             warn("Gpu disabled, YMMV")
             import colbert.parameters
@@ -348,7 +349,7 @@ def convert_gen(iterator):
                 docid+=1
                 yield l              
         self.args.generator = convert_gen(iterator)
-        ceg = CollectionEncoderIds(self.args,0,1) if self.ids else CollectionEncoder_Generator(self.args,0,1)
+        ceg = CollectionEncoderIds(self.args,0,1, indexmgr=self.indexmgr) if self.ids else CollectionEncoder_Generator(self.args,0,1, indexmgr=self.indexmgr)
 
         create_directory(self.args.index_root)
         create_directory(self.args.index_path)

From a5391dfdb0b1bd4a465e33233aeb5ff419da828c Mon Sep 17 00:00:00 2001
From: Craig Macdonald <craig.macdonald@glasgow.ac.uk>
Date: Tue, 5 Oct 2021 14:50:03 +0100
Subject: [PATCH 03/11] monkey patching for numpy

---
 pyterrier_colbert/indexing.py | 14 ++++++++++++++
 tests/test_indexing.py        |  4 ++--
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py
index d50594e..cbac1ce 100644
--- a/pyterrier_colbert/indexing.py
+++ b/pyterrier_colbert/indexing.py
@@ -41,6 +41,18 @@
 
 DEBUG=False
 
+def load_index_part_torch(filename, verbose=True):
+    mmap_storage = torch.HalfStorage.from_file(file_path, False, sum(self.doclens) * self.dim)
+    return torch.HalfTensor(mmap_storage).view(sum(self.doclens), self.dim)
+
+def load_index_part_torchhalf(filename, verbose=True):
+    return torch.load(filename)
+
+def load_index_part_numpy(filename):
+    filename = filename.replace(".pt", ".np")
+    #torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r'))
+    return torch.from_numpy(np.load(filename, dtype=np.uint64, mode='r'))
+
 class TorchStorageIndexManager(IndexManager):
     """
     A ColBERT IndexManager for torch.HalfStorage, which support mmap
@@ -94,6 +106,8 @@ def __init__(self, args, process_idx, num_processes, indexmgr=None):
     
         if indexmgr == 'numpy':
             self.indexmgr = NumpyIndexManager(args.dim)
+            import colbert.indexing.index_manager
+            colbert.indexing.index_manager.load_index_part = load_index_part_numpy
         elif indexmgr == 'half':
             self.indexmgr = TorchStorageIndexManager(args.dim)
         else:
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
index 2bde6dd..a6802a9 100644
--- a/tests/test_indexing.py
+++ b/tests/test_indexing.py
@@ -106,8 +106,8 @@ def indexing_merged(self):
     def test_indexing_1doc_torch_mem(self):
         self._indexing_1doc('torch', "mem")
 
-    def test_indexing_1doc_torch_mem(self):
-        self._indexing_1doc('half', "mmap")
+    # def test_indexing_1doc_torch_mem(self):
+    #     self._indexing_1doc('half', "mmap")
 
     def test_indexing_1doc_numpy(self):
         self._indexing_1doc('numpy', 'numpy')

From 495d6ef483ccdc0cf862f26973b6d3ef3ae7aa07 Mon Sep 17 00:00:00 2001
From: Craig Macdonald <craig.macdonald@glasgow.ac.uk>
Date: Tue, 5 Oct 2021 15:10:23 +0100
Subject: [PATCH 04/11] change dtype for numpy

---
 pyterrier_colbert/indexing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py
index cbac1ce..f50a997 100644
--- a/pyterrier_colbert/indexing.py
+++ b/pyterrier_colbert/indexing.py
@@ -51,7 +51,7 @@ def load_index_part_torchhalf(filename, verbose=True):
 def load_index_part_numpy(filename):
     filename = filename.replace(".pt", ".np")
     #torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r'))
-    return torch.from_numpy(np.load(filename, dtype=np.uint64, mode='r'))
+    return torch.from_numpy(np.load(filename, dtype=np.float16, mode='r'))
 
 class TorchStorageIndexManager(IndexManager):
     """
@@ -71,7 +71,7 @@ class NumpyIndexManager(IndexManager):
     def save(self, tensor, output_file):
         import numpy as np
         output_file = output_file.replace(".pt", ".npm")
-        memmap = np.memmap(output_file, dtype=np.uint64, mode='w+', shape=tensor.shape)
+        memmap = np.memmap(output_file, dtype=np.float16, mode='w+', shape=tensor.shape)
         memmap[ : ] = tensor[ : ]
         memmap.flush()
         del(memmap)

From 1cdf6bcd8aba96efdff68f5caa9057c4354246fc Mon Sep 17 00:00:00 2001
From: Craig Macdonald <craig.macdonald@glasgow.ac.uk>
Date: Tue, 5 Oct 2021 16:30:23 +0100
Subject: [PATCH 05/11] check for other exts

---
 pyterrier_colbert/indexing.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py
index f50a997..392efdd 100644
--- a/pyterrier_colbert/indexing.py
+++ b/pyterrier_colbert/indexing.py
@@ -41,6 +41,28 @@
 
 DEBUG=False
 
+def get_parts_ext(directory):
+    #extension of get_parts to check for other file types
+    extensions = ['.pt', '.np', '.store']
+
+    parts=[]
+    for extension in extensions:
+        parts = sorted([int(filename[: -1 * len(extension)]) for filename in os.listdir(directory)
+                        if filename.endswith(extension)])
+        if len(parts) > 0:
+            print("Found index files with ext %s" % extension)
+            break
+    if len(parts) == 0:
+        raise ValueError("found no index embeddings files")
+
+    assert list(range(len(parts))) == parts, parts
+
+    # Integer-sortedness matters.
+    parts_paths = [os.path.join(directory, '{}{}'.format(filename, extension)) for filename in parts]
+    samples_paths = [os.path.join(directory, '{}.sample'.format(filename)) for filename in parts]
+
+    return parts, parts_paths, samples_paths
+
 def load_index_part_torch(filename, verbose=True):
     mmap_storage = torch.HalfStorage.from_file(file_path, False, sum(self.doclens) * self.dim)
     return torch.HalfTensor(mmap_storage).view(sum(self.doclens), self.dim)
@@ -108,6 +130,9 @@ def __init__(self, args, process_idx, num_processes, indexmgr=None):
             self.indexmgr = NumpyIndexManager(args.dim)
             import colbert.indexing.index_manager
             colbert.indexing.index_manager.load_index_part = load_index_part_numpy
+            import colbert.indexing.loaders
+            colbert.indexing.loaders.get_parts = get_parts_ext
+
         elif indexmgr == 'half':
             self.indexmgr = TorchStorageIndexManager(args.dim)
         else:

From 5bd40ea151cb50babf89eff975d0d49f404d1c81 Mon Sep 17 00:00:00 2001
From: Craig Macdonald <craig.macdonald@glasgow.ac.uk>
Date: Tue, 5 Oct 2021 18:35:04 +0100
Subject: [PATCH 06/11] more work on numpy index format

---
 pyterrier_colbert/indexing.py | 32 ++++++++++++++++----------------
 pyterrier_colbert/ranking.py  |  4 ++--
 tests/test_indexing.py        |  2 +-
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py
index 392efdd..5d53335 100644
--- a/pyterrier_colbert/indexing.py
+++ b/pyterrier_colbert/indexing.py
@@ -46,11 +46,12 @@ def get_parts_ext(directory):
     extensions = ['.pt', '.np', '.store']
 
     parts=[]
-    for extension in extensions:
-        parts = sorted([int(filename[: -1 * len(extension)]) for filename in os.listdir(directory)
-                        if filename.endswith(extension)])
+    for ext in extensions:
+        parts = sorted([int(filename[: -1 * len(ext)]) for filename in os.listdir(directory)
+                        if filename.endswith(ext)])
         if len(parts) > 0:
-            print("Found index files with ext %s" % extension)
+            extension = ext
+            print("Found %d index files with ext %s" % (len(parts), extension))
             break
     if len(parts) == 0:
         raise ValueError("found no index embeddings files")
@@ -60,7 +61,6 @@ def get_parts_ext(directory):
     # Integer-sortedness matters.
     parts_paths = [os.path.join(directory, '{}{}'.format(filename, extension)) for filename in parts]
     samples_paths = [os.path.join(directory, '{}.sample'.format(filename)) for filename in parts]
-
     return parts, parts_paths, samples_paths
 
 def load_index_part_torch(filename, verbose=True):
@@ -73,7 +73,7 @@ def load_index_part_torchhalf(filename, verbose=True):
 def load_index_part_numpy(filename):
     filename = filename.replace(".pt", ".np")
     #torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r'))
-    return torch.from_numpy(np.load(filename, dtype=np.float16, mode='r'))
+    return torch.from_numpy(np.load(filename, mode='r'))
 
 class TorchStorageIndexManager(IndexManager):
     """
@@ -92,11 +92,12 @@ class NumpyIndexManager(IndexManager):
     """
     def save(self, tensor, output_file):
         import numpy as np
-        output_file = output_file.replace(".pt", ".npm")
-        memmap = np.memmap(output_file, dtype=np.float16, mode='w+', shape=tensor.shape)
-        memmap[ : ] = tensor[ : ]
-        memmap.flush()
-        del(memmap)
+        output_file = output_file.replace(".pt", ".np")
+        np.save(output_file, tensor.detach().numpy())
+        #memmap = np.memmap(output_file, dtype=np.float16, mode='w+', shape=tensor.shape)
+        #memmap[ : ] = tensor[ : ]
+        #memmap.flush()
+        #del(memmap)
 
 class CollectionEncoder():
     def __init__(self, args, process_idx, num_processes, indexmgr=None):
@@ -128,11 +129,10 @@ def __init__(self, args, process_idx, num_processes, indexmgr=None):
     
         if indexmgr == 'numpy':
             self.indexmgr = NumpyIndexManager(args.dim)
-            import colbert.indexing.index_manager
-            colbert.indexing.index_manager.load_index_part = load_index_part_numpy
-            import colbert.indexing.loaders
-            colbert.indexing.loaders.get_parts = get_parts_ext
-
+            import colbert.indexing.index_manager, colbert.indexing.loaders, colbert.indexing.faiss
+            colbert.indexing.faiss.load_index_part = colbert.indexing.index_manager.load_index_part = load_index_part_numpy
+            colbert.indexing.faiss.get_parts = colbert.indexing.loaders.get_parts = get_parts_ext
+            
         elif indexmgr == 'half':
             self.indexmgr = TorchStorageIndexManager(args.dim)
         else:
diff --git a/pyterrier_colbert/ranking.py b/pyterrier_colbert/ranking.py
index 905d0a2..eb402e4 100644
--- a/pyterrier_colbert/ranking.py
+++ b/pyterrier_colbert/ranking.py
@@ -63,7 +63,7 @@ def __init__(self, file_path, file_doclens):
         self.endpos = np.cumsum(self.doclens)
         self.startpos = self.endpos - self.doclens
         import numpy as np
-        self.mmap = torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r'))
+        self.mmap = torch.from_numpy(np.load(file_path, mmap_mode='r'))
         print(self.mmap.shape)
  
     def get_embedding(self, pid):
@@ -79,7 +79,7 @@ def __init__(self, file_path, file_doclens):
         self.endpos = np.cumsum(self.doclens)
         self.startpos = self.endpos - self.doclens
         import numpy as np
-        self.mmap = torch.from_numpy(np.load(file_path, dtype=np.uint64, mode='r'))
+        self.mmap = torch.from_numpy(np.load(file_path))
         print(self.mmap.shape)
  
     def get_embedding(self, pid):
diff --git a/tests/test_indexing.py b/tests/test_indexing.py
index a6802a9..93a2d6a 100644
--- a/tests/test_indexing.py
+++ b/tests/test_indexing.py
@@ -109,7 +109,7 @@ def test_indexing_1doc_torch_mem(self):
     # def test_indexing_1doc_torch_mem(self):
     #     self._indexing_1doc('half', "mmap")
 
-    def test_indexing_1doc_numpy(self):
+    def test_indexing_1doc_numpy_mem(self):
         self._indexing_1doc('numpy', 'numpy')
     
     def test_indexing_1doc_numpy_mmap(self):

From 671d0daa0ad8eed20c11ca862445e18a9853f865 Mon Sep 17 00:00:00 2001
From: Craig Macdonald <craig.macdonald@glasgow.ac.uk>
Date: Tue, 5 Oct 2021 20:41:42 +0100
Subject: [PATCH 07/11] fix for numpy.load

---
 pyterrier_colbert/indexing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py
index 5d53335..012bf61 100644
--- a/pyterrier_colbert/indexing.py
+++ b/pyterrier_colbert/indexing.py
@@ -73,7 +73,7 @@ def load_index_part_torchhalf(filename, verbose=True):
 def load_index_part_numpy(filename):
     filename = filename.replace(".pt", ".np")
     #torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r'))
-    return torch.from_numpy(np.load(filename, mode='r'))
+    return torch.from_numpy(np.load(filename))
 
 class TorchStorageIndexManager(IndexManager):
     """

From 05fc0e67a847f1f019a852e094be527a3b0c9ed1 Mon Sep 17 00:00:00 2001
From: Craig Macdonald <craig.macdonald@glasgow.ac.uk>
Date: Tue, 5 Oct 2021 22:04:01 +0100
Subject: [PATCH 08/11] fixes for numpy

---
 pyterrier_colbert/indexing.py | 13 +++++++------
 pyterrier_colbert/ranking.py  |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py
index 012bf61..1eb8259 100644
--- a/pyterrier_colbert/indexing.py
+++ b/pyterrier_colbert/indexing.py
@@ -43,7 +43,7 @@
 
 def get_parts_ext(directory):
     #extension of get_parts to check for other file types
-    extensions = ['.pt', '.np', '.store']
+    extensions = ['.pt', '.npy', '.store']
 
     parts=[]
     for ext in extensions:
@@ -71,7 +71,7 @@ def load_index_part_torchhalf(filename, verbose=True):
     return torch.load(filename)
 
 def load_index_part_numpy(filename):
-    filename = filename.replace(".pt", ".np")
+    filename = filename.replace(".pt", ".npy")
     #torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r'))
     return torch.from_numpy(np.load(filename))
 
@@ -92,7 +92,7 @@ class NumpyIndexManager(IndexManager):
     """
     def save(self, tensor, output_file):
         import numpy as np
-        output_file = output_file.replace(".pt", ".np")
+        output_file = output_file.replace(".pt", ".npy")
         np.save(output_file, tensor.detach().numpy())
         #memmap = np.memmap(output_file, dtype=np.float16, mode='w+', shape=tensor.shape)
         #memmap[ : ] = tensor[ : ]
@@ -130,12 +130,13 @@ def __init__(self, args, process_idx, num_processes, indexmgr=None):
         if indexmgr == 'numpy':
             self.indexmgr = NumpyIndexManager(args.dim)
             import colbert.indexing.index_manager, colbert.indexing.loaders, colbert.indexing.faiss
-            colbert.indexing.faiss.load_index_part = colbert.indexing.index_manager.load_index_part = load_index_part_numpy
-            colbert.indexing.faiss.get_parts = colbert.indexing.loaders.get_parts = get_parts_ext
-            
+            colbert.indexing.faiss.load_index_part = load_index_part_numpy
+            colbert.indexing.faiss.get_parts = colbert.indexing.loaders.get_parts = get_parts_ext  
         elif indexmgr == 'half':
+            assert False
             self.indexmgr = TorchStorageIndexManager(args.dim)
         else:
+            colbert.indexing.faiss.load_index_part = colbert.indexing.index_manager.load_index_part
             self.indexmgr = IndexManager(args.dim)
 
     def _initialize_iterator(self):
diff --git a/pyterrier_colbert/ranking.py b/pyterrier_colbert/ranking.py
index eb402e4..c2baa88 100644
--- a/pyterrier_colbert/ranking.py
+++ b/pyterrier_colbert/ranking.py
@@ -63,7 +63,7 @@ def __init__(self, file_path, file_doclens):
         self.endpos = np.cumsum(self.doclens)
         self.startpos = self.endpos - self.doclens
         import numpy as np
-        self.mmap = torch.from_numpy(np.load(file_path, mmap_mode='r'))
+        self.mmap = torch.from_numpy(np.load(file_path, mmap_mode='r+'))
         print(self.mmap.shape)
  
     def get_embedding(self, pid):
@@ -74,7 +74,7 @@ def get_embedding(self, pid):
 class numpy_file_part_mem:
     def __init__(self, file_path, file_doclens):
         self.dim = 128 # TODO        
-        file_path = file_path.replace(".pt", ".np")
+        file_path = file_path.replace(".pt", ".npy")
         self.doclens = file_doclens
         self.endpos = np.cumsum(self.doclens)
         self.startpos = self.endpos - self.doclens

From 72dcf739baf38a173ccb2c78831811dde50ad1b9 Mon Sep 17 00:00:00 2001
From: Craig Macdonald <craig.macdonald@glasgow.ac.uk>
Date: Tue, 5 Oct 2021 22:16:44 +0100
Subject: [PATCH 09/11] more fixes

---
 pyterrier_colbert/indexing.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py
index 1eb8259..d849866 100644
--- a/pyterrier_colbert/indexing.py
+++ b/pyterrier_colbert/indexing.py
@@ -47,6 +47,8 @@ def get_parts_ext(directory):
 
     parts=[]
     for ext in extensions:
+        print([filename for filename in os.listdir(directory)])
+        print([filename for filename in os.listdir(directory) if filename.endswith(ext)])
         parts = sorted([int(filename[: -1 * len(ext)]) for filename in os.listdir(directory)
                         if filename.endswith(ext)])
         if len(parts) > 0:
@@ -54,7 +56,7 @@ def get_parts_ext(directory):
             print("Found %d index files with ext %s" % (len(parts), extension))
             break
     if len(parts) == 0:
-        raise ValueError("found no index embeddings files")
+        raise ValueError("found no index embedding files")
 
     assert list(range(len(parts))) == parts, parts
 
@@ -127,15 +129,16 @@ def __init__(self, args, process_idx, num_processes, indexmgr=None):
 
         self._load_model()
     
+        import colbert.indexing.index_manager, colbert.indexing.loaders, colbert.indexing.faiss
         if indexmgr == 'numpy':
             self.indexmgr = NumpyIndexManager(args.dim)
-            import colbert.indexing.index_manager, colbert.indexing.loaders, colbert.indexing.faiss
             colbert.indexing.faiss.load_index_part = load_index_part_numpy
-            colbert.indexing.faiss.get_parts = colbert.indexing.loaders.get_parts = get_parts_ext  
+            colbert.indexing.faiss.get_parts = colbert.indexing.loaders.get_parts = get_parts_ext
         elif indexmgr == 'half':
             assert False
             self.indexmgr = TorchStorageIndexManager(args.dim)
         else:
+            colbert.indexing.faiss.get_parts = colbert.indexing.loaders.get_parts = get_parts_ext
             colbert.indexing.faiss.load_index_part = colbert.indexing.index_manager.load_index_part
             self.indexmgr = IndexManager(args.dim)
 

From 196775d470fb026b21c2e9896d06edd7134a7a17 Mon Sep 17 00:00:00 2001
From: Craig Macdonald <craig.macdonald@glasgow.ac.uk>
Date: Tue, 5 Oct 2021 22:43:00 +0100
Subject: [PATCH 10/11] resort to super for other file types

---
 pyterrier_colbert/indexing.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py
index d849866..3782852 100644
--- a/pyterrier_colbert/indexing.py
+++ b/pyterrier_colbert/indexing.py
@@ -83,6 +83,9 @@ class TorchStorageIndexManager(IndexManager):
     """
 
     def save(self, tensor, output_file):
+        if not output_file.endswith(".pt"):
+            # for .ids, .sample etc, resort to torch.save
+            return super().save(tensor, output_file)
         output_file = output_file.replace(".pt", ".store")
         size = tensor.shape[0] * tensor.shape[1]
         out_tensor = torch.HalfStorage.from_file(output_file, True, size)
@@ -93,6 +96,9 @@ class NumpyIndexManager(IndexManager):
     A ColBERT IndexManager for numpy files, which support both mmap and direct loading
     """
     def save(self, tensor, output_file):
+        if not output_file.endswith(".pt"):
+            # for .ids, .sample etc, resort to torch.save
+            return super().save(tensor, output_file)
         import numpy as np
         output_file = output_file.replace(".pt", ".npy")
         np.save(output_file, tensor.detach().numpy())

From 53369182b8179715634626137cd6a004ae4643c5 Mon Sep 17 00:00:00 2001
From: Craig Macdonald <craig.macdonald@glasgow.ac.uk>
Date: Tue, 5 Oct 2021 22:55:37 +0100
Subject: [PATCH 11/11] more numpy work - resort to torch for sample, tokenids
 etc

---
 pyterrier_colbert/indexing.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/pyterrier_colbert/indexing.py b/pyterrier_colbert/indexing.py
index 3782852..6962cf8 100644
--- a/pyterrier_colbert/indexing.py
+++ b/pyterrier_colbert/indexing.py
@@ -73,9 +73,12 @@ def load_index_part_torchhalf(filename, verbose=True):
     return torch.load(filename)
 
 def load_index_part_numpy(filename):
-    filename = filename.replace(".pt", ".npy")
-    #torch.from_numpy(np.memmap(file_path, dtype=np.uint64, mode='r'))
-    return torch.from_numpy(np.load(filename))
+    if filename.endswith(".pt"):
+        filename = filename.replace(".pt", ".npy")
+        return torch.from_numpy(np.load(filename))
+    else:
+        #resort to torch for sample, etc
+        return torch.load(filename)
 
 class TorchStorageIndexManager(IndexManager):
     """