From 4813f1e702cd066e917462c95b0759dbee424f0d Mon Sep 17 00:00:00 2001
From: Craig Macdonald
Date: Sat, 21 Oct 2023 14:20:22 +0100
Subject: [PATCH] Add tests and GitHub Action (#17)

* add .gitignore
* sentence transformers is optional
* add npids as dependency
* add gha
* WIP - none of these tests pass
* add error detection for common misuse
* add memory index testing
* not sure why overwrite is needed
* bump action versions
* update tests to use FlexIndex
* test_flexindex
* test_flexindex
* test_models
* more test cases
* more test cases in test_models
* more test cases in test_models; added default transformations to FlexIndex
* removed expensive (and so far unused) GA dependencies
* lazy faiss dependency
* more tests
* subtests in pytest
* fix empty
* update model cache key
* fixing some broken gha tests
* more tests, fixed bug with torch_retriever, fixed rank indexing
* fix dependency cache
* more tests
* scann test
* better np_vec_loader test cases
* fix indentation in scann_retr.py
* fix gha dependency caching
* optional dependency package assertions
* fix faiss_retr.py indentation
* support for ivf when cuda not available
* more tests, correct rank index, etc.
* no more dependency caching; doesn't seem to work properly
* better re-ranking test case
* test_torch_vecs
* make test skip conditions consistent

---------

Co-authored-by: Sean MacAvaney
---
 .github/workflows/push.yml        |  51 +++++++
 .gitignore                        | 131 +++++++++++++++++
 pyterrier_dr/__init__.py          |   2 +-
 pyterrier_dr/biencoder.py         |   3 +
 pyterrier_dr/flex/core.py         |  24 ++--
 pyterrier_dr/flex/corpus_graph.py |  13 +-
 pyterrier_dr/flex/faiss_retr.py   | 167 ++++++++++++----------
 pyterrier_dr/flex/gar.py          |   2 +-
 pyterrier_dr/flex/np_retr.py      |   1 -
 pyterrier_dr/flex/scann_retr.py   |  63 ++++-----
 pyterrier_dr/flex/torch_retr.py   |  10 +-
 pyterrier_dr/hgf_models.py        |   4 +
 pyterrier_dr/indexes.py           |   9 +-
 pyterrier_dr/sbert_models.py      |   3 +-
 pyterrier_dr/tctcolbert_model.py  |  34 +----
 pyterrier_dr/util.py              |  34 ++++-
 requirements-dev.txt              |   3 +
 requirements.txt                  |   2 +
 tests/__init__.py                 |   0
 tests/test_flexindex.py           | 228 ++++++++++++++++++++++++++++++
 tests/test_models.py              | 148 +++++++++++++++++++
 21 files changed, 770 insertions(+), 162 deletions(-)
 create mode 100644 .github/workflows/push.yml
 create mode 100644 .gitignore
 create mode 100644 requirements-dev.txt
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_flexindex.py
 create mode 100644 tests/test_models.py

diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
new file mode 100644
index 0000000..e7b35b2
--- /dev/null
+++ b/.github/workflows/push.yml
@@ -0,0 +1,51 @@
+name: Test Python package
+
+on: [push, pull_request]
+
+jobs:
+  build:
+
+    strategy:
+      matrix:
+        python: [3.8]
+        java: [13]
+        os: ['ubuntu-latest'] #
+        architecture: ['x64']
+        terrier: ['snapshot'] #'5.3', '5.4-SNAPSHOT',
+
+    runs-on: ${{ matrix.os }}
+    steps:
+
+    - uses: actions/checkout@v3
+
+    - name: Setup java
+      uses: actions/setup-java@v3
+      with:
+        java-version: ${{ matrix.java }}
+        architecture: ${{ matrix.architecture }}
+        distribution: 'zulu'
+
+    - name: Setup conda
+      uses: s-weigand/setup-conda@v1
+      with:
+        python-version: ${{ matrix.python }}
+        conda-channels: anaconda, conda-forge
+        activate-conda: true
+
+    # follows https://medium.com/ai2-blog/python-caching-in-github-actions-e9452698e98d
+    - name: Loading Torch models from cache
+      uses: actions/cache@v3
+      with:
+        path: /home/runner/.cache/
+        key: model-cache
+
+    - name: Install Python dependencies
+      run: |
+        pip install --upgrade --upgrade-strategy eager -r requirements.txt -r requirements-dev.txt
+        conda install -c pytorch faiss-cpu=1.7.4 mkl=2021 blas=1.0=mkl
+
+    - name: All unit tests
+      env:
+        TERRIER_VERSION: ${{ matrix.terrier }}
+      run: |
+        pytest -s
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0741e16
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,131 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+.vscode/
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/pyterrier_dr/__init__.py b/pyterrier_dr/__init__.py
index 1adb6e4..b5894cc 100644
--- a/pyterrier_dr/__init__.py
+++ b/pyterrier_dr/__init__.py
@@ -3,6 +3,6 @@ from .flex import FlexIndex
 from .biencoder import BiEncoder, BiQueryEncoder, BiDocEncoder, BiScorer
 from .hgf_models import HgfBiEncoder, TasB, RetroMAE
-from .sbert_models import SBertBiEncoder, Ance, Query2Query
+from .sbert_models import SBertBiEncoder, Ance, Query2Query, GTR
 from .tctcolbert_model import TctColBert
 from .electra import ElectraScorer
diff --git a/pyterrier_dr/biencoder.py b/pyterrier_dr/biencoder.py
index 6643aff..c9a2a35 100644
--- a/pyterrier_dr/biencoder.py
+++ b/pyterrier_dr/biencoder.py
@@ -24,6 +24,9 @@ def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
         columns = set(inp.columns)
         modes = [
             (['qid', 'query', self.text_field], self.scorer),
+            (['qid', 'query_vec', self.text_field], self.scorer),
+            (['qid', 'query', 'doc_vec'], self.scorer),
+            (['qid', 'query_vec', 'doc_vec'], self.scorer),
             (['query'], self.query_encoder),
             ([self.text_field], self.doc_encoder),
         ]
diff --git a/pyterrier_dr/flex/core.py b/pyterrier_dr/flex/core.py
index e90a4c9..c25035b 100644
--- a/pyterrier_dr/flex/core.py
+++ b/pyterrier_dr/flex/core.py
@@ -1,15 +1,9 @@
-import re
-import math
-import os
 import shutil
-import threading
-import struct
-import tempfile
 import itertools
 import json
 from pathlib import Path
+from warnings import warn
 import numpy as np
-import shutil
 import more_itertools
 import pandas as pd
 import pyterrier as pt
@@ -17,7 +11,7 @@ from npids import Lookup
 from enum import Enum
 from .. import SimFn
-from ..indexes import RankedLists, TorchRankedLists
+from ..indexes import RankedLists
 import ir_datasets
 import torch
@@ -91,6 +85,20 @@ def index(self, inp):
         with open(path/'pt_meta.json', 'wt') as f_meta:
             json.dump({"type": "dense_index", "format": "flex", "vec_size": vec_size, "doc_count": count}, f_meta)
+    def transform(self, inp):
+        columns = set(inp.columns)
+        modes = [
+            (['qid', 'query_vec'], self.np_retriever, "performing exhaustive search with FlexIndex.np_retriever -- note that other FlexIndex retrievers may be faster"),
+        ]
+        for fields, fn, note in modes:
+            if all(f in columns for f in fields):
+                warn(f'based on input columns {list(columns)}, {note}')
+                return fn()(inp)
+        message = f'Unexpected input with columns: {inp.columns}. Supports:'
+        for fields, fn, note in modes:
+            message += f'\n - {fn.__doc__.strip()}: {fields}'
+        raise RuntimeError(message)
+
     def get_corpus_iter(self, start_idx=None, stop_idx=None, verbose=True):
         docnos, dvecs, meta = self.payload()
         docno_iter = iter(docnos)
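
Aside: a minimal sketch of what the new default transform enables (illustration only; the index path and 100-dim vectors are assumptions, not from the patch). Calling a built FlexIndex directly on a frame of encoded queries now warns and falls back to exhaustive np_retriever search instead of raising:

    import numpy as np
    import pandas as pd
    from pyterrier_dr import FlexIndex

    index = FlexIndex('./my_index.flex')  # assumption: an index built earlier with 100-dim vectors
    queries = pd.DataFrame([{'qid': 'q1', 'query_vec': np.random.rand(100).astype(np.float32)}])
    res = index(queries)  # warns, then dispatches to index.np_retriever()

diff --git a/pyterrier_dr/flex/corpus_graph.py b/pyterrier_dr/flex/corpus_graph.py
index b10b367..47e5a57 100644
--- a/pyterrier_dr/flex/corpus_graph.py
+++ b/pyterrier_dr/flex/corpus_graph.py
@@ -4,6 +4,7 @@ import ir_datasets
 import torch
 import numpy as np
+import pyterrier_dr
 from ..indexes import TorchRankedLists
 from . 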
import FlexIndex @@ -40,20 +41,22 @@ def _build_corpus_graph(flex_index, k, out_dir, batch_size): out_dir.mkdir(parents=True, exist_ok=True) edges_path = out_dir/'edges.u32.np' weights_path = out_dir/'weights.f16.np' + device = pyterrier_dr.util.infer_device() + dtype = torch.half if device.type == 'cuda' else torch.float with logger.pbar_raw(total=int((num_chunks+1)*num_chunks/2), unit='chunk', smoothing=1) as pbar, \ ir_datasets.util.finialized_file(str(edges_path), 'wb') as fe, \ ir_datasets.util.finialized_file(str(weights_path), 'wb') as fw: for i in range(num_chunks): - left = torch.from_numpy(vectors[i*S:(i+1)*S]).cuda().half() - left /= left.norm(dim=1, keepdim=True) + left = torch.from_numpy(vectors[i*S:(i+1)*S]).to(device).to(dtype) + left = left / left.norm(dim=1, keepdim=True) scores = left @ left.T - scores[torch.eye(left.shape[0], dtype=bool, device='cuda')] = float('-inf') + scores[torch.eye(left.shape[0], dtype=bool, device=device)] = float('-inf') i_scores, i_dids = scores.topk(k, sorted=True, dim=1) rankings[i].update(i_scores, (i_dids + i*S)) pbar.update() for j in range(i+1, num_chunks): - right = torch.from_numpy(vectors[j*S:(j+1)*S]).cuda().half() - right /= right.norm(dim=1, keepdim=True) + right = torch.from_numpy(vectors[j*S:(j+1)*S]).to(device).to(dtype) + right = right / right.norm(dim=1, keepdim=True) scores = left @ right.T i_scores, i_dids = scores.topk(min(k, right.shape[0]), sorted=True, dim=1) j_scores, j_dids = scores.topk(min(k, left.shape[0]), sorted=True, dim=0) diff --git a/pyterrier_dr/flex/faiss_retr.py b/pyterrier_dr/flex/faiss_retr.py index a43b1b7..0ad207d 100644 --- a/pyterrier_dr/flex/faiss_retr.py +++ b/pyterrier_dr/flex/faiss_retr.py @@ -8,6 +8,7 @@ import numpy as np import tempfile import ir_datasets +import pyterrier_dr from . import FlexIndex logger = ir_datasets.log.easy() @@ -60,60 +61,62 @@ def transform(self, inp): def _faiss_flat_retriever(self, gpu=False, qbatch=64): - import faiss - if 'faiss_flat' not in self._cache: - meta, = self.payload(return_dvecs=False, return_docnos=False) - with tempfile.TemporaryDirectory() as tmp, logger.duration('reading faiss flat'): - DUMMY = b'\x00\x00\x10\x00\x00\x00\x00\x00' - # [ type ] [ train ] [ metric (dot) ] - header = b'IxFI' + struct.pack(' pd.DataFrame: else: raise ValueError(f'{self.flex_index.sim_fn} not supported') num_q = query_vecs.shape[0] - res = [] ranked_lists = RankedLists(self.num_results, num_q) batch_it = range(0, dvecs.shape[0], self.batch_size) if self.flex_index.verbose: diff --git a/pyterrier_dr/flex/scann_retr.py b/pyterrier_dr/flex/scann_retr.py index 4502ad1..628d912 100644 --- a/pyterrier_dr/flex/scann_retr.py +++ b/pyterrier_dr/flex/scann_retr.py @@ -1,12 +1,11 @@ import pandas as pd import math -import struct import os import pyterrier as pt import itertools import numpy as np -import tempfile import ir_datasets +import pyterrier_dr from . 
import FlexIndex logger = ir_datasets.log.easy() @@ -48,37 +47,37 @@ def transform(self, inp): def _scann_retriever(self, n_leaves=None, leaves_to_search=1, train_sample=None): - import scann - assert not hasattr(scann.scann_ops_pybind, 'builder'), "scann==1.0.0 required; install from wheel here: " - dvecs, meta, = self.payload(return_docnos=False) + pyterrier_dr.util.assert_scann() + import scann + dvecs, meta, = self.payload(return_docnos=False) - if n_leaves is None: - # rule of thumb: sqrt(doc_count) (from ) - n_leaves = math.ceil(math.sqrt(meta['doc_count'])) - # we'll shift it to the nearest power of 2 - n_leaves = int(1 << math.ceil(math.log2(n_leaves))) + if n_leaves is None: + # rule of thumb: sqrt(doc_count) (from ) + n_leaves = math.ceil(math.sqrt(meta['doc_count'])) + # we'll shift it to the nearest power of 2 + n_leaves = int(1 << math.ceil(math.log2(n_leaves))) - if train_sample is None: - train_sample = n_leaves * 39 - train_sample = min(train_sample, meta['doc_count']) - elif 0 <= train_sample <= 1: - train_sample = math.ceil(meta['doc_count'] * train_sample) + if train_sample is None: + train_sample = n_leaves * 39 + train_sample = min(train_sample, meta['doc_count']) + elif 0 <= train_sample <= 1: + train_sample = math.ceil(meta['doc_count'] * train_sample) - key = ('scann', n_leaves, train_sample) - index_name = f'scann_leaves-{n_leaves}_train-{train_sample}.scann' - if key not in self._cache: - if not os.path.exists(self.index_path/index_name): - with logger.duration(f'building scann index with {n_leaves} leaves'): - searcher = scann.ScannBuilder(dvecs, 10, "dot_product") # neighbours=10; doesn't seem to affect the model (?) - searcher = searcher.tree(num_leaves=n_leaves, num_leaves_to_search=leaves_to_search, training_sample_size=train_sample, quantize_centroids=True) - searcher = searcher.score_brute_force() - searcher = searcher.create_pybind() - with logger.duration('saving scann index'): - (self.index_path/index_name).mkdir() - searcher.serialize(str(self.index_path/index_name)) - self._cache[key] = searcher - else: - with logger.duration('reading index'): - self._cache[key] = scann.scann_ops_pybind.load_searcher(dvecs, str(self.index_path/index_name)) - return ScannRetriever(self, self._cache[key], leaves_to_search=leaves_to_search) + key = ('scann', n_leaves, train_sample) + index_name = f'scann_leaves-{n_leaves}_train-{train_sample}.scann' + if key not in self._cache: + if not os.path.exists(self.index_path/index_name): + with logger.duration(f'building scann index with {n_leaves} leaves'): + searcher = scann.ScannBuilder(dvecs, 10, "dot_product") # neighbours=10; doesn't seem to affect the model (?) 
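# Aside (illustration, not part of the diff): a worked example of the n_leaves /
# train_sample heuristics above, assuming a corpus of 2,000 documents as in the tests:
#   n_leaves     = ceil(sqrt(2000)) = 45 -> next power of 2 -> 64
#   train_sample = 64 * 39 = 2496 -> capped at doc_count -> 2000
import math
doc_count = 2000
n_leaves = 1 << math.ceil(math.log2(math.ceil(math.sqrt(doc_count))))  # 64
train_sample = min(n_leaves * 39, doc_count)  # 2000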
+ searcher = searcher.tree(num_leaves=n_leaves, num_leaves_to_search=leaves_to_search, training_sample_size=train_sample, quantize_centroids=True) + searcher = searcher.score_brute_force() + searcher = searcher.create_pybind() + with logger.duration('saving scann index'): + (self.index_path/index_name).mkdir() + searcher.serialize(str(self.index_path/index_name)) + self._cache[key] = searcher + else: + with logger.duration('reading index'): + self._cache[key] = scann.scann_ops_pybind.load_searcher(dvecs, str(self.index_path/index_name)) + return ScannRetriever(self, self._cache[key], leaves_to_search=leaves_to_search) FlexIndex.scann_retriever = _scann_retriever diff --git a/pyterrier_dr/flex/torch_retr.py b/pyterrier_dr/flex/torch_retr.py index 09cc8f6..a8e32d7 100644 --- a/pyterrier_dr/flex/torch_retr.py +++ b/pyterrier_dr/flex/torch_retr.py @@ -58,11 +58,15 @@ def transform(self, inp): scores = batch @ self.torch_vecs.T else: raise ValueError(f'{self.flex_index.sim_fn} not supported') - scores, docids = scores.topk(self.num_results, dim=1) + if scores.shape[1] > self.num_results: + scores, docids = scores.topk(self.num_results, dim=1) + else: + docids = scores.argsort(descending=True, dim=1) + scores = torch.gather(scores, dim=1, index=docids) res_scores.append(scores.cpu().numpy().reshape(-1)) res_docids.append(docids.cpu().numpy().reshape(-1)) - res_idxs.append(np.arange(start_idx, start_idx+batch.shape[0]).reshape(-1, 1).repeat(self.num_results, axis=1).reshape(-1)) - res_ranks.append(np.arange(self.num_results).reshape(1, -1).repeat(batch.shape[0], axis=0).reshape(-1)) + res_idxs.append(np.arange(start_idx, start_idx+batch.shape[0]).reshape(-1, 1).repeat(scores.shape[1], axis=1).reshape(-1)) + res_ranks.append(np.arange(scores.shape[1]).reshape(1, -1).repeat(batch.shape[0], axis=0).reshape(-1)) res_idxs = np.concatenate(res_idxs) res = {k: inp[k][res_idxs] for k in inp.columns if k not in ['docid', 'docno', 'rank', 'score']} res['score'] = np.concatenate(res_scores) diff --git a/pyterrier_dr/hgf_models.py b/pyterrier_dr/hgf_models.py index 61c90ac..280e80e 100644 --- a/pyterrier_dr/hgf_models.py +++ b/pyterrier_dr/hgf_models.py @@ -23,6 +23,8 @@ def encode_queries(self, texts, batch_size=None): inps = {k: v.to(self.device) for k, v in inps.items()} res = self.model(**inps).last_hidden_state[:, 0] # [CLS] embedding results.append(res.cpu().numpy()) + if not results: + return np.empty(shape=(0, 0)) return np.concatenate(results, axis=0) def encode_docs(self, texts, batch_size=None): @@ -33,6 +35,8 @@ def encode_docs(self, texts, batch_size=None): inps = {k: v.to(self.device) for k, v in inps.items()} res = self.model(**inps).last_hidden_state[:, 0] results.append(res.cpu().numpy()) + if not results: + return np.empty(shape=(0, 0)) return np.concatenate(results, axis=0) @classmethod diff --git a/pyterrier_dr/indexes.py b/pyterrier_dr/indexes.py index b2f1cbc..c572fab 100644 --- a/pyterrier_dr/indexes.py +++ b/pyterrier_dr/indexes.py @@ -7,7 +7,6 @@ from os.path import commonprefix import numpy as np import shutil -import faiss import more_itertools import pandas as pd import pyterrier as pt @@ -283,6 +282,8 @@ def get_corpus_iter(self, start_idx=None, stop_idx=None, verbose=True): class MemIndex(pt.Indexer): def __init__(self, num_results=1000, score_fn='dot', batch_size=4096, verbose=True, dtype='f4', drop_query_vec=True): + # check we havent been passed a destination index, as per disk-based indexers + assert isinstance(num_results, int) self.num_results = num_results self.score_fn = 
score_fn self.verbose = verbose @@ -368,7 +369,7 @@ def get_corpus_iter(self, start_idx=None, stop_idx=None, verbose=True): class RankedLists: - def __init__(self, num_results, num_queries): + def __init__(self, num_results : int, num_queries : int): self.num_results = num_results self.num_queries = num_queries self.scores = np.empty((num_queries, 0), dtype='f4') @@ -434,6 +435,7 @@ def __init__(self, index_path=None, num_results=1000, shard_size=500_000, score_ self._shards = list(self.iter_shards()) def iter_shards(self): + import faiss if self.cuda: res = faiss.StandardGpuResources() if self._shards is None: @@ -500,6 +502,7 @@ def transform(self, inp): return res def index(self, inp): + import faiss if isinstance(inp, pd.DataFrame): inp = inp.to_dict(orient="records") path = Path(self.index_path) @@ -550,6 +553,7 @@ def iter_shards(self, cache=True): return iter(self._shards) def _iter_shards(self): + import faiss for shardid in itertools.count(): if not (self.index_path/f'{shardid}.faiss').exists(): break @@ -597,6 +601,7 @@ def transform(self, inp): return res def index(self, inp): + import faiss if isinstance(inp, pd.DataFrame): inp = inp.to_dict(orient="records") path = Path(self.index_path) diff --git a/pyterrier_dr/sbert_models.py b/pyterrier_dr/sbert_models.py index 7a92b83..c873a8c 100644 --- a/pyterrier_dr/sbert_models.py +++ b/pyterrier_dr/sbert_models.py @@ -3,7 +3,6 @@ import torch import pyterrier as pt from transformers import AutoConfig -from sentence_transformers import SentenceTransformer from .biencoder import BiEncoder, BiQueryEncoder from .util import Variants from tqdm import tqdm @@ -23,6 +22,7 @@ def __init__(self, model_name, batch_size=32, text_field='text', verbose=False, if device is None: device = 'cuda' if torch.cuda.is_available() else 'cpu' self.device = torch.device(device) + from sentence_transformers import SentenceTransformer self.model = SentenceTransformer(model_name).to(self.device).eval() self.config = AutoConfig.from_pretrained(model_name) @@ -66,6 +66,7 @@ def __init__(self, model_name=DEFAULT_MODEL_NAME, batch_size=32, verbose=False, if device is None: device = 'cuda' if torch.cuda.is_available() else 'cpu' self.device = torch.device(device) + from sentence_transformers import SentenceTransformer self.model = SentenceTransformer(model_name).to(self.device).eval() self.batch_size = batch_size self.verbose = verbose diff --git a/pyterrier_dr/tctcolbert_model.py b/pyterrier_dr/tctcolbert_model.py index 216e80d..c87bb7f 100644 --- a/pyterrier_dr/tctcolbert_model.py +++ b/pyterrier_dr/tctcolbert_model.py @@ -2,6 +2,7 @@ import numpy as np import torch from transformers import AutoTokenizer, AutoModel +import pyterrier as pt from . 
import BiEncoder
@@ -24,6 +25,8 @@ def encode_queries(self, texts, batch_size=None):
             res = self.model(**inps).last_hidden_state
             res = res[:, 4:, :].mean(dim=1) # remove the first 4 tokens (representing [CLS] [ Q ]), and average
             results.append(res.cpu().numpy())
+        if not results:
+            return np.empty(shape=(0, 0))
         return np.concatenate(results, axis=0)

     def encode_docs(self, texts, batch_size=None):
@@ -39,36 +42,9 @@ def encode_docs(self, texts, batch_size=None):
             lens[lens == 0] = 1 # avoid edge case of div0 errors
             res = res.sum(dim=1) / lens # average based on dim
             results.append(res.cpu().numpy())
+        if not results:
+            return np.empty(shape=(0, 0))
         return np.concatenate(results, axis=0)

     def __repr__(self):
         return f'TctColBert({repr(self.model_name)})'
-
-    def _transform_D(self, inp):
-        """
-        Document vectorisation
-        """
-        res = self._encode_docs(inp[self.text_field])
-        return inp.assign(doc_vec=[res[i] for i in range(res.shape[0])])
-
-    def _transform_Q(self, inp):
-        """
-        Query vectorisation
-        """
-        it = inp['query']
-        if self.verbose:
-            it = pt.tqdm(it, desc='Encoding Queies', unit='query')
-        res = self._encode_queries(it)
-        return inp.assign(query_vec=[res[i] for i in range(res.shape[0])])
-
-    def _transform_R(self, inp):
-        """
-        Result re-ranking
-        """
-        return pt.apply.by_query(self._transform_R_byquery, add_ranks=True, verbose=self.verbose)(inp)
-
-    def _transform_R_byquery(self, query_df):
-        query_rep = self._encode_queries([query_df['query'].iloc[0]])
-        doc_reps = self._encode_docs(query_df[self.text_field])
-        scores = (query_rep * doc_reps).sum(axis=1)
-        query_df['score'] = scores
-        return query_df
diff --git a/pyterrier_dr/util.py b/pyterrier_dr/util.py
index 67ed437..2c1fc71 100644
--- a/pyterrier_dr/util.py
+++ b/pyterrier_dr/util.py
@@ -1,9 +1,12 @@
 from enum import Enum
+import torch
+

 class SimFn(Enum):
     dot = 'dot'
     cos = 'cos'

+
 class Variants(type):
     def __getattr__(cls, name):
         if name in cls.VARIANTS:
@@ -14,7 +17,36 @@ def wrapped(*args, **kwargs):
     def __init__(self, *args, **kwargs):
         return super().__init__(*args, **kwargs)

-def infer_device(device):
+
+def infer_device(device=None):
     if device is None:
         device = 'cuda' if torch.cuda.is_available() else 'cpu'
     return torch.device(device)
+
+
+def package_available(name):
+    try:
+        __import__(name)
+        return True
+    except ImportError:
+        return False
+
+
+def faiss_available():
+    return package_available('faiss')
+
+
+def assert_faiss():
+    assert faiss_available(), "faiss required; install using instructions here: "
+
+
+def scann_available():
+    if not package_available('scann'):
+        return False
+    import scann
+    # version 1.0.0 is required; detect via scann.scann_ops_pybind.builder
+    return hasattr(scann.scann_ops_pybind, 'builder')
+
+
+def assert_scann():
+    assert scann_available(), "scann==1.0.0 required; install from wheel here: "
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..7d35357
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,3 @@
+pytest
+pytest-subtests
+git+https://github.com/terrierteam/pyterrier_adaptive
diff --git a/requirements.txt b/requirements.txt
index e5be8f3..2f4ce13 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,5 @@ transformers
 python-terrier>=0.9.1
 torch
 numpy
+npids
+sentence_transformers
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
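
Aside: a minimal sketch of how the new optional-dependency helpers above are intended to be used (illustration only, not part of the patch): faiss_available() probes whether the package can be imported, while assert_faiss() fails fast with install guidance before any faiss-backed component is constructed:

    import pyterrier_dr

    if pyterrier_dr.util.faiss_available():
        import faiss  # safe: the package is importable
    else:
        print('faiss not installed; faiss-backed FlexIndex retrievers are unavailable')

    pyterrier_dr.util.assert_faiss()  # or fail fast with an actionable message
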
diff --git a/tests/test_flexindex.py b/tests/test_flexindex.py
new file mode 100644
index 0000000..43febd6
--- /dev/null
+++ b/tests/test_flexindex.py
@@ -0,0 +1,228 @@
+import tempfile
+import unittest
+import numpy as np
+import pandas as pd
+import pyterrier as pt
+import pyterrier_dr
+from pyterrier_dr import FlexIndex
+
+
+class TestFlexIndex(unittest.TestCase):
+
+    def _generate_data(self, count=2000, dim=100):
+        def random_unit_vec():
+            v = np.random.rand(dim).astype(np.float32)
+            return v / np.linalg.norm(v)
+        return [
+            {'docno': str(i), 'doc_vec': random_unit_vec()}
+            for i in range(count)
+        ]
+
+    def test_index_typical(self):
+        destdir = tempfile.mkdtemp()
+        self.test_dirs.append(destdir)
+        index = FlexIndex(destdir+'/index')
+
+        self.assertFalse(index.built())
+
+        dataset = self._generate_data()
+
+        index.index(dataset)
+
+        self.assertTrue(index.built())
+
+        self.assertEqual(len(index), len(dataset))
+
+        stored_dataset = list(index.get_corpus_iter())
+        self.assertEqual(len(stored_dataset), len(dataset))
+        for a, b in zip(stored_dataset, dataset):
+            self.assertEqual(a['docno'], b['docno'])
+            self.assertTrue((a['doc_vec'] == b['doc_vec']).all())
+
+    def test_corpus_graph(self):
+        destdir = tempfile.mkdtemp()
+        self.test_dirs.append(destdir)
+        index = FlexIndex(destdir+'/index')
+        dataset = self._generate_data()
+        index.index(dataset)
+
+        graph = index.corpus_graph(16)
+        self.assertEqual(graph.neighbours(4).shape, (16,))
+
+    @unittest.skipIf(not pyterrier_dr.util.faiss_available(), "faiss not available")
+    def test_faiss_hnsw_graph(self):
+        destdir = tempfile.mkdtemp()
+        self.test_dirs.append(destdir)
+        index = FlexIndex(destdir+'/index')
+        dataset = self._generate_data()
+        index.index(dataset)
+
+        graph = index.faiss_hnsw_graph(16)
+        self.assertEqual(graph.neighbours(4).shape, (16,))
+
+    def _test_retr(self, Retr, exact=True):
+        with self.subTest('basic'):
+            destdir = tempfile.mkdtemp()
+            self.test_dirs.append(destdir)
+            index = FlexIndex(destdir+'/index')
+            dataset = self._generate_data(count=2000)
+            index.index(dataset)
+
+            retr = Retr(index)
+            res = retr(pd.DataFrame([
+                {'qid': '0', 'query_vec': dataset[0]['doc_vec']},
+                {'qid': '1', 'query_vec': dataset[1]['doc_vec']},
+            ]))
+            self.assertTrue(all(c in res.columns for c in ['qid', 'docno', 'rank', 'score']))
+            if exact:
+                self.assertEqual(len(res), 2000)
+                self.assertEqual(len(res[res.qid=='0']), 1000)
+                self.assertEqual(len(res[res.qid=='1']), 1000)
+                self.assertEqual(res[(res.qid=='0')&((res['rank']==0))].iloc[0]['docno'], '0')
+                self.assertEqual(res[(res.qid=='1')&((res['rank']==0))].iloc[0]['docno'], '1')
+            else:
+                self.assertTrue(len(res) <= 2000)
+                self.assertTrue(len(res[res.qid=='0']) <= 1000)
+                self.assertTrue(len(res[res.qid=='1']) <= 1000)
+
+        with self.subTest('smaller'):
+            destdir = tempfile.mkdtemp()
+            self.test_dirs.append(destdir)
+            index = FlexIndex(destdir+'/index')
+            dataset = self._generate_data(count=100)
+            index.index(dataset)
+
+            retr = Retr(index)
+            res = retr(pd.DataFrame([
+                {'qid': '0', 'query_vec': dataset[0]['doc_vec']},
+                {'qid': '1', 'query_vec': dataset[1]['doc_vec']},
+            ]))
+            self.assertTrue(all(c in res.columns for c in ['qid', 'docno', 'rank', 'score']))
+            if exact:
+                self.assertEqual(len(res), 200)
+                self.assertEqual(len(res[res.qid=='0']), 100)
+                self.assertEqual(len(res[res.qid=='1']), 100)
+                self.assertEqual(res[(res.qid=='0')&((res['rank']==0))].iloc[0]['docno'], '0')
+                self.assertEqual(res[(res.qid=='1')&((res['rank']==0))].iloc[0]['docno'], '1')
+            else:
+                self.assertTrue(len(res) <= 200)
+                self.assertTrue(len(res[res.qid=='0']) <= 100)
+                self.assertTrue(len(res[res.qid=='1']) <= 100)
+
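# Aside (illustration, not part of the diff): _test_retr receives an unbound
# FlexIndex factory method, so Retr(index) is equivalent to the user-facing
# instance call:
#   retr = FlexIndex.np_retriever(index)  # what the harness does
#   retr = index.np_retriever()           # equivalent instance call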
+    @unittest.skipIf(not pyterrier_dr.util.faiss_available(), "faiss not available")
+    def test_faiss_flat_retriever(self):
+        self._test_retr(FlexIndex.faiss_flat_retriever)
+
+    @unittest.skipIf(not pyterrier_dr.util.faiss_available(), "faiss not available")
+    def test_faiss_hnsw_retriever(self):
+        self._test_retr(FlexIndex.faiss_hnsw_retriever, exact=False)
+
+    @unittest.skipIf(not pyterrier_dr.util.faiss_available(), "faiss not available")
+    def test_faiss_ivf_retriever(self):
+        self._test_retr(FlexIndex.faiss_ivf_retriever, exact=False)
+
+    @unittest.skipIf(not pyterrier_dr.util.scann_available(), "scann not available")
+    def test_scann_retriever(self):
+        self._test_retr(FlexIndex.scann_retriever, exact=False)
+
+    def test_np_retriever(self):
+        self._test_retr(FlexIndex.np_retriever)
+
+    def test_torch_retriever(self):
+        self._test_retr(FlexIndex.torch_retriever)
+
+    def test_np_vec_loader(self):
+        destdir = tempfile.mkdtemp()
+        self.test_dirs.append(destdir)
+        index = FlexIndex(destdir+'/index')
+        dataset = self._generate_data()
+        index.index(dataset)
+
+        vec_loader = index.np_vec_loader()
+        with self.subTest('docid'):
+            res = vec_loader(pd.DataFrame({
+                'docid': [5, 1, 100, 198],
+            }))
+            self.assertTrue(all(c in res.columns for c in ['docid', 'doc_vec']))
+            self.assertEqual(len(res), 4)
+            self.assertTrue((res.iloc[0]['doc_vec'] == dataset[5]['doc_vec']).all())
+            self.assertTrue((res.iloc[1]['doc_vec'] == dataset[1]['doc_vec']).all())
+            self.assertTrue((res.iloc[2]['doc_vec'] == dataset[100]['doc_vec']).all())
+            self.assertTrue((res.iloc[3]['doc_vec'] == dataset[198]['doc_vec']).all())
+        with self.subTest('docno'):
+            res = vec_loader(pd.DataFrame({
+                'docno': ['20', '0', '100', '198'],
+                'query': 'ignored',
+            }))
+            self.assertTrue(all(c in res.columns for c in ['docno', 'doc_vec', 'query']))
+            self.assertEqual(len(res), 4)
+            self.assertTrue((res.iloc[0]['doc_vec'] == dataset[20]['doc_vec']).all())
+            self.assertTrue((res.iloc[1]['doc_vec'] == dataset[0]['doc_vec']).all())
+            self.assertTrue((res.iloc[2]['doc_vec'] == dataset[100]['doc_vec']).all())
+            self.assertTrue((res.iloc[3]['doc_vec'] == dataset[198]['doc_vec']).all())
+
+    def _test_reranker(self, Reranker, expands=False):
+        destdir = tempfile.mkdtemp()
+        self.test_dirs.append(destdir)
+        index = FlexIndex(destdir+'/index')
+        dataset = self._generate_data(count=2000)
+        index.index(dataset)
+
+        retr = Reranker(index)
+        res = retr(pd.DataFrame([
+            {'qid': '0', 'query_vec': dataset[0]['doc_vec'], 'docno': '0', 'score': -100.},
+            {'qid': '0', 'query_vec': dataset[0]['doc_vec'], 'docno': '50', 'score': -100.},
+            {'qid': '1', 'query_vec': dataset[1]['doc_vec'], 'docno': '100', 'score': -100.},
+            {'qid': '1', 'query_vec': dataset[1]['doc_vec'], 'docno': '0', 'score': -100.},
+            {'qid': '1', 'query_vec': dataset[1]['doc_vec'], 'docno': '40', 'score': -100.},
+        ]))
+        self.assertTrue(all(c in res.columns for c in ['qid', 'docno', 'rank', 'score']))
+        if expands:
+            self.assertTrue(len(res) >= 5)
+        else:
+            self.assertEqual(len(res), 5)
+        self.assertEqual(res[(res.qid=='0')&((res['rank']==0))].iloc[0]['docno'], '0')
+        self.assertEqual(len(res[res.score==-100.]), 0) # all scores re-assigned
+
+    def test_np_scorer(self):
+        self._test_reranker(FlexIndex.np_scorer)
+
+    def test_torch_scorer(self):
+        self._test_reranker(FlexIndex.torch_scorer)
+
+    def test_pre_ladr(self):
+        self._test_reranker(FlexIndex.pre_ladr, expands=True)
+
+    def test_ada_ladr(self):
+        self._test_reranker(FlexIndex.ada_ladr, expands=True)
+
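# Aside (illustration, not part of the diff): the scorers above re-score an
# existing candidate set, e.g. in a pipeline such as
#   pipeline = first_stage >> query_encoder >> index.torch_scorer()
# where first_stage and query_encoder are assumed names; pre_ladr/ada_ladr/gar
# may pull in neighbours from the corpus graph, so results can grow (expands=True).
+    def test_gar(self):
+        self._test_reranker(FlexIndex.gar, 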
expands=True) + + def test_torch_vecs(self): + destdir = tempfile.mkdtemp() + self.test_dirs.append(destdir) + index = FlexIndex(destdir+'/index') + dataset = self._generate_data() + index.index(dataset) + + torch_vecs = index.torch_vecs() + self.assertEqual(torch_vecs.shape, (2000, 100)) + + def setUp(self): + import pyterrier as pt + if not pt.started(): + pt.init() + self.test_dirs = [] + + def tearDown(self): + import shutil + for d in self.test_dirs: + try: + shutil.rmtree(d) + except: + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_models.py b/tests/test_models.py new file mode 100644 index 0000000..58156b9 --- /dev/null +++ b/tests/test_models.py @@ -0,0 +1,148 @@ +import unittest +import tempfile +import itertools +import numpy as np +import pandas as pd +import pyterrier as pt + + +class TestModels(unittest.TestCase): + + def _base_test(self, model, test_query_encoder=True, test_doc_encoder=True, test_scorer=True, test_indexer=True, test_retriever=True): + dataset = pt.get_dataset('irds:vaswani') + + docs = list(itertools.islice(pt.get_dataset('irds:vaswani').get_corpus_iter(), 200)) + docs_df = pd.DataFrame(docs) + + if test_query_encoder: + with self.subTest('query_encoder'): + topics = dataset.get_topics() + enc_topics = model(topics) + self.assertEqual(len(enc_topics), len(topics)) + self.assertTrue('query_vec' in enc_topics.columns) + self.assertTrue(all(c in enc_topics.columns for c in topics.columns)) + self.assertEqual(enc_topics.query_vec.dtype, object) + self.assertEqual(enc_topics.query_vec[0].dtype, np.float32) + self.assertTrue(all(enc_topics.query_vec[0].shape == v.shape for v in enc_topics.query_vec)) + + with self.subTest('query_encoder empty'): + enc_topics_empty = model(pd.DataFrame(columns=['qid', 'query'])) + self.assertEqual(len(enc_topics_empty), 0) + self.assertTrue('query_vec' in enc_topics.columns) + + if test_doc_encoder: + with self.subTest('doc_encoder'): + enc_docs = model(pd.DataFrame(docs_df)) + self.assertEqual(len(enc_docs), len(docs_df)) + self.assertTrue('doc_vec' in enc_docs.columns) + self.assertTrue(all(c in enc_docs.columns for c in docs_df.columns)) + self.assertEqual(enc_docs.doc_vec.dtype, object) + self.assertEqual(enc_docs.doc_vec[0].dtype, np.float32) + self.assertTrue(all(enc_docs.doc_vec[0].shape == v.shape for v in enc_docs.doc_vec)) + + with self.subTest('doc_encoder empty'): + enc_docs_empty = model(pd.DataFrame(columns=['docno', 'text'])) + self.assertEqual(len(enc_docs_empty), 0) + self.assertTrue('doc_vec' in enc_docs_empty.columns) + + if test_scorer: + with self.subTest('scorer_qtext_dtext'): + res_qtext_dtext = topics.head(2).merge(docs_df, how='cross') + scored_res_qtext_dtext = model(res_qtext_dtext) + self.assertTrue('score' in scored_res_qtext_dtext.columns) + self.assertTrue('rank' in scored_res_qtext_dtext.columns) + self.assertTrue(all(c in scored_res_qtext_dtext.columns for c in res_qtext_dtext.columns)) + + with self.subTest('scorer_qvec_dtext'): + res_qvec_dtext = enc_topics.drop(columns=['query']).head(2).merge(docs_df, how='cross') + scored_res_qvec_dtext = model(res_qvec_dtext) + self.assertTrue('score' in scored_res_qvec_dtext.columns) + self.assertTrue('rank' in scored_res_qvec_dtext.columns) + self.assertTrue(all(c in scored_res_qvec_dtext.columns for c in res_qvec_dtext.columns)) + + with self.subTest('scorer_qtext_dvec'): + res_qtext_dvec = topics.head(2).merge(enc_docs.drop(columns=['text']), how='cross') + scored_res_qtext_dvec = model(res_qtext_dvec) + 
self.assertTrue('score' in scored_res_qtext_dvec.columns) + self.assertTrue('rank' in scored_res_qtext_dvec.columns) + self.assertTrue(all(c in scored_res_qtext_dvec.columns for c in res_qtext_dvec.columns)) + + with self.subTest('scorer_qvec_dvec'): + res_qvec_dvec = enc_topics.drop(columns=['query']).head(2).merge(enc_docs.drop(columns=['text']), how='cross') + scored_res_qvec_dvec = model(res_qvec_dvec) + self.assertTrue('score' in scored_res_qvec_dvec.columns) + self.assertTrue('rank' in scored_res_qvec_dvec.columns) + self.assertTrue(all(c in scored_res_qvec_dvec.columns for c in res_qvec_dvec.columns)) + + with self.subTest('scorer empty'): + enc_res_empty = model(pd.DataFrame(columns=['qid', 'query', 'docno', 'text'])) + self.assertEqual(len(enc_res_empty), 0) + self.assertTrue('score' in enc_res_empty.columns) + self.assertTrue('rank' in enc_res_empty.columns) + + if test_indexer: + with self.subTest('indexer'): + # Make sure this model can index properly + # More extensive testing of FlexIndex is done in test_flexindex + from pyterrier_dr import FlexIndex + destdir = tempfile.mkdtemp() + self.test_dirs.append(destdir) + index = FlexIndex(destdir+'/index') + pipeline = model >> index + pipeline.index(docs) + self.assertTrue(index.built()) + self.assertEqual(len(index), len(docs)) + + if test_retriever: + with self.subTest('retriever'): + assert test_indexer, "test_retriever requires test_indexer" + # Make sure this model can retrieve properly + # More extensive testing of FlexIndex is done in test_flexindex + retr_res = pipeline(dataset.get_topics()) + self.assertTrue('qid' in retr_res.columns) + self.assertTrue('query' in retr_res.columns) + self.assertTrue('docno' in retr_res.columns) + self.assertTrue('score' in retr_res.columns) + self.assertTrue('rank' in retr_res.columns) + + def test_tct(self): + from pyterrier_dr import TctColBert + self._base_test(TctColBert()) + + def test_ance(self): + from pyterrier_dr import Ance + self._base_test(Ance.firstp()) + + def test_tasb(self): + from pyterrier_dr import TasB + self._base_test(TasB.dot()) + + def test_retromae(self): + from pyterrier_dr import RetroMAE + self._base_test(RetroMAE.msmarco_finetune()) + + def test_gtr(self): + from pyterrier_dr import GTR + self._base_test(GTR.base()) + + def test_query2query(self): + from pyterrier_dr import Query2Query + self._base_test(Query2Query(), test_doc_encoder=False, test_scorer=False, test_indexer=False, test_retriever=False) + + def setUp(self): + import pyterrier as pt + if not pt.started(): + pt.init() + self.test_dirs = [] + + def tearDown(self): + import shutil + for d in self.test_dirs: + try: + shutil.rmtree(d) + except: + pass + + +if __name__ == '__main__': + unittest.main()
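
To run the new suite locally, mirroring the GitHub Action above (the faiss and scann tests skip themselves when those optional packages are absent):

    pip install --upgrade --upgrade-strategy eager -r requirements.txt -r requirements-dev.txt
    conda install -c pytorch faiss-cpu=1.7.4 mkl=2021 blas=1.0=mkl  # optional, enables the faiss tests
    pytest -s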