From 6909587f66dfb871a36379d88ede4d26fadcfe1c Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Sun, 24 Nov 2024 10:51:31 +0000 Subject: [PATCH] Project Modernization (#27) * DS_Store in gitignore * license * pyproject.toml instead of setup.py * pt_docs * test and deploy actions * coverage target * faiss as dev dependency * style check * ruff fixes * fix ruff errors in indexes.py * dev dependencies --- .github/workflows/{release.yml => deploy.yml} | 0 .github/workflows/push.yml | 51 --------------- .github/workflows/style.yml | 36 +++++++++++ .github/workflows/test.yml | 62 +++++++++++++++++++ .gitignore | 2 + LICENSE | 21 +++++++ MANIFEST.in | 2 +- pyproject.toml | 43 +++++++++++++ pyterrier_dr/__init__.py | 25 +++++--- pyterrier_dr/bge_m3.py | 2 +- pyterrier_dr/flex/__init__.py | 20 +++--- pyterrier_dr/indexes.py | 13 ++-- pyterrier_dr/pt_docs/index.rst | 36 +++++++++++ requirements-dev.txt | 4 ++ setup.py | 46 -------------- 15 files changed, 238 insertions(+), 125 deletions(-) rename .github/workflows/{release.yml => deploy.yml} (100%) delete mode 100644 .github/workflows/push.yml create mode 100644 .github/workflows/style.yml create mode 100644 .github/workflows/test.yml create mode 100644 LICENSE create mode 100644 pyproject.toml create mode 100644 pyterrier_dr/pt_docs/index.rst delete mode 100644 setup.py diff --git a/.github/workflows/release.yml b/.github/workflows/deploy.yml similarity index 100% rename from .github/workflows/release.yml rename to .github/workflows/deploy.yml diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml deleted file mode 100644 index e7b35b2..0000000 --- a/.github/workflows/push.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: Test Python package - -on: [push, pull_request] - -jobs: - build: - - strategy: - matrix: - python: [3.8] - java: [13] - os: ['ubuntu-latest'] # - architecture: ['x64'] - terrier: ['snapshot'] #'5.3', '5.4-SNAPSHOT', - - runs-on: ${{ matrix.os }} - steps: - - - uses: actions/checkout@v3 - - - name: Setup java - uses: actions/setup-java@v3 - with: - java-version: ${{ matrix.java }} - architecture: ${{ matrix.architecture }} - distribution: 'zulu' - - - name: Setup conda - uses: s-weigand/setup-conda@v1 - with: - python-version: ${{ matrix.python }} - conda-channels: anaconda, conda-forge - activate-conda: true - - # follows https://medium.com/ai2-blog/python-caching-in-github-actions-e9452698e98d - - name: Loading Torch models from cache - uses: actions/cache@v3 - with: - path: /home/runner/.cache/ - key: model-cache - - - name: Install Python dependencies - run: | - pip install --upgrade --upgrade-strategy eager -r requirements.txt -r requirements-dev.txt - conda install -c pytorch faiss-cpu=1.7.4 mkl=2021 blas=1.0=mkl - - - name: All unit tests - env: - TERRIER_VERSION: ${{ matrix.terrier }} - run: | - pytest -s diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml new file mode 100644 index 0000000..77fed21 --- /dev/null +++ b/.github/workflows/style.yml @@ -0,0 +1,36 @@ +name: style + +on: + push: {branches: [master]} # pushes to master + pull_request: {} # all PRs + +jobs: + ruff: + strategy: + matrix: + python-version: ['3.10'] + os: ['ubuntu-latest'] + + runs-on: ${{ matrix.os }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache Dependencies + uses: actions/cache@v4 + with: + path: ${{ env.pythonLocation }} + key: ${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }} + + - name: Install Dependencies + run: | + pip install --upgrade -r requirements-dev.txt + pip install -e . + + - name: Ruff + run: 'ruff check --output-format=github pyterrier_dr' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..f055661 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,62 @@ +name: test + +on: + push: {branches: [master]} # pushes to master + pull_request: {} # all PRs + schedule: [cron: '0 12 * * 3'] # every Wednesday at noon + +jobs: + pytest: + strategy: + matrix: + os: ['ubuntu-latest'] + python-version: ['3.8', '3.12'] + + runs-on: ${{ matrix.os }} + env: + runtag: ${{ matrix.os }}-${{ matrix.python-version }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache Dependencies + uses: actions/cache@v4 + with: + path: ${{ env.pythonLocation }} + key: ${{ env.runtag }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }} + + - name: Loading Torch models from cache + uses: actions/cache@v3 + with: + path: /home/runner/.cache/ + key: model-cache + + - name: Install Dependencies + run: | + pip install --upgrade -r requirements.txt -r requirements-dev.txt + pip install -e . + + - name: Unit Test + run: | + pytest --durations=20 -p no:faulthandler --json-report --json-report-file ${{ env.runtag }}.results.json --cov pyterrier_dr --cov-report json:${{ env.runtag }}.coverage.json tests/ + + - name: Upload Test Results + if: always() + uses: actions/upload-artifact@v4 + with: + path: ${{ env.runtag }}.*.json + overwrite: true + + - name: Report Test Results + if: always() + run: | + printf "**Test Results**\n\n" >> $GITHUB_STEP_SUMMARY + jq '.summary' ${{ env.runtag }}.results.json >> $GITHUB_STEP_SUMMARY + printf "\n\n**Test Coverage**\n\n" >> $GITHUB_STEP_SUMMARY + jq '.files | to_entries[] | " - `" + .key + "`: **" + .value.summary.percent_covered_display + "%**"' -r ${{ env.runtag }}.coverage.json >> $GITHUB_STEP_SUMMARY diff --git a/.gitignore b/.gitignore index 0741e16..c03c8a3 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,5 @@ dmypy.json # Pyre type checker .pyre/ + +.DS_Store diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d6d2fd4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024, Sean MacAvaney + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in index f9bd145..d51e072 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1 @@ -include requirements.txt +recursive-include pyterrier_dr *.rst diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..cba3421 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,43 @@ +[build-system] +requires = ["setuptools >= 61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "pyterrier-dr" +description = "Dense Retrieval for PyTerrier" +requires-python = ">=3.8" +authors = [ + {name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk"}, +] +maintainers = [ + {name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk"}, +] +readme = "README.rst" +classifiers = [ + "Programming Language :: Python", + "Operating System :: OS Independent", + "Topic :: Text Processing", + "Topic :: Text Processing :: Indexing", + "License :: OSI Approved :: MIT License", +] +dynamic = ["version", "dependencies"] + +[tool.setuptools.dynamic] +version = {attr = "pyterrier_dr.__version__"} +dependencies = {file = ["requirements.txt"]} + +[project.optional-dependencies] +bgem3 = [ + "FlagEmbedding", +] + +[tool.setuptools.packages.find] +exclude = ["tests"] + +[project.urls] +Repository = "https://github.com/terrierteam/pyterrier_dr" +"Bug Tracker" = "https://github.com/terrierteam/pyterrier_dr/issues" + +[project.entry-points."pyterrier.artifact"] +"dense_index.flex" = "pyterrier_dr:FlexIndex" +"cde_cache.np_pickle" = "pyterrier_dr:CDECache" diff --git a/pyterrier_dr/__init__.py b/pyterrier_dr/__init__.py index 2d053c4..1a02d5a 100644 --- a/pyterrier_dr/__init__.py +++ b/pyterrier_dr/__init__.py @@ -1,12 +1,17 @@ __version__ = '0.2.0' -from .util import SimFn, infer_device -from .indexes import DocnoFile, NilIndex, NumpyIndex, RankedLists, FaissFlat, FaissHnsw, MemIndex, TorchIndex -from .flex import FlexIndex -from .biencoder import BiEncoder, BiQueryEncoder, BiDocEncoder, BiScorer -from .hgf_models import HgfBiEncoder, TasB, RetroMAE -from .sbert_models import SBertBiEncoder, Ance, Query2Query, GTR -from .tctcolbert_model import TctColBert -from .electra import ElectraScorer -from .bge_m3 import BGEM3, BGEM3QueryEncoder, BGEM3DocEncoder -from .cde import CDE, CDECache +from pyterrier_dr.util import SimFn, infer_device +from pyterrier_dr.indexes import DocnoFile, NilIndex, NumpyIndex, RankedLists, FaissFlat, FaissHnsw, MemIndex, TorchIndex +from pyterrier_dr.flex import FlexIndex +from pyterrier_dr.biencoder import BiEncoder, BiQueryEncoder, BiDocEncoder, BiScorer +from pyterrier_dr.hgf_models import HgfBiEncoder, TasB, RetroMAE +from pyterrier_dr.sbert_models import SBertBiEncoder, Ance, Query2Query, GTR +from pyterrier_dr.tctcolbert_model import TctColBert +from pyterrier_dr.electra import ElectraScorer +from pyterrier_dr.bge_m3 import BGEM3, BGEM3QueryEncoder, BGEM3DocEncoder +from pyterrier_dr.cde import CDE, CDECache + +__all__ = ["FlexIndex", "DocnoFile", "NilIndex", "NumpyIndex", "RankedLists", "FaissFlat", "FaissHnsw", "MemIndex", "TorchIndex", + "BiEncoder", "BiQueryEncoder", "BiDocEncoder", "BiScorer", "HgfBiEncoder", "TasB", "RetroMAE", "SBertBiEncoder", "Ance", + "Query2Query", "GTR", "TctColBert", "ElectraScorer", "BGEM3", "BGEM3QueryEncoder", "BGEM3DocEncoder", "CDE", "CDECache", + "SimFn", "infer_device"] diff --git a/pyterrier_dr/bge_m3.py b/pyterrier_dr/bge_m3.py index 10e32a0..780f36f 100644 --- a/pyterrier_dr/bge_m3.py +++ b/pyterrier_dr/bge_m3.py @@ -16,7 +16,7 @@ def __init__(self, model_name='BAAI/bge-m3', batch_size=32, max_length=8192, tex self.device = torch.device(device) try: from FlagEmbedding import BGEM3FlagModel - except ImportError as e: + except ImportError: raise ImportError("BGE-M3 requires the FlagEmbedding package. You can install it using 'pip install pyterrier-dr[bgem3]'") self.model = BGEM3FlagModel(self.model_name, use_fp16=self.use_fp16, device=self.device) diff --git a/pyterrier_dr/flex/__init__.py b/pyterrier_dr/flex/__init__.py index 6d6b7a3..88e79e3 100644 --- a/pyterrier_dr/flex/__init__.py +++ b/pyterrier_dr/flex/__init__.py @@ -1,9 +1,11 @@ -from .core import FlexIndex, IndexingMode -from .np_retr import * -from .torch_retr import * -from .corpus_graph import * -from .faiss_retr import * -from .scann_retr import * -from .ladr import * -from .gar import * -from .voyager_retr import * +from pyterrier_dr.flex.core import FlexIndex, IndexingMode +from pyterrier_dr.flex import np_retr +from pyterrier_dr.flex import torch_retr +from pyterrier_dr.flex import corpus_graph +from pyterrier_dr.flex import faiss_retr +from pyterrier_dr.flex import scann_retr +from pyterrier_dr.flex import ladr +from pyterrier_dr.flex import gar +from pyterrier_dr.flex import voyager_retr + +__all__ = ["FlexIndex", "IndexingMode", "np_retr", "torch_retr", "corpus_graph", "faiss_retr", "scann_retr", "ladr", "gar", "voyager_retr"] diff --git a/pyterrier_dr/indexes.py b/pyterrier_dr/indexes.py index 56e8488..2607626 100644 --- a/pyterrier_dr/indexes.py +++ b/pyterrier_dr/indexes.py @@ -1,3 +1,4 @@ +# Deprecated module import torch import itertools import math @@ -261,7 +262,7 @@ def index(self, inp): fout.write(doc_vecs.tobytes()) docnos.extend([d['docno'] for d in docs]) count += len(docs) - DocnoFile.build(docnos, path/f'docnos.npy') + DocnoFile.build(docnos, path/'docnos.npy') with open(path/'meta.json', 'wt') as f_meta: json.dump({'dtype': self.dtype, 'vec_size': vec_size, 'count': count}, f_meta) @@ -468,8 +469,7 @@ def transform(self, inp): query_vecs = query_vecs / np.linalg.norm(query_vecs, axis=1, keepdims=True) query_vecs = query_vecs.copy() res = [] - query_heaps = [[] for _ in range(query_vecs.shape[0])] - docnos = DocnoFile(self.index_path/f'docnos.npy') + docnos = DocnoFile(self.index_path/'docnos.npy') num_q = query_vecs.shape[0] ranked_lists = RankedLists(self.num_results, num_q) dids_offset = 0 @@ -526,7 +526,7 @@ def index(self, inp): index.add(doc_vecs) docnos.extend(d['docno'] for d in batch) faiss.write_index(index, str(path/f'{shardid}.faiss')) - DocnoFile.build(docnos, path/f'docnos.npy') + DocnoFile.build(docnos, path/'docnos.npy') class FaissHnsw(pt.Indexer): @@ -568,8 +568,7 @@ def transform(self, inp): query_vecs = query_vecs / np.linalg.norm(query_vecs, axis=1, keepdims=True) query_vecs = query_vecs.copy() res = [] - query_heaps = [[] for _ in range(query_vecs.shape[0])] - docnos = DocnoFile(self.index_path/f'docnos.npy') + docnos = DocnoFile(self.index_path/'docnos.npy') num_q = query_vecs.shape[0] ranked_lists = RankedLists(self.num_results, num_q) dids_offset = 0 @@ -629,7 +628,7 @@ def index(self, inp): index.add(doc_vecs) docnos.extend(d['docno'] for d in batch) faiss.write_index(index, str(path/f'{shardid}.faiss')) - DocnoFile.build(docnos, path/f'docnos.npy') + DocnoFile.build(docnos, path/'docnos.npy') diff --git a/pyterrier_dr/pt_docs/index.rst b/pyterrier_dr/pt_docs/index.rst new file mode 100644 index 0000000..427a6d5 --- /dev/null +++ b/pyterrier_dr/pt_docs/index.rst @@ -0,0 +1,36 @@ +Dense Retrieval for PyTerrier +======================================================= + +Features to support Dense Retrieval in `PyTerrier `__. + +.. rubric:: Getting Started + +.. code-block:: console + :caption: Install ``pyterrier-dr`` with ``pip`` + + $ pip install pyterrier-dr + +Import ``pyterrier_dr``, load a pre-built index and model, and retrieve: + +.. code-block:: python + :caption: Basic example of using ``pyterrier_dr`` + + >>> from pyterrier_dr import FlexIndex, TasB + + >>> index = FlexIndex.from_hf('macavaney/vaswani.tasb.flex') + >>> model = TasB('sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco') + >>> pipeline = model.query_encoder() >> index.np_retriever() + >>> pipeline.search('chemical reactions') + + score docno docid rank qid query + 0 95.841721 7049 7048 0 1 chemical reactions + 1 94.669395 9374 9373 1 1 chemical reactions + 2 93.520027 3101 3100 2 1 chemical reactions + 3 92.809227 6480 6479 3 1 chemical reactions + 4 92.376190 3452 3451 4 1 chemical reactions + .. ... ... ... ... .. ... + 995 82.554390 7701 7700 995 1 chemical reactions + 996 82.552139 1553 1552 996 1 chemical reactions + 997 82.551933 10064 10063 997 1 chemical reactions + 998 82.546890 4417 4416 998 1 chemical reactions + 999 82.545776 7120 7119 999 1 chemical reactions diff --git a/requirements-dev.txt b/requirements-dev.txt index ae6b446..7350e9c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,9 @@ pytest pytest-subtests +pytest-cov +pytest-json-report git+https://github.com/terrierteam/pyterrier_adaptive voyager FlagEmbedding +faiss-cpu +ruff diff --git a/setup.py b/setup.py deleted file mode 100644 index ff35d0e..0000000 --- a/setup.py +++ /dev/null @@ -1,46 +0,0 @@ -import setuptools - -requirements = [] -with open('requirements.txt', 'rt') as f: - for req in f.read().splitlines(): - if req.startswith('git+'): - pkg_name = req.split('/')[-1].replace('.git', '') - if "#egg=" in pkg_name: - pkg_name = pkg_name.split("#egg=")[1] - requirements.append(f'{pkg_name} @ {req}') - else: - requirements.append(req) - -with open("README.md", "r") as fh: - long_description = fh.read() - -def get_version(rel_path): - for line in open(rel_path): - if line.startswith('__version__'): - delim = '"' if '"' in line else "'" - return line.split(delim)[1] - else: - raise RuntimeError("Unable to find version string.") - -setuptools.setup( - name="pyterrier-dr", - version=get_version('pyterrier_dr/__init__.py'), - author="Sean MacAvaney", - author_email='sean.macavaney@glasgow.ac.uk', - description="PyTerrier components for dense retrieval", - long_description=long_description, - url='https://github.com/terrierteam/pyterrier_dr', - long_description_content_type="text/markdown", - packages=setuptools.find_packages(), - install_requires=requirements, - extras_require={ - 'bgem3': ['FlagEmbedding'], - }, - python_requires='>=3.6', - entry_points={ - 'pyterrier.artifact': [ - 'dense_index.flex = pyterrier_dr:FlexIndex', - 'cde_cache.np_pickle = pyterrier_dr:CDECache', - ], - }, -)