diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index ca59cd3..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: Test Python package - -on: - push: - pull_request: - workflow_dispatch: - inputs: - note: - description: "note" - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - matrix: - python-version: ['3.8', '3.11'] - name: [ubuntu-gcc-9] - java: [13] - architecture: ['x64'] - include: - - name: ubuntu-gcc-9 - os: ubuntu-latest - compiler: "gcc" - version: "9" - - steps: - - - uses: actions/checkout@v2 - with: - submodules: recursive - - - name: Setup java - uses: actions/setup-java@v1 - with: - java-version: ${{ matrix.java }} - architecture: ${{ matrix.architecture }} - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 - with: - python-version: ${{ matrix.python-version }} - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip pytest - pip install pyterrier_pisa torch transformers --extra-index-url https://download.pytorch.org/whl/cpu - pip install --upgrade --upgrade-strategy eager -r requirements.txt - - - name: Run tests - run: | - pytest tests/ - diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml new file mode 100644 index 0000000..f8d0799 --- /dev/null +++ b/.github/workflows/style.yml @@ -0,0 +1,31 @@ +name: style + +on: + push: {branches: [main]} # pushes to main + pull_request: {} # all PRs + +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Cache Dependencies + uses: actions/cache@v4 + with: + path: ${{ env.pythonLocation }} + key: ${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }} + + - name: Install Dependencies + run: | + pip install --upgrade -r requirements-dev.txt + pip install -e . 
+ + - name: Ruff + run: 'ruff check --output-format=github pyterrier_adaptive' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..f84cfde --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,49 @@ +name: test + +on: + push: {branches: [main]} # pushes to main + pull_request: {} # all PRs + schedule: [cron: '0 12 * * 3'] # every Wednesday at noon + +jobs: + pytest: + strategy: + matrix: + os: ['ubuntu-latest'] + python-version: ['3.8', '3.11'] + + runs-on: ${{ matrix.os }} + env: + runtag: ${{ matrix.os }}-${{ matrix.python-version }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache Dependencies + uses: actions/cache@v4 + with: + path: ${{ env.pythonLocation }} + key: ${{ env.runtag }}-${{ hashFiles('requirements.txt', 'requirements-dev.txt') }} + + - name: Install Dependencies + run: | + pip install --upgrade -r requirements.txt -r requirements-dev.txt + pip install -e . 
+ + - name: Unit Test + run: | + pytest --durations=20 -p no:faulthandler --json-report --json-report-file ${{ env.runtag }}.results.json --cov pyterrier_adaptive --cov-report json:${{ env.runtag }}.coverage.json tests/ + + - name: Report Test Results + if: always() + run: | + printf "**Test Results**\n\n" >> $GITHUB_STEP_SUMMARY + jq '.summary' ${{ env.runtag }}.results.json >> $GITHUB_STEP_SUMMARY + printf "\n\n**Test Coverage**\n\n" >> $GITHUB_STEP_SUMMARY + jq '.files | to_entries[] | " - `" + .key + "`: **" + .value.summary.percent_covered_display + "%**"' -r ${{ env.runtag }}.coverage.json >> $GITHUB_STEP_SUMMARY diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d6d2fd4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024, Sean MacAvaney + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/MANIFEST.in b/MANIFEST.in index 8afbefe..089bcd2 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1 @@ -include README.md -include requirements.txt +recursive-include pyterrier_adaptive *.rst diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..5bc5ec5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,40 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "pyterrier-adaptive" +description = "PyTerrier implementation of Adaptive Re-Ranking using a Corpus Graph (CIKM 2022)" +requires-python = ">=3.8" +authors = [ + { name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk" } +] +maintainers = [ + {name = "Sean MacAvaney", email = "sean.macavaney@glasgow.ac.uk"}, +] +readme = "README.md" +classifiers = [ + "Programming Language :: Python", + "Operating System :: OS Independent", + "Topic :: Text Processing", + "Topic :: Text Processing :: Indexing", + "License :: OSI Approved :: MIT License", +] +dynamic = ["version", "dependencies"] + +[tool.setuptools.dynamic] +version = {attr = "pyterrier_adaptive.__version__"} +dependencies = {file = ["requirements.txt"]} + +[project.optional-dependencies] +laff = ["transformers"] + +[tool.setuptools.packages.find] +exclude = ["tests"] + +[project.urls] +Repository = "https://github.com/terrierteam/pyterrier_adaptive" +"Bug Tracker" = "https://github.com/terrierteam/pyterrier_adaptive/issues" + +[project.entry-points."pyterrier.artifact"] +"corpus_graph.np_topk" = "pyterrier_adaptive.corpus_graph:NpTopKCorpusGraph" diff --git a/pyterrier_adaptive/__init__.py b/pyterrier_adaptive/__init__.py index 2a80ae5..6e2b9d3 100644 --- a/pyterrier_adaptive/__init__.py +++ b/pyterrier_adaptive/__init__.py @@ -3,3 +3,5 @@ from .gar import GAR from .corpus_graph import CorpusGraph, NpTopKCorpusGraph from pyterrier_adaptive._laff import Laff + +__all__ = ['GAR', 'CorpusGraph', 'NpTopKCorpusGraph', 'Laff'] diff --git 
a/pyterrier_adaptive/_laff.py b/pyterrier_adaptive/_laff.py index 898fb6f..7093a80 100644 --- a/pyterrier_adaptive/_laff.py +++ b/pyterrier_adaptive/_laff.py @@ -8,11 +8,14 @@ from more_itertools import chunked import pyterrier as pt import pyterrier_alpha as pta -from pyterrier_adaptive import NpTopKCorpusGraph +from pyterrier_adaptive import CorpusGraph, NpTopKCorpusGraph class Laff(pt.Transformer): - """A transformer that computes a learned affinity score between two document texts using a transformer model. """ + """A transformer that computes a learned affinity score between two document texts using a transformer model. + + .. cite.dblp:: journals/corr/abs-2410-20286 + """ def __init__(self, model: str = 'macavaney/laff', @@ -22,8 +25,7 @@ def __init__(self, max_length: int = 512, verbose: bool = False ): - """ Initialize the LAFF transformer. - + """ Args: model: the name of the transformer model to use. device: the device to use for the transformer model. @@ -50,15 +52,30 @@ def compute_affinity(self, texts_left: the left-hand side texts. texts_right: the right-hand side texts. + If either the left or right text is a string (or length-1 list), it is broadcast to the + length of the other input (akin to numpy or torch broadcasting). + + A higher affinity score indicates the documents are more similar to one another. + + .. code-block:: python + :caption: Compute the Learned Affinity (LAFF) score between documents. + + >>> from pyterrier_adaptive import Laff + >>> model = Laff() + >>> model.compute_affinity('the cat sat on the mat', ['cats like to sit in the sun', 'dogs like to play fetch']) + [5.46875, -3.140625] + Returns: A list of affinity scores. 
""" - if isinstance(texts_left, str) and isinstance(texts_right, str): - return self.compute_affinity([texts_left], [texts_right])[0] - elif isinstance(texts_left, str): - texts_left = [texts_left] * len(texts_right) - elif isinstance(texts_right, str): - texts_right = [texts_right] * len(texts_left) + if isinstance(texts_left, str): + texts_left = [texts_left] + if isinstance(texts_right, str): + texts_right = [texts_right] + if len(texts_left) == 1: + texts_left = texts_left * len(texts_right) + elif len(texts_right) == 1: + texts_right = texts_right * len(texts_left) assert len(texts_left) == len(texts_right) affinity_scores = [] @@ -99,6 +116,21 @@ def transform(self, inp: pd.DataFrame) -> pd.DataFrame: res.sort_values(['text', 'affinity'], ascending=[True, False], inplace=True) return res + def wrap_graph(self, + graph: CorpusGraph, + text_loader: pt.Transformer, + ) -> 'OnTheFlyLaffGraph': + """Wrap a corpus graph with the LAFF transformer for on-the-fly LAFF score computation. + + Args: + graph: the input corpus graph. + text_loader: a transformer that loads the text for a given document. + + Returns: + A corpus graph that computes LAFF scores on-the-fly. 
+ """ + return OnTheFlyLaffGraph(graph, self, text_loader) + def apply_to_graph(self, graph: NpTopKCorpusGraph, text_loader: pt.Transformer, @@ -143,3 +175,37 @@ def apply_to_graph(self, fw.write(weights.tobytes()) return NpTopKCorpusGraph(out_path) + + +class OnTheFlyLaffGraph: + def __init__(self, graph: CorpusGraph, laff: Laff, text_loader: pt.Transformer): + self.graph = graph + self.laff = laff + self.text_loader = text_loader + + def neighbours(self, docid: str, weights: bool = False, orig_weights: bool = False): + orig_neighbors, orig_weights_value = self.graph.neighbours(docid, weights=True) + orig_count = len(orig_neighbors) + orig_weights_value = [w for w, n in zip(orig_weights_value, orig_neighbors) if n != docid] + orig_neighbors = [n for n in orig_neighbors if n != docid] + left_text, *right_texts = self.text_loader(pd.DataFrame({'docno': [docid] + orig_neighbors}))['text'] + affinity_scores = self.laff.compute_affinity(left_text, right_texts) + affinity_scores = np.array(affinity_scores) + sort_order = (-affinity_scores).argsort() + new_neighbors = [orig_neighbors[i] for i in sort_order] + if len(new_neighbors) < orig_count: + new_neighbors += [docid] * (orig_count - len(new_neighbors)) + result = [new_neighbors] + if weights: + affinity_scores = [affinity_scores[i] for i in sort_order] + if len(affinity_scores) < orig_count: + affinity_scores += [float('-inf')] * (orig_count - len(affinity_scores)) + result.append(affinity_scores) + if orig_weights: + orig_weights_value = [orig_weights_value[i] for i in sort_order] + if len(orig_weights_value) < orig_count: + orig_weights_value += [float('-inf')] * (orig_count - len(orig_weights_value)) + result.append(orig_weights_value) + if len(result) == 1: + return result[0] + return tuple(result) diff --git a/pyterrier_adaptive/corpus_graph.py b/pyterrier_adaptive/corpus_graph.py index 3a9dab3..29f0117 100644 --- a/pyterrier_adaptive/corpus_graph.py +++ b/pyterrier_adaptive/corpus_graph.py @@ -1,17 +1,14 @@ 
import pickle import tempfile from lz4.frame import LZ4FrameFile -import shutil import json import numpy as np import pandas as pd from pathlib import Path -import torch import more_itertools from typing import Union, Tuple, List import ir_datasets from npids import Lookup -import pyterrier as pt import pyterrier_alpha as pta try: diff --git a/pyterrier_adaptive/gar.py b/pyterrier_adaptive/gar.py index 47aca14..9159b71 100644 --- a/pyterrier_adaptive/gar.py +++ b/pyterrier_adaptive/gar.py @@ -4,38 +4,42 @@ import pyterrier as pt import pandas as pd import ir_datasets +import pyterrier_adaptive logger = ir_datasets.log.easy() class GAR(pt.Transformer): - """ - A transformer that implements the Graph-based Adaptive Re-ranker algorithm from - MacAvaney et al. "Adaptive Re-Ranking with a Corpus Graph" CIKM 2022. + """A :class:`~pyterrier.Transformer` that implements Graph-based Adaptive Re-ranking (GAR). + + Required input columns: ``['qid', 'query', 'docno', 'score', 'rank']`` + + Output columns: ``['qid', 'query', 'docno', 'score', 'rank', 'iteration']`` + + .. note:: - Required input columns: ['qid', 'query', 'docno', 'score', 'rank'] - Output columns: ['qid', 'query', 'docno', 'score', 'rank', 'iteration'] - where iteration defines the batch number which identified the document. Specifically - even=initial retrieval odd=corpus graph -1=backfilled + The iteration column defines the batch number that first identified the document in the + results. Due to the alternating nature of the algorithm, ``even=initial retrieval``, ``odd=corpus graph``, + and ``-1=backfilled``. + .. 
cite.dblp:: conf/cikm/MacAvaneyTM22 """ def __init__(self, scorer: pt.Transformer, - corpus_graph: 'CorpusGraph', + corpus_graph: 'pyterrier_adaptive.CorpusGraph', num_results: int = 1000, batch_size: Optional[int] = None, backfill: bool = True, enabled: bool = True, verbose: bool = False): """ - GAR init method - Args: - scorer(pyterrier.Transformer): A transformer that scores query-document pairs. It will only be provided with ['qid, 'query', 'docno', 'score']. - corpus_graph(pyterrier_adaptive.CorpusGraph): A graph of the corpus, enabling quick lookups of nearest neighbours - num_results(int): The maximum number of documents to score (called "budget" and $c$ in the paper) - batch_size(int): The number of documents to score at once (called $b$ in the paper). If not provided, will attempt to use the batch size from the scorer - backfill(bool): If True, always include all documents from the initial stage, even if they were not re-scored - enabled(bool): If False, perform re-ranking without using the corpus graph - verbose(bool): If True, print progress information + Args: + scorer(:class:`~pyterrier.Transformer`): A transformer that scores query-document pairs. It will only be provided with ['qid', 'query', 'docno', 'score']. + corpus_graph(:class:`~pyterrier_adaptive.CorpusGraph`): A graph of the corpus, enabling quick lookups of nearest neighbours + num_results(int): The maximum number of documents to score (called "budget" and $c$ in the paper) + batch_size(int): The number of documents to score at once (called $b$ in the paper). 
If not provided, will attempt to use the batch size from the scorer + backfill(bool): If True, always include all documents from the initial stage, even if they were not re-scored + enabled(bool): If False, perform re-ranking without using the corpus graph + verbose(bool): If True, print progress information """ self.scorer = scorer self.corpus_graph = corpus_graph diff --git a/pyterrier_adaptive/pt_docs/index.rst b/pyterrier_adaptive/pt_docs/index.rst new file mode 100644 index 0000000..5f4e6f9 --- /dev/null +++ b/pyterrier_adaptive/pt_docs/index.rst @@ -0,0 +1,35 @@ +Adaptive Retrieval +========================================================= + +`pyterrier-adaptive <https://github.com/terrierteam/pyterrier_adaptive>`__ provides PyTerrier +functionality to support Adaptive Retrieval. + +Adaptive Retrieval is a family of techniques that help overcome the recall limitation of +re-ranking approaches by identifying relevant documents that were missed by earlier stages. + +API Documentation +--------------------------------------------------------- + +.. autoclass:: pyterrier_adaptive.GAR + :members: + +.. autoclass:: pyterrier_adaptive.CorpusGraph + :members: + +.. autoclass:: pyterrier_adaptive.NpTopKCorpusGraph + :members: + +.. autoclass:: pyterrier_adaptive.Laff + :members: + +Bibliography +--------------------------------------------------------- + +For more information on adaptive retrieval, see: + +.. cite.dblp:: conf/cikm/MacAvaneyTM22 +.. cite.dblp:: conf/cikm/MacAvaneyTM22a +.. cite.dblp:: conf/sigir/KulkarniMGF23 +.. cite.dblp:: conf/ecir/FraylingMMO24 +.. cite.dblp:: conf/sigir/MacAvaneyT24 +.. 
cite.dblp:: journals/corr/abs-2410-20286 diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..7c2beb4 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,7 @@ +ruff +pytest +pytest-subtests +pytest-cov +pytest-json-report +transformers +pyterrier-pisa diff --git a/setup.py b/setup.py deleted file mode 100644 index 49781f2..0000000 --- a/setup.py +++ /dev/null @@ -1,44 +0,0 @@ -from glob import glob -import setuptools - -def get_version(path): - for line in open(path): - if line.startswith('__version__'): - delim = '"' if '"' in line else "'" - return line.split(delim)[1] - raise RuntimeError(f"Unable to find __version__ in {path}") - - -def get_requirements(path): - res = [] - for line in open(path): - line = line.split('#')[0].strip() - if line: - res.append(line) - return res - -setuptools.setup( - name="pyterrier-adaptive", - version=get_version('pyterrier_adaptive/__init__.py'), - author="Sean MacAvaney", - author_email="sean.macavaney@glasgow.ac.uk", - description="PyTerrier implementation of Adaptive Re-Ranking using a Corpus Graph (CIKM 2022)", - long_description=open('README.md').read(), - long_description_content_type="text/markdown", - url="https://github.com/terrierteam/pyterrier_adaptive", - packages=setuptools.find_packages(include=['pyterrier_adaptive']), - install_requires=list(open('requirements.txt')), - classifiers=[], - python_requires='>=3.8', - entry_points={ - 'pyterrier.artifact': [ - 'corpus_graph.np_topk = pyterrier_adaptive.corpus_graph:NpTopKCorpusGraph', - ], - }, - optional_dependencies={ - 'laff': ['transformers'], - }, - package_data={ - '': ['requirements.txt'], - }, -) diff --git a/tests/test_laff.py b/tests/test_laff.py index f269f2a..d6ab678 100644 --- a/tests/test_laff.py +++ b/tests/test_laff.py @@ -6,9 +6,6 @@ class TestLaff(unittest.TestCase): def test_laff(self): laff = Laff() - scores = laff.compute_affinity("hello", "world") - self.assertTrue(isinstance(scores, float)) - scores = 
laff.compute_affinity(["hello", "world"], ["world", "hello"]) self.assertEqual(len(scores), 2) self.assertTrue(isinstance(scores[0], float))