Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vector PRF #25

Merged
merged 12 commits into from
Nov 26, 2024
3 changes: 2 additions & 1 deletion pyterrier_dr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,9 @@
from pyterrier_dr.electra import ElectraScorer
from pyterrier_dr.bge_m3 import BGEM3, BGEM3QueryEncoder, BGEM3DocEncoder
from pyterrier_dr.cde import CDE, CDECache
from pyterrier_dr.prf import AveragePrf, VectorPrf

__all__ = ["FlexIndex", "DocnoFile", "NilIndex", "NumpyIndex", "RankedLists", "FaissFlat", "FaissHnsw", "MemIndex", "TorchIndex",
"BiEncoder", "BiQueryEncoder", "BiDocEncoder", "BiScorer", "HgfBiEncoder", "TasB", "RetroMAE", "SBertBiEncoder", "Ance",
"Query2Query", "GTR", "TctColBert", "ElectraScorer", "BGEM3", "BGEM3QueryEncoder", "BGEM3DocEncoder", "CDE", "CDECache",
"SimFn", "infer_device"]
"SimFn", "infer_device", "AveragePrf", "VectorPrf"]
2 changes: 1 addition & 1 deletion pyterrier_dr/flex/np_retr.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
ranked_lists = RankedLists(self.num_results, num_q)
batch_it = range(0, dvecs.shape[0], self.batch_size)
if self.flex_index.verbose:
batch_it = pt.tqdm(batch_it)
batch_it = pt.tqdm(batch_it, desc='NumpyRetriever scoring', unit='docbatch')
for idx_start in batch_it:
doc_batch = dvecs[idx_start:idx_start+self.batch_size].T
if self.flex_index.sim_fn == SimFn.cos:
Expand Down
136 changes: 136 additions & 0 deletions pyterrier_dr/prf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import numpy as np
import pandas as pd
import pyterrier as pt
import pyterrier_alpha as pta


class VectorPrf(pt.Transformer):
"""
Performs a Rocchio-esque PRF by linearly combining the query_vec column with
the doc_vec column of the top k documents.

Arguments:
- alpha: weight of original query_vec
- beta: weight of doc_vec
- k: number of pseudo-relevant feedback documents

Expected Input Columns: ``['qid', 'query_vec', 'docno', 'doc_vec']``

Output Columns: ``['qid', 'query_vec']`` (Any other query columns from the input are also pulled included in the output.)

Example::

prf_pipe = model >> index >> index.vec_loader() >> pyterier_dr.vector_prf() >> index

.. code-block:: bibtex
:caption: Citation

@article{DBLP:journals/tois/0009MZKZ23,
author = {Hang Li and
Ahmed Mourad and
Shengyao Zhuang and
Bevan Koopman and
Guido Zuccon},
title = {Pseudo Relevance Feedback with Deep Language Models and Dense Retrievers:
Successes and Pitfalls},
journal = {{ACM} Trans. Inf. Syst.},
volume = {41},
number = {3},
pages = {62:1--62:40},
year = {2023},
url = {https://doi.org/10.1145/3570724},
doi = {10.1145/3570724},
timestamp = {Fri, 21 Jul 2023 22:26:51 +0200},
biburl = {https://dblp.org/rec/journals/tois/0009MZKZ23.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
def __init__(self,
*,
alpha: float = 1,
beta: float = 0.2,
k: int = 3
):
self.alpha = alpha
self.beta = beta
self.k = k

@pta.transform.by_query(add_ranks=False)
def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
"""Performs Vector PRF on the input dataframe."""
pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])

query_cols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']

# get the docvectors for the top k docs
doc_vecs = np.stack([ row.doc_vec for row in inp.head(self.k).itertuples() ])
# combine their average and add to the query
query_vec = self.alpha * inp['query_vec'].iloc[0] + self.beta * np.mean(doc_vecs, axis=0)
# generate new query dataframe with the existing query columns and the new query_vec
return pd.DataFrame([[inp[c].iloc[0] for c in query_cols] + [query_vec]], columns=query_cols + ['query_vec'])

def __repr__(self):
return f"VectorPrf(alpha={self.alpha}, beta={self.beta}, k={self.k})"


class AveragePrf(pt.Transformer):
"""
Performs Average PRF (as described by Li et al.) by averaging the query_vec column with
the doc_vec column of the top k documents.

Arguments:
- k: number of pseudo-relevant feedback documents

Expected Input Columns: ``['qid', 'query_vec', 'docno', 'doc_vec']``

Output Columns: ``['qid', 'query_vec']`` (Any other query columns from the input are also pulled included in the output.)

Example::

prf_pipe = model >> index >> index.vec_loader() >> pyterier_dr.average_prf() >> index

.. code-block:: bibtex
:caption: Citation

@article{DBLP:journals/tois/0009MZKZ23,
author = {Hang Li and
Ahmed Mourad and
Shengyao Zhuang and
Bevan Koopman and
Guido Zuccon},
title = {Pseudo Relevance Feedback with Deep Language Models and Dense Retrievers:
Successes and Pitfalls},
journal = {{ACM} Trans. Inf. Syst.},
volume = {41},
number = {3},
pages = {62:1--62:40},
year = {2023},
url = {https://doi.org/10.1145/3570724},
doi = {10.1145/3570724},
timestamp = {Fri, 21 Jul 2023 22:26:51 +0200},
biburl = {https://dblp.org/rec/journals/tois/0009MZKZ23.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
"""
def __init__(self,
*,
k: int = 3
):
self.k = k

@pta.transform.by_query(add_ranks=False)
def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
"""Performs Average PRF on the input dataframe."""
pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])

query_cols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']

# get the docvectors for the top k docs and the query_vec
all_vecs = np.stack([inp['query_vec'].iloc[0]] + [row.doc_vec for row in inp.head(self.k).itertuples()])
# combine their average and add to the query
query_vec = np.mean(all_vecs, axis=0)
# generate new query dataframe with the existing query columns and the new query_vec
return pd.DataFrame([[inp[c].iloc[0] for c in query_cols] + [query_vec]], columns=query_cols + ['query_vec'])

def __repr__(self):
return f"AveragePrf(k={self.k})"
8 changes: 8 additions & 0 deletions pyterrier_dr/pt_docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,11 @@ Import ``pyterrier_dr``, load a pre-built index and model, and retrieve:
997 82.551933 10064 10063 997 1 chemical reactions
998 82.546890 4417 4416 998 1 chemical reactions
999 82.545776 7120 7119 999 1 chemical reactions


.. rubric:: Table of Contents

.. toctree::
:maxdepth: 1

prf
16 changes: 16 additions & 0 deletions pyterrier_dr/pt_docs/prf.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Pseudo Relevance Feedback (PRF)
===============================

Dense Pseudo Relevance Feedback (PRF) is a technique to improve the performance of a retrieval system by expanding the
original query vector with the vectors from the top-ranked documents. The idea is that the top-ranked documents.

PyTerrier-DR provides two dense PRF implementations: :class:`pyterrier_dr.AveragePrf` and :class:`pyterrier_dr.VectorPrf`.

API Documentation
-----------------

.. autoclass:: pyterrier_dr.AveragePrf
:members:

.. autoclass:: pyterrier_dr.VectorPrf
:members:
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
transformers
python-terrier>=0.11.0
pyterrier-alpha>=0.9.3
pyterrier-alpha>=0.10.0
torch
numpy>=1.21.0, <2.0.0
npids
Expand Down
138 changes: 138 additions & 0 deletions tests/test_prf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import unittest
import numpy as np
import pandas as pd
from pyterrier_dr import AveragePrf, VectorPrf


class TestModels(unittest.TestCase):

def test_average_prf(self):
prf = AveragePrf()
with self.subTest('single row'):
inp = pd.DataFrame([['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])]], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
self.assertEqual(out['qid'].iloc[0], 'q1')
self.assertEqual(out['query'].iloc[0], 'query')
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))

with self.subTest('multiple rows'):
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([3.5, 4.5, 3.]))

with self.subTest('multiple rows -- k=3'):
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([3.5, 4.5, 3.]))

with self.subTest('multiple rows -- k=1'):
prf.k = 1
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))

with self.subTest('multiple queries'):
prf.k = 3
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([8, 7, 1])],
['q2', 'query2', np.array([4, 6, 1]), 'd1', np.array([9, 4, 2])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 2)
self.assertEqual(out['qid'].iloc[0], 'q1')
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([3.5, 4.5, 3.]))
self.assertEqual(out['qid'].iloc[1], 'q2')
np.testing.assert_array_equal(out['query_vec'].iloc[1], np.array([6.5, 5., 1.5]))

def test_vector_prf(self):
prf = VectorPrf(alpha=0.5, beta=0.5)
with self.subTest('single row'):
inp = pd.DataFrame([['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])]], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
self.assertEqual(out['qid'].iloc[0], 'q1')
self.assertEqual(out['query'].iloc[0], 'query')
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))

with self.subTest('multiple rows'):
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
np.testing.assert_almost_equal(out['query_vec'].iloc[0], np.array([2.666667, 3.666667, 3.]), decimal=5)

with self.subTest('multiple rows -- k=3'):
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
np.testing.assert_almost_equal(out['query_vec'].iloc[0], np.array([2.666667, 3.666667, 3.]), decimal=5)

with self.subTest('multiple rows -- k=1'):
prf.k = 1
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 1)
np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))

with self.subTest('multiple queries'):
prf.k = 3
inp = pd.DataFrame([
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([1, 4, 2])],
['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([8, 7, 1])],
['q2', 'query2', np.array([4, 6, 1]), 'd1', np.array([9, 4, 2])],
], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
out = prf(inp)
self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
self.assertEqual(len(out), 2)
self.assertEqual(out['qid'].iloc[0], 'q1')
np.testing.assert_almost_equal(out['query_vec'].iloc[0], np.array([2.666667, 3.666667, 3.]), decimal=5)
self.assertEqual(out['qid'].iloc[1], 'q2')
np.testing.assert_array_equal(out['query_vec'].iloc[1], np.array([6.5, 5., 1.5]))



if __name__ == '__main__':
unittest.main()