diff --git a/pyterrier_dr/__init__.py b/pyterrier_dr/__init__.py
index 1a02d5a..320146f 100644
--- a/pyterrier_dr/__init__.py
+++ b/pyterrier_dr/__init__.py
@@ -10,8 +10,9 @@ from pyterrier_dr.electra import ElectraScorer
 from pyterrier_dr.bge_m3 import BGEM3, BGEM3QueryEncoder, BGEM3DocEncoder
 from pyterrier_dr.cde import CDE, CDECache
+from pyterrier_dr.prf import AveragePrf, VectorPrf
 
 __all__ = ["FlexIndex", "DocnoFile", "NilIndex", "NumpyIndex", "RankedLists", "FaissFlat", "FaissHnsw", "MemIndex", "TorchIndex",
            "BiEncoder", "BiQueryEncoder", "BiDocEncoder", "BiScorer", "HgfBiEncoder", "TasB", "RetroMAE", "SBertBiEncoder", "Ance",
            "Query2Query", "GTR", "TctColBert", "ElectraScorer",
            "BGEM3", "BGEM3QueryEncoder", "BGEM3DocEncoder", "CDE", "CDECache",
-           "SimFn", "infer_device"]
+           "SimFn", "infer_device", "AveragePrf", "VectorPrf"]
diff --git a/pyterrier_dr/flex/np_retr.py b/pyterrier_dr/flex/np_retr.py
index 60fa8e4..c0e6c2f 100644
--- a/pyterrier_dr/flex/np_retr.py
+++ b/pyterrier_dr/flex/np_retr.py
@@ -29,7 +29,7 @@ def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
         ranked_lists = RankedLists(self.num_results, num_q)
         batch_it = range(0, dvecs.shape[0], self.batch_size)
         if self.flex_index.verbose:
-            batch_it = pt.tqdm(batch_it)
+            batch_it = pt.tqdm(batch_it, desc='NumpyRetriever scoring', unit='docbatch')
         for idx_start in batch_it:
             doc_batch = dvecs[idx_start:idx_start+self.batch_size].T
             if self.flex_index.sim_fn == SimFn.cos:
diff --git a/pyterrier_dr/prf.py b/pyterrier_dr/prf.py
new file mode 100644
index 0000000..95aa34e
--- /dev/null
+++ b/pyterrier_dr/prf.py
@@ -0,0 +1,136 @@
+import numpy as np
+import pandas as pd
+import pyterrier as pt
+import pyterrier_alpha as pta
+
+
+class VectorPrf(pt.Transformer):
+    """
+    Performs a Rocchio-esque PRF by linearly combining the query_vec column with
+    the doc_vec column of the top k documents.
+
+    Arguments:
+     - alpha: weight of original query_vec
+     - beta: weight of doc_vec
+     - k: number of pseudo-relevant feedback documents
+
+    Expected Input Columns: ``['qid', 'query_vec', 'docno', 'doc_vec']``
+
+    Output Columns: ``['qid', 'query_vec']`` (Any other query columns from the input are also included in the output.)
+
+    Example::
+
+        prf_pipe = model >> index >> index.vec_loader() >> pyterrier_dr.VectorPrf() >> index
+
+    .. code-block:: bibtex
+        :caption: Citation
+
+        @article{DBLP:journals/tois/0009MZKZ23,
+          author       = {Hang Li and
+                          Ahmed Mourad and
+                          Shengyao Zhuang and
+                          Bevan Koopman and
+                          Guido Zuccon},
+          title        = {Pseudo Relevance Feedback with Deep Language Models and Dense Retrievers:
+                          Successes and Pitfalls},
+          journal      = {{ACM} Trans. Inf. Syst.},
+          volume       = {41},
+          number       = {3},
+          pages        = {62:1--62:40},
+          year         = {2023},
+          url          = {https://doi.org/10.1145/3570724},
+          doi          = {10.1145/3570724},
+          timestamp    = {Fri, 21 Jul 2023 22:26:51 +0200},
+          biburl       = {https://dblp.org/rec/journals/tois/0009MZKZ23.bib},
+          bibsource    = {dblp computer science bibliography, https://dblp.org}
+        }
+    """
+    def __init__(self,
+                 *,
+                 alpha: float = 1,
+                 beta: float = 0.2,
+                 k: int = 3
+                ):
+        self.alpha = alpha
+        self.beta = beta
+        self.k = k
+
+    @pta.transform.by_query(add_ranks=False)
+    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
+        """Performs Vector PRF on the input dataframe."""
+        pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])
+
+        query_cols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']
+
+        # get the doc vectors for the top k docs
+        doc_vecs = np.stack([row.doc_vec for row in inp.head(self.k).itertuples()])
+        # linearly combine the query vector with the mean of the doc vectors
+        query_vec = self.alpha * inp['query_vec'].iloc[0] + self.beta * np.mean(doc_vecs, axis=0)
+        # generate a new query dataframe with the existing query columns and the new query_vec
+        return pd.DataFrame([[inp[c].iloc[0] for c in query_cols] + [query_vec]], columns=query_cols + ['query_vec'])
+
+    def __repr__(self):
+        return f"VectorPrf(alpha={self.alpha}, beta={self.beta}, k={self.k})"
+
+
+class AveragePrf(pt.Transformer):
+    """
+    Performs Average PRF (as described by Li et al.) by averaging the query_vec column with
+    the doc_vec column of the top k documents.
+
+    Arguments:
+     - k: number of pseudo-relevant feedback documents
+
+    Expected Input Columns: ``['qid', 'query_vec', 'docno', 'doc_vec']``
+
+    Output Columns: ``['qid', 'query_vec']`` (Any other query columns from the input are also included in the output.)
+
+    Example::
+
+        prf_pipe = model >> index >> index.vec_loader() >> pyterrier_dr.AveragePrf() >> index
+
+    .. code-block:: bibtex
+        :caption: Citation
+
+        @article{DBLP:journals/tois/0009MZKZ23,
+          author       = {Hang Li and
+                          Ahmed Mourad and
+                          Shengyao Zhuang and
+                          Bevan Koopman and
+                          Guido Zuccon},
+          title        = {Pseudo Relevance Feedback with Deep Language Models and Dense Retrievers:
+                          Successes and Pitfalls},
+          journal      = {{ACM} Trans. Inf. Syst.},
+          volume       = {41},
+          number       = {3},
+          pages        = {62:1--62:40},
+          year         = {2023},
+          url          = {https://doi.org/10.1145/3570724},
+          doi          = {10.1145/3570724},
+          timestamp    = {Fri, 21 Jul 2023 22:26:51 +0200},
+          biburl       = {https://dblp.org/rec/journals/tois/0009MZKZ23.bib},
+          bibsource    = {dblp computer science bibliography, https://dblp.org}
+        }
+    """
+    def __init__(self,
+                 *,
+                 k: int = 3
+                ):
+        self.k = k
+
+    @pta.transform.by_query(add_ranks=False)
+    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
+        """Performs Average PRF on the input dataframe."""
+        pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])
+
+        query_cols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']
+
+        # get the doc vectors for the top k docs and the query_vec
+        all_vecs = np.stack([inp['query_vec'].iloc[0]] + [row.doc_vec for row in inp.head(self.k).itertuples()])
+        # average the query vector together with the doc vectors
+        query_vec = np.mean(all_vecs, axis=0)
+        # generate a new query dataframe with the existing query columns and the new query_vec
+        return pd.DataFrame([[inp[c].iloc[0] for c in query_cols] + [query_vec]], columns=query_cols + ['query_vec'])
+
+    def __repr__(self):
+        return f"AveragePrf(k={self.k})"
diff --git a/pyterrier_dr/pt_docs/index.rst b/pyterrier_dr/pt_docs/index.rst
index 427a6d5..e6dfc55 100644
--- a/pyterrier_dr/pt_docs/index.rst
+++ b/pyterrier_dr/pt_docs/index.rst
@@ -34,3 +34,11 @@ Import ``pyterrier_dr``, load a pre-built index and model, and retrieve:
     997  82.551933  10064  10063   997     1  chemical reactions
     998  82.546890   4417   4416   998     1  chemical reactions
     999  82.545776   7120   7119   999     1  chemical reactions
+
+
+.. rubric:: Table of Contents
+
+.. toctree::
+   :maxdepth: 1
+
+   prf
diff --git a/pyterrier_dr/pt_docs/prf.rst b/pyterrier_dr/pt_docs/prf.rst
new file mode 100644
index 0000000..78d9061
--- /dev/null
+++ b/pyterrier_dr/pt_docs/prf.rst
@@ -0,0 +1,16 @@
+Pseudo Relevance Feedback (PRF)
+===============================
+
+Dense Pseudo Relevance Feedback (PRF) is a technique to improve the performance of a retrieval system by expanding the
+original query vector with the vectors from the top-ranked documents. The idea is that the top-ranked documents are likely to be relevant, so moving the query vector towards them can retrieve further relevant documents.
+
+PyTerrier-DR provides two dense PRF implementations: :class:`pyterrier_dr.AveragePrf` and :class:`pyterrier_dr.VectorPrf`.
+
+API Documentation
+-----------------
+
+.. autoclass:: pyterrier_dr.AveragePrf
+   :members:
+
+.. autoclass:: pyterrier_dr.VectorPrf
+   :members:
diff --git a/requirements.txt b/requirements.txt
index 3184823..ba2e698 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 transformers
 python-terrier>=0.11.0
-pyterrier-alpha>=0.9.3
+pyterrier-alpha>=0.10.0
 torch
 numpy>=1.21.0, <2.0.0
 npids
diff --git a/tests/test_prf.py b/tests/test_prf.py
new file mode 100644
index 0000000..be312d3
--- /dev/null
+++ b/tests/test_prf.py
@@ -0,0 +1,138 @@
+import unittest
+import numpy as np
+import pandas as pd
+from pyterrier_dr import AveragePrf, VectorPrf
+
+
+class TestModels(unittest.TestCase):
+
+    def test_average_prf(self):
+        prf = AveragePrf()
+        with self.subTest('single row'):
+            inp = pd.DataFrame([['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])]], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            self.assertEqual(out['qid'].iloc[0], 'q1')
+            self.assertEqual(out['query'].iloc[0], 'query')
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))
+
+        with self.subTest('multiple rows'):
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([3.5, 4.5, 3.]))
+
+        with self.subTest('multiple rows -- k=3'):
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([3.5, 4.5, 3.]))
+
+        with self.subTest('multiple rows -- k=1'):
+            prf.k = 1
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))
+
+        with self.subTest('multiple queries'):
+            prf.k = 3
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([8, 7, 1])],
+                ['q2', 'query2', np.array([4, 6, 1]), 'd1', np.array([9, 4, 2])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 2)
+            self.assertEqual(out['qid'].iloc[0], 'q1')
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([3.5, 4.5, 3.]))
+            self.assertEqual(out['qid'].iloc[1], 'q2')
+            np.testing.assert_array_equal(out['query_vec'].iloc[1], np.array([6.5, 5., 1.5]))
+
+    def test_vector_prf(self):
+        prf = VectorPrf(alpha=0.5, beta=0.5)
+        with self.subTest('single row'):
+            inp = pd.DataFrame([['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])]], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            self.assertEqual(out['qid'].iloc[0], 'q1')
+            self.assertEqual(out['query'].iloc[0], 'query')
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))
+
+        with self.subTest('multiple rows'):
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            np.testing.assert_almost_equal(out['query_vec'].iloc[0], np.array([2.666667, 3.666667, 3.]), decimal=5)
+
+        with self.subTest('multiple rows -- k=3'):
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            np.testing.assert_almost_equal(out['query_vec'].iloc[0], np.array([2.666667, 3.666667, 3.]), decimal=5)
+
+        with self.subTest('multiple rows -- k=1'):
+            prf.k = 1
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))
+
+        with self.subTest('multiple queries'):
+            prf.k = 3
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([8, 7, 1])],
+                ['q2', 'query2', np.array([4, 6, 1]), 'd1', np.array([9, 4, 2])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 2)
+            self.assertEqual(out['qid'].iloc[0], 'q1')
+            np.testing.assert_almost_equal(out['query_vec'].iloc[0], np.array([2.666667, 3.666667, 3.]), decimal=5)
+            self.assertEqual(out['qid'].iloc[1], 'q2')
+            np.testing.assert_array_equal(out['query_vec'].iloc[1], np.array([6.5, 5., 1.5]))
+
+
+
+if __name__ == '__main__':
+    unittest.main()
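
Usage sketch (illustrative only, not part of the patch): the new PRF transformers sit between a first-pass dense retrieval and a second retrieval over the rewritten query vector, as in the docstring examples above. The encoder and index below (TasB over a FlexIndex at 'my_index.flex') are assumed placeholders; any pyterrier_dr bi-encoder and FlexIndex should compose the same way.

    import pyterrier_dr

    model = pyterrier_dr.TasB.dot()                  # assumed query/document encoder
    index = pyterrier_dr.FlexIndex('my_index.flex')  # assumed pre-built dense index

    # encode query -> retrieve -> load doc_vec for the top results -> rewrite query_vec -> retrieve again
    prf_pipe = model >> index >> index.vec_loader() >> pyterrier_dr.AveragePrf() >> index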