Vector PRF (#25)

* match torchscorer tqdm * initial commit of the vector based prf * cleanup * cleanup * reworked prf as transformers, added a simple test * using ``pta.transform.by_query`` * pull forward all query columns * bibtex block in documentation * documentation * whoops * more tests --------- Co-authored-by: Sean MacAvaney <[email protected]>
terrierteam · Nov 26, 2024 · 6fe2564 · 6fe2564
1 parent 6909587
commit 6fe2564
Show file tree

Hide file tree

Showing 7 changed files with 302 additions and 3 deletions.
diff --git a/pyterrier_dr/__init__.py b/pyterrier_dr/__init__.py
@@ -10,8 +10,9 @@
 from pyterrier_dr.electra import ElectraScorer
 from pyterrier_dr.bge_m3 import BGEM3, BGEM3QueryEncoder, BGEM3DocEncoder
 from pyterrier_dr.cde import CDE, CDECache
+from pyterrier_dr.prf import AveragePrf, VectorPrf
 
 __all__ = ["FlexIndex", "DocnoFile", "NilIndex", "NumpyIndex", "RankedLists", "FaissFlat", "FaissHnsw", "MemIndex", "TorchIndex",
            "BiEncoder", "BiQueryEncoder", "BiDocEncoder", "BiScorer", "HgfBiEncoder", "TasB", "RetroMAE", "SBertBiEncoder", "Ance",
            "Query2Query", "GTR", "TctColBert", "ElectraScorer", "BGEM3", "BGEM3QueryEncoder", "BGEM3DocEncoder", "CDE", "CDECache",
-           "SimFn", "infer_device"]
+           "SimFn", "infer_device", "AveragePrf", "VectorPrf"]
diff --git a/pyterrier_dr/flex/np_retr.py b/pyterrier_dr/flex/np_retr.py
@@ -29,7 +29,7 @@ def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
         ranked_lists = RankedLists(self.num_results, num_q)
         batch_it = range(0, dvecs.shape[0], self.batch_size)
         if self.flex_index.verbose:
-            batch_it = pt.tqdm(batch_it)
+            batch_it = pt.tqdm(batch_it, desc='NumpyRetriever scoring', unit='docbatch')
         for idx_start in batch_it:
             doc_batch = dvecs[idx_start:idx_start+self.batch_size].T
             if self.flex_index.sim_fn == SimFn.cos:

diff --git a/pyterrier_dr/prf.py b/pyterrier_dr/prf.py
@@ -0,0 +1,136 @@
+import numpy as np
+import pandas as pd
+import pyterrier as pt
+import pyterrier_alpha as pta
+
+
+class VectorPrf(pt.Transformer):
+    """
+    Performs a Rocchio-esque PRF by linearly combining the query_vec column with 
+    the doc_vec column of the top k documents.
+
+    Arguments:
+     - alpha: weight of original query_vec
+     - beta: weight of doc_vec
+     - k: number of pseudo-relevant feedback documents
+
+    Expected Input Columns: ``['qid', 'query_vec', 'docno', 'doc_vec']``
+
+    Output Columns: ``['qid', 'query_vec']`` (Any other query columns from the input are also pulled included in the output.)
+
+    Example::
+    
+            prf_pipe = model >> index >> index.vec_loader() >> pyterier_dr.vector_prf() >> index 
+
+    .. code-block:: bibtex
+        :caption: Citation
+
+        @article{DBLP:journals/tois/0009MZKZ23,
+          author       = {Hang Li and
+                          Ahmed Mourad and
+                          Shengyao Zhuang and
+                          Bevan Koopman and
+                          Guido Zuccon},
+          title        = {Pseudo Relevance Feedback with Deep Language Models and Dense Retrievers:
+                          Successes and Pitfalls},
+          journal      = {{ACM} Trans. Inf. Syst.},
+          volume       = {41},
+          number       = {3},
+          pages        = {62:1--62:40},
+          year         = {2023},
+          url          = {https://doi.org/10.1145/3570724},
+          doi          = {10.1145/3570724},
+          timestamp    = {Fri, 21 Jul 2023 22:26:51 +0200},
+          biburl       = {https://dblp.org/rec/journals/tois/0009MZKZ23.bib},
+          bibsource    = {dblp computer science bibliography, https://dblp.org}
+        }
+    """
+    def __init__(self,
+        *,
+        alpha: float = 1,
+        beta: float = 0.2,
+        k: int = 3
+    ):
+        self.alpha = alpha
+        self.beta = beta
+        self.k = k
+
+    @pta.transform.by_query(add_ranks=False)
+    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
+        """Performs Vector PRF on the input dataframe."""
+        pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])
+
+        query_cols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']
+
+        # get the docvectors for the top k docs
+        doc_vecs = np.stack([ row.doc_vec for row in inp.head(self.k).itertuples() ])
+        # combine their average and add to the query
+        query_vec = self.alpha * inp['query_vec'].iloc[0] + self.beta * np.mean(doc_vecs, axis=0)
+        # generate new query dataframe with the existing query columns and the new query_vec
+        return pd.DataFrame([[inp[c].iloc[0] for c in query_cols] + [query_vec]], columns=query_cols + ['query_vec'])
+
+    def __repr__(self):
+        return f"VectorPrf(alpha={self.alpha}, beta={self.beta}, k={self.k})"
+
+
+class AveragePrf(pt.Transformer):
+    """
+    Performs Average PRF (as described by Li et al.) by averaging the query_vec column with 
+    the doc_vec column of the top k documents.
+
+    Arguments:
+     - k: number of pseudo-relevant feedback documents
+
+    Expected Input Columns: ``['qid', 'query_vec', 'docno', 'doc_vec']``
+
+    Output Columns: ``['qid', 'query_vec']`` (Any other query columns from the input are also pulled included in the output.)
+
+    Example::
+    
+            prf_pipe = model >> index >> index.vec_loader() >> pyterier_dr.average_prf() >> index 
+
+    .. code-block:: bibtex
+        :caption: Citation
+
+        @article{DBLP:journals/tois/0009MZKZ23,
+          author       = {Hang Li and
+                          Ahmed Mourad and
+                          Shengyao Zhuang and
+                          Bevan Koopman and
+                          Guido Zuccon},
+          title        = {Pseudo Relevance Feedback with Deep Language Models and Dense Retrievers:
+                          Successes and Pitfalls},
+          journal      = {{ACM} Trans. Inf. Syst.},
+          volume       = {41},
+          number       = {3},
+          pages        = {62:1--62:40},
+          year         = {2023},
+          url          = {https://doi.org/10.1145/3570724},
+          doi          = {10.1145/3570724},
+          timestamp    = {Fri, 21 Jul 2023 22:26:51 +0200},
+          biburl       = {https://dblp.org/rec/journals/tois/0009MZKZ23.bib},
+          bibsource    = {dblp computer science bibliography, https://dblp.org}
+        }
+    """
+    def __init__(self,
+        *,
+        k: int = 3
+    ):
+        self.k = k
+
+    @pta.transform.by_query(add_ranks=False)
+    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
+        """Performs Average PRF on the input dataframe."""
+        pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])
+
+        query_cols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']
+
+        # get the docvectors for the top k docs and the query_vec
+        all_vecs = np.stack([inp['query_vec'].iloc[0]] + [row.doc_vec for row in inp.head(self.k).itertuples()])
+        # combine their average and add to the query
+        query_vec = np.mean(all_vecs, axis=0)
+        # generate new query dataframe with the existing query columns and the new query_vec
+        return pd.DataFrame([[inp[c].iloc[0] for c in query_cols] + [query_vec]], columns=query_cols + ['query_vec'])
+
+    def __repr__(self):
+        return f"AveragePrf(k={self.k})"
diff --git a/pyterrier_dr/pt_docs/index.rst b/pyterrier_dr/pt_docs/index.rst
@@ -34,3 +34,11 @@ Import ``pyterrier_dr``, load a pre-built index and model, and retrieve:
     997  82.551933  10064  10063   997   1  chemical reactions
     998  82.546890   4417   4416   998   1  chemical reactions
     999  82.545776   7120   7119   999   1  chemical reactions
+
+
+.. rubric:: Table of Contents
+
+.. toctree::
+    :maxdepth: 1
+
+    prf
diff --git a/pyterrier_dr/pt_docs/prf.rst b/pyterrier_dr/pt_docs/prf.rst
@@ -0,0 +1,16 @@
+Pseudo Relevance Feedback (PRF)
+===============================
+
+Dense Pseudo Relevance Feedback (PRF) is a technique to improve the performance of a retrieval system by expanding the
+original query vector with the vectors from the top-ranked documents. The idea is that the top-ranked documents.
+
+PyTerrier-DR provides two dense PRF implementations: :class:`pyterrier_dr.AveragePrf` and :class:`pyterrier_dr.VectorPrf`.
+
+API Documentation
+-----------------
+
+.. autoclass:: pyterrier_dr.AveragePrf
+    :members:
+
+.. autoclass:: pyterrier_dr.VectorPrf
+    :members:
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 transformers
 python-terrier>=0.11.0
-pyterrier-alpha>=0.9.3
+pyterrier-alpha>=0.10.0
 torch
 numpy>=1.21.0, <2.0.0
 npids

diff --git a/tests/test_prf.py b/tests/test_prf.py
@@ -0,0 +1,138 @@
+import unittest
+import numpy as np
+import pandas as pd
+from pyterrier_dr import AveragePrf, VectorPrf
+
+
+class TestModels(unittest.TestCase):
+
+    def test_average_prf(self):
+        prf = AveragePrf()
+        with self.subTest('single row'):
+            inp = pd.DataFrame([['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])]], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            self.assertEqual(out['qid'].iloc[0], 'q1')
+            self.assertEqual(out['query'].iloc[0], 'query')
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))
+
+        with self.subTest('multiple rows'):
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([3.5, 4.5, 3.]))
+
+        with self.subTest('multiple rows -- k=3'):
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([3.5, 4.5, 3.]))
+
+        with self.subTest('multiple rows -- k=1'):
+            prf.k = 1
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))
+
+        with self.subTest('multiple queries'):
+            prf.k = 3
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([8, 7, 1])],
+                ['q2', 'query2', np.array([4, 6, 1]), 'd1', np.array([9, 4, 2])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 2)
+            self.assertEqual(out['qid'].iloc[0], 'q1')
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([3.5, 4.5, 3.]))
+            self.assertEqual(out['qid'].iloc[1], 'q2')
+            np.testing.assert_array_equal(out['query_vec'].iloc[1], np.array([6.5, 5., 1.5]))
+
+    def test_vector_prf(self):
+        prf = VectorPrf(alpha=0.5, beta=0.5)
+        with self.subTest('single row'):
+            inp = pd.DataFrame([['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])]], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            self.assertEqual(out['qid'].iloc[0], 'q1')
+            self.assertEqual(out['query'].iloc[0], 'query')
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))
+
+        with self.subTest('multiple rows'):
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            np.testing.assert_almost_equal(out['query_vec'].iloc[0], np.array([2.666667, 3.666667, 3.]), decimal=5)
+
+        with self.subTest('multiple rows -- k=3'):
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            np.testing.assert_almost_equal(out['query_vec'].iloc[0], np.array([2.666667, 3.666667, 3.]), decimal=5)
+
+        with self.subTest('multiple rows -- k=1'):
+            prf.k = 1
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd2', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd3', np.array([8, 7, 1])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd4', np.array([100, 100, 100])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 1)
+            np.testing.assert_array_equal(out['query_vec'].iloc[0], np.array([2.5, 3.5, 4.5]))
+
+        with self.subTest('multiple queries'):
+            prf.k = 3
+            inp = pd.DataFrame([
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([4, 5, 6])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([1, 4, 2])],
+                ['q1', 'query', np.array([1, 2, 3]), 'd1', np.array([8, 7, 1])],
+                ['q2', 'query2', np.array([4, 6, 1]), 'd1', np.array([9, 4, 2])],
+            ], columns=['qid', 'query', 'query_vec', 'docno', 'doc_vec'])
+            out = prf(inp)
+            self.assertEqual(out.columns.tolist(), ['qid', 'query', 'query_vec'])
+            self.assertEqual(len(out), 2)
+            self.assertEqual(out['qid'].iloc[0], 'q1')
+            np.testing.assert_almost_equal(out['query_vec'].iloc[0], np.array([2.666667, 3.666667, 3.]), decimal=5)
+            self.assertEqual(out['qid'].iloc[1], 'q2')
+            np.testing.assert_array_equal(out['query_vec'].iloc[1], np.array([6.5, 5., 1.5]))
+
+
+
+if __name__ == '__main__':
+    unittest.main()