Skip to content

Commit

Permalink
add Maximal Marginal Relevance (#36)
Browse files Browse the repository at this point in the history
* mmr implementation

* refactor a bit

* scores as float

* documentation updates

* refactor

* add test for mmr

* ruff

* updated documentation
  • Loading branch information
seanmacavaney authored Dec 19, 2024
1 parent 0c7a334 commit 422c98d
Show file tree
Hide file tree
Showing 8 changed files with 153 additions and 8 deletions.
3 changes: 2 additions & 1 deletion pyterrier_dr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
from pyterrier_dr.cde import CDE, CDECache
from pyterrier_dr.prf import AveragePrf, VectorPrf
from pyterrier_dr._ils import ILS, ils
from pyterrier_dr._mmr import MmrScorer

__all__ = ["FlexIndex", "DocnoFile", "NilIndex", "NumpyIndex", "RankedLists", "FaissFlat", "FaissHnsw", "MemIndex", "TorchIndex",
"BiEncoder", "BiQueryEncoder", "BiDocEncoder", "BiScorer", "HgfBiEncoder", "TasB", "RetroMAE", "SBertBiEncoder", "Ance",
"Query2Query", "GTR", "E5", "TctColBert", "ElectraScorer", "BGEM3", "BGEM3QueryEncoder", "BGEM3DocEncoder", "CDE", "CDECache",
"SimFn", "infer_device", "AveragePrf", "VectorPrf", "ILS", "ils"]
"SimFn", "infer_device", "AveragePrf", "VectorPrf", "ILS", "ils", "MmrScorer"]
66 changes: 66 additions & 0 deletions pyterrier_dr/_mmr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import numpy as np
import pandas as pd
import pyterrier as pt
import pyterrier_alpha as pta


class MmrScorer(pt.Transformer):
    """An MMR (Maximal Marginal Relevance) scorer (i.e., re-ranker).

    The MMR scorer re-orders documents by balancing relevance (from the initial scores) and diversity (based on the
    similarity of the document vectors).

    .. cite.dblp:: conf/sigir/CarbonellG98
    """
    def __init__(self, *, Lambda: float = 0.5, norm_rel: bool = False, norm_sim: bool = False, drop_doc_vec: bool = True, verbose: bool = False):
        """
        Args:
            Lambda: The balance parameter between relevance and diversity (default: 0.5)
            norm_rel: Whether to normalize relevance scores to [0, 1] (default: False)
            norm_sim: Whether to normalize similarity scores to [0, 1] (default: False)
            drop_doc_vec: Whether to drop the 'doc_vec' column after re-ranking (default: True)
            verbose: Whether to display verbose output (e.g., progress bars) (default: False)
        """
        self.Lambda = Lambda
        self.norm_rel = norm_rel
        self.norm_sim = norm_sim
        self.drop_doc_vec = drop_doc_vec
        self.verbose = verbose

    def transform(self, inp: pd.DataFrame) -> pd.DataFrame:
        """Re-rank the documents of each query with MMR.

        Args:
            inp: A result frame that also provides a ``doc_vec`` column. The ``score`` column is read as the
                initial relevance of each document.

        Returns:
            The re-ranked results, grouped by ``qid``. ``rank`` is the 0-based MMR selection position and ``score``
            is its negation (0, -1, -2, ...), so sorting by score reproduces the MMR order. The ``doc_vec`` column
            is removed when ``drop_doc_vec`` is set.
        """
        pta.validate.result_frame(inp, extra_columns=['doc_vec'])
        out = []

        it = inp.groupby('qid')
        if self.verbose:
            it = pt.tqdm(it, unit='q', desc=repr(self))

        for _qid, frame in it:
            new_idxs = self._mmr_order(frame['score'].values, np.stack(frame['doc_vec']))
            new_frame = frame.iloc[new_idxs].reset_index(drop=True).assign(
                score=-np.arange(len(new_idxs), dtype=float),
                rank=np.arange(len(new_idxs)),
            )
            if self.drop_doc_vec:
                new_frame = new_frame.drop(columns='doc_vec')
            out.append(new_frame)

        if not out:
            # pd.concat raises ValueError when given no frames, so build the empty result explicitly
            # with the same columns a non-empty result would have.
            empty = inp.iloc[:0].assign(score=np.empty(0, dtype=float), rank=np.empty(0, dtype=int))
            if self.drop_doc_vec:
                empty = empty.drop(columns='doc_vec')
            return empty.reset_index(drop=True)

        return pd.concat(out, ignore_index=True)

    def _mmr_order(self, scores: np.ndarray, dvec_matrix: np.ndarray) -> list:
        """Return document positions in greedy MMR selection order for a single query."""
        norms = np.linalg.norm(dvec_matrix, axis=1)[:, None]
        # An all-zero vector has no direction; leave it as zeros (similarity 0 to everything)
        # rather than dividing by zero and spreading NaN through the similarity matrix.
        norms[norms == 0] = 1.0
        dvec_matrix = dvec_matrix / norms
        dvec_sims = dvec_matrix @ dvec_matrix.T
        if self.norm_rel:
            scores = self._min_max(scores)
        if self.norm_sim:
            dvec_sims = self._min_max(dvec_sims)

        n_docs = scores.shape[0]
        # Highest similarity of each document to any already-selected document. It starts at 0, so
        # negative similarities to selected documents are treated as 0.
        marg_rels = np.zeros(n_docs, dtype=float)
        selected = np.zeros(n_docs, dtype=bool)
        new_idxs = []
        for _ in range(n_docs):
            mmr_scores = (self.Lambda * scores) - ((1 - self.Lambda) * marg_rels)
            # Exclude already-selected documents with an explicit mask. Marking them with an infinite
            # marginal similarity breaks for Lambda == 1, where 0 * inf is NaN and argmax would keep
            # returning the same document.
            mmr_scores[selected] = -np.inf
            idx = int(mmr_scores.argmax())
            new_idxs.append(idx)
            selected[idx] = True
            marg_rels = np.maximum(marg_rels, dvec_sims[idx])
        return new_idxs

    @staticmethod
    def _min_max(values: np.ndarray) -> np.ndarray:
        """Min-max scale ``values`` to [0, 1]; constant (or NaN-containing) input maps to all zeros."""
        lo, hi = values.min(), values.max()
        if hi > lo:
            return (values - lo) / (hi - lo)
        # Zero range: every value is equally relevant/similar, so avoid the 0/0 division.
        return np.zeros_like(values, dtype=float)

    __repr__ = pta.transformer_repr
3 changes: 2 additions & 1 deletion pyterrier_dr/flex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@
from pyterrier_dr.flex import ladr
from pyterrier_dr.flex import gar
from pyterrier_dr.flex import voyager_retr
from pyterrier_dr.flex import diversity

__all__ = ["FlexIndex", "IndexingMode", "np_retr", "torch_retr", "corpus_graph", "faiss_retr", "flatnav_retr", "scann_retr", "ladr", "gar", "voyager_retr"]
__all__ = ["FlexIndex", "IndexingMode", "np_retr", "torch_retr", "corpus_graph", "faiss_retr", "flatnav_retr", "scann_retr", "ladr", "gar", "voyager_retr", "diversity"]
6 changes: 5 additions & 1 deletion pyterrier_dr/flex/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,13 @@ def docnos(self) -> Lookup:

@property
def ILS(self) -> ir_measures.Measure:
    """Return an ILS (Intra-List Similarity) measure for this index.

    See :func:`pyterrier_dr.ILS` for more details.
    """
    return pyterrier_dr.ILS(self)

def __repr__(self):
    """Return ``FlexIndex(<path>)`` with the index path rendered as a quoted string."""
    quoted_path = repr(str(self.index_path))
    return 'FlexIndex(' + quoted_path + ')'


class FlexIndexer(pt.Indexer):
def __init__(self, index: FlexIndex, mode: Union[IndexingMode, str] = IndexingMode.create):
self._index = index
Expand Down
22 changes: 22 additions & 0 deletions pyterrier_dr/flex/diversity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import pyterrier as pt
import pyterrier_dr
from . import FlexIndex


def _mmr(self, *, Lambda: float = 0.5, norm_rel: bool = False, norm_sim: bool = False, drop_doc_vec: bool = True, verbose: bool = False) -> pt.Transformer:
    """Build an MMR (Maximal Marginal Relevance) re-ranking pipeline over this index.

    The returned pipeline first applies this index's vector loader and then passes the results to
    :class:`~pyterrier_dr.MmrScorer`, which performs the re-ranking. See :class:`~pyterrier_dr.MmrScorer` for more
    details on MMR.

    Args:
        Lambda: The balance parameter between relevance and diversity (default: 0.5)
        norm_rel: Whether to normalize relevance scores to [0, 1] (default: False)
        norm_sim: Whether to normalize similarity scores to [0, 1] (default: False)
        drop_doc_vec: Whether to drop the 'doc_vec' column after re-ranking (default: True)
        verbose: Whether to display verbose output (e.g., progress bars) (default: False)

    .. cite.dblp:: conf/sigir/CarbonellG98
    """
    loader = self.vec_loader()
    scorer = pyterrier_dr.MmrScorer(
        Lambda=Lambda,
        norm_rel=norm_rel,
        norm_sim=norm_sim,
        drop_doc_vec=drop_doc_vec,
        verbose=verbose,
    )
    return loader >> scorer


# Expose the factory as ``FlexIndex.mmr``.
FlexIndex.mmr = _mmr
24 changes: 19 additions & 5 deletions pyterrier_dr/pt_docs/diversity.rst
Original file line number Diff line number Diff line change
@@ -1,7 +1,19 @@
Diversity
=======================================================

``pyterrier-dr`` provides a diversity evaluation measure, :func:`~pyterrier_dr.ILS` (Intra-List Similarity),
Search Result Diversification
-------------------------------------------------------

``pyterrier-dr`` provides one diversification algorithm, :class:`~pyterrier_dr.MmrScorer` (Maximal Marginal Relevance).
The transformer works over input dataframes that contain the initial relevance scores (``score``) and the dense
vectors (``doc_vec``) of the documents. You can also use :meth:`~pyterrier_dr.FlexIndex.mmr` to first load vectors
from an index and then apply MMR.

.. autoclass:: pyterrier_dr.MmrScorer

Diversity Evaluation
-------------------------------------------------------

``pyterrier-dr`` provides one diversity evaluation measure, :func:`~pyterrier_dr.ILS` (Intra-List Similarity),
which can be used to evaluate the diversity of search results based on the dense vectors of a :class:`~pyterrier_dr.FlexIndex`.

This measure can be used alongside PyTerrier's built-in evaluation measures in a :func:`pyterrier.Experiment`.
Expand All @@ -22,15 +34,17 @@ This measure can be used alongside PyTerrier's built-in evaluation measures in a
pt.Experiment(
[
bm25,
model >> index,
model >> index.retriever(),
model >> index.retriever() >> index.mmr(),
],
dataset.get_topics(),
dataset.get_qrels(),
[nDCG@10, R(rel=2)@1000, index.ILS@10, index.ILS@1000]
)
# name nDCG@10 R(rel=2)@1000 ILS@10 ILS@1000
# BM25 0.498902 0.755495 0.852248 0.754691
# TAS-B 0.716068 0.841756 0.889112 0.775415
# name nDCG@10 R(rel=2)@1000 ILS@10 ILS@1000
# BM25 0.498 0.755 0.852 0.754
# TasB 0.716 0.841 0.889 0.775
# TasB w/ MMR 0.714 0.841 0.888 0.775
.. autofunction:: pyterrier_dr.ILS
.. autofunction:: pyterrier_dr.ils
6 changes: 6 additions & 0 deletions pyterrier_dr/pt_docs/indexing-retrieval.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ API Documentation
.. automethod:: gar
.. automethod:: ladr_proactive
.. automethod:: ladr_adaptive
.. automethod:: mmr

Evaluation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. autoproperty:: ILS

Index Data Access
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
31 changes: 31 additions & 0 deletions tests/test_mmr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import unittest
import numpy as np
import pandas as pd
from pyterrier_dr import MmrScorer


class TestMmr(unittest.TestCase):
    def test_mmr(self):
        """MMR with default settings re-orders each query's documents and rewrites score/rank."""
        mmr = MmrScorer()
        input_rows = [
            # q0: d1 and d2 tie on relevance; the expected output places d2 ahead of d1
            ['q0', 'd0', 1.0, np.array([0, 1, 0])],
            ['q0', 'd1', 0.5, np.array([0, 1, 1])],
            ['q0', 'd2', 0.5, np.array([1, 1, 1])],
            ['q0', 'd3', 0.1, np.array([1, 1, 0])],
            # q1: a single document
            ['q1', 'd0', 0.6, np.array([0, 1, 0])],
            # q2: two documents
            ['q2', 'd0', 0.4, np.array([0, 1, 0])],
            ['q2', 'd1', 0.3, np.array([0, 1, 1])],
        ]
        inp = pd.DataFrame(input_rows, columns=['qid', 'docno', 'score', 'doc_vec'])
        results = mmr(inp)

        expected_rows = [
            ['q0', 'd0', 0.0, 0],
            ['q0', 'd2', -1.0, 1],
            ['q0', 'd1', -2.0, 2],
            ['q0', 'd3', -3.0, 3],
            ['q1', 'd0', 0.0, 0],
            ['q2', 'd0', 0.0, 0],
            ['q2', 'd1', -1.0, 1],
        ]
        expected = pd.DataFrame(expected_rows, columns=['qid', 'docno', 'score', 'rank'])
        pd.testing.assert_frame_equal(results, expected)


# Run the tests in this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 422c98d

Please sign in to comment.