From 70307bfcbf5de3da44dfb40c8a6387588fb8f835 Mon Sep 17 00:00:00 2001
From: Sean MacAvaney
Date: Sat, 23 Nov 2024 21:18:55 +0000
Subject: [PATCH] cleanup

---
 pyterrier_dr/__init__.py |  2 +-
 pyterrier_dr/prf.py      | 22 +++++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/pyterrier_dr/__init__.py b/pyterrier_dr/__init__.py
index 643a47d..58e0917 100644
--- a/pyterrier_dr/__init__.py
+++ b/pyterrier_dr/__init__.py
@@ -10,4 +10,4 @@
 from .electra import ElectraScorer
 from .bge_m3 import BGEM3, BGEM3QueryEncoder, BGEM3DocEncoder
 from .cde import CDE, CDECache
-from .prf import average_prf, vector_prf
\ No newline at end of file
+from .prf import average_prf, vector_prf
diff --git a/pyterrier_dr/prf.py b/pyterrier_dr/prf.py
index 918dba3..78371ce 100644
--- a/pyterrier_dr/prf.py
+++ b/pyterrier_dr/prf.py
@@ -1,4 +1,10 @@
-def vector_prf(alpha : float = 1, beta : float = 0.2, k : int = 3):
+import numpy as np
+import pandas as pd
+import pyterrier as pt
+import pyterrier_alpha as pta
+
+
+def vector_prf(*, alpha : float = 1, beta : float = 0.2, k : int = 3):
     """
     Performs a Rocchio-esque PRF by linearly combining the query_vec column with
     the doc_vec column of the top k documents.
@@ -8,7 +14,7 @@
      - beta: weight of doc_vec
      - k: number of pseudo-relevant feedback documents
 
-    Expected Input: ['qid', 'query_vec', 'doc_vec']
+    Expected Input: ['qid', 'query', 'query_vec', 'doc_vec']
     Output: ['qid', 'query', 'query_vec']
 
     Example::
@@ -17,10 +23,8 @@
     Reference: Hang Li, Ahmed Mourad, Shengyao Zhuang, Bevan Koopman, Guido Zuccon. [Pseudo Relevance Feedback with Deep Language Models and Dense Retrievers: Successes and Pitfalls](https://arxiv.org/pdf/2108.11044.pdf)
     """
 
-    import numpy as np, pandas as pd, pyterrier as pt, pyterrier_alpha as pta
-
     def _vector_prf(inp):
-        pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])
+        pta.validate.result_frame(inp, extra_columns=['query', 'query_vec', 'doc_vec'])
 
         # get the docvectors for the top k docs
         doc_vecs = np.stack([ row.doc_vec for row in inp.head(k).itertuples() ])
@@ -31,14 +35,12 @@ def _vector_prf(inp):
 
     return pt.apply.by_query(_vector_prf, add_ranks=False)
 
-def average_prf(k : int = 3):
+def average_prf(*, k : int = 3):
     """
     Performs Average PRF (as described by Li et al.) by averaging the query_vec column with
     the doc_vec column of the top k documents.
 
     Arguments:
-     - alpha: weight of original query_vec
-     - beta: weight of doc_vec
      - k: number of pseudo-relevant feedback documents
 
     Expected Input: ['qid', 'query_vec', 'doc_vec']
@@ -51,8 +53,6 @@ def average_prf(k : int = 3):
     Reference: Hang Li, Ahmed Mourad, Shengyao Zhuang, Bevan Koopman, Guido Zuccon. [Pseudo Relevance Feedback with Deep Language Models and Dense Retrievers: Successes and Pitfalls](https://arxiv.org/pdf/2108.11044.pdf)
     """
 
-    import numpy as np, pandas as pd, pyterrier as pt, pyterrier_alpha as pta
-
     def _average_prf(inp):
         pta.validate.result_frame(inp, extra_columns=['query_vec', 'doc_vec'])
 
@@ -63,4 +63,4 @@ def _average_prf(inp):
         # generate new query dataframe with 'qid', 'query', 'query_vec'
         return pd.DataFrame([[inp.iloc[0]['qid'], inp.iloc[0]['query'], query_vec]], columns=['qid', 'query', 'query_vec'])
 
-    return pt.apply.by_query(_average_prf, add_ranks=False)
\ No newline at end of file
+    return pt.apply.by_query(_average_prf, add_ranks=False)
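
Usage sketch (not part of the patch): the snippet below builds a small result frame by hand with the columns the updated docstring expects ('qid', 'query', 'query_vec', 'doc_vec', plus 'docno'/'score'/'rank' so it passes result-frame validation) and applies the patched vector_prf transformer. The toy vectors, query text, and printed columns are illustrative assumptions, not taken from the patch::

    import numpy as np
    import pandas as pd
    from pyterrier_dr import vector_prf

    # three retrieved documents for one query, each carrying the query and doc vectors
    res = pd.DataFrame([
        ['q1', 'chemical reactions', 'd1', 0, 12.3, np.array([1.0, 0.0]), np.array([0.8, 0.2])],
        ['q1', 'chemical reactions', 'd2', 1, 11.9, np.array([1.0, 0.0]), np.array([0.6, 0.4])],
        ['q1', 'chemical reactions', 'd3', 2, 10.1, np.array([1.0, 0.0]), np.array([0.4, 0.6])],
    ], columns=['qid', 'query', 'docno', 'rank', 'score', 'query_vec', 'doc_vec'])

    # alpha weights the original query_vec, beta the top-k doc_vecs (k=3 here);
    # after this patch the arguments are keyword-only
    prf = vector_prf(alpha=1, beta=0.2, k=3)
    out = prf(res)

    # one row per query, with query_vec replaced by the Rocchio-style combination
    print(out[['qid', 'query', 'query_vec']])

average_prf(k=3) accepts the same kind of frame; per its docstring it instead replaces query_vec with the average of the query vector and the top-k document vectors.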