Skip to content

Commit

Permalink
drop_query_vec for gar and ladr
Browse files Browse the repository at this point in the history
  • Loading branch information
seanmacavaney committed Nov 23, 2024
1 parent ef7332e commit 779798b
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 42 deletions.
26 changes: 19 additions & 7 deletions pyterrier_dr/flex/gar.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,31 @@
import pandas as pd
import pyterrier as pt
import heapq
import pyterrier_alpha as pta
from . import FlexIndex
import numpy as np
from pyterrier_dr import SimFn

class FlexGar(pt.Transformer):
def __init__(self, flex_index, graph, score_fn, batch_size=128, num_results=1000):
def __init__(self, flex_index, graph, score_fn, batch_size=128, num_results=1000, drop_query_vec=False):
    """Initialize a GAR (graph-based adaptive retrieval) transformer.

    Args:
        flex_index: FlexIndex providing the docno lookup and dense document vectors.
        graph: corpus graph used to expand the candidate frontier during transform.
        score_fn: similarity function used to score query/document vectors.
        batch_size: number of documents scored per batch (default 128).
        num_results: maximum number of results returned per query (default 1000).
        drop_query_vec: if True, the ``query_vec`` column is omitted from the
            output frame; if False it is carried through (default False).
    """
    self.flex_index = flex_index
    # payload() yields (docnos, doc vectors, extra); the third element is unused here
    self.docnos, self.dvecs, _ = flex_index.payload()
    self.score_fn = score_fn
    self.graph = graph
    self.batch_size = batch_size
    self.num_results = num_results
    self.drop_query_vec = drop_query_vec

def transform(self, inp):
assert 'qid' in inp.columns and 'query_vec' in inp.columns and 'docno' in inp.columns and 'score' in inp.columns
all_results = []
pta.validate.result_frame(inp, extra_columns=['query_vec', 'score'])

qcols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']
if not self.drop_query_vec:
qcols += ['query_vec']
all_results = pta.DataFrameBuilder(qcols + ['docno', 'score', 'rank'])

for qid, inp in inp.groupby('qid'):
qvec = inp['query_vec'].iloc[0].reshape(1, -1)
qdata = {col: [inp[col].iloc[0]] for col in qcols}
initial_heap = list(zip(-inp['score'], self.docnos.inv[inp['docno']]))
heapq.heapify(initial_heap)
results = {}
Expand Down Expand Up @@ -47,10 +54,15 @@ def transform(self, inp):
for did, score in zip(batch, scores):
results[did] = score
heapq.heappush(frontier_heap, (-score, did))
for rank, (did, score) in enumerate(sorted(results.items(), key=lambda x: (-x[1], x[0]))):
all_results.append({'qid': qid, 'docno': self.docnos.fwd[did], 'score': score, 'rank': rank})
i += 1
return pd.DataFrame(all_results)
d, s = zip(*sorted(results.items(), key=lambda x: (-x[1], x[0])))
all_results.extend(dict(
**qdata,
docno=self.docnos.fwd[d],
score=s,
rank=np.arange(len(s)),
))
return all_results.to_df()



Expand Down
77 changes: 43 additions & 34 deletions pyterrier_dr/flex/ladr.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,34 @@
import pandas as pd
import itertools
import numpy as np
import pyterrier as pt
from .. import SimFn
import pyterrier_alpha as pta
from . import FlexIndex
import ir_datasets

logger = ir_datasets.log.easy()

class LadrPreemptive(pt.Transformer):
def __init__(self, flex_index, graph, dense_scorer, hops=1):
def __init__(self, flex_index, graph, dense_scorer, hops=1, drop_query_vec=False):
    """Initialize a proactive (preemptive) LADR transformer.

    Args:
        flex_index: FlexIndex providing docno lookups for re-scoring.
        graph: corpus graph whose edges expand the candidate document set.
        dense_scorer: scorer used to compute dense query-document scores.
        hops: number of graph hops used to expand candidates (default 1).
        drop_query_vec: if True, the ``query_vec`` column is omitted from the
            output frame (default False).
    """
    self.flex_index = flex_index
    self.graph = graph
    self.dense_scorer = dense_scorer
    self.hops = hops
    self.drop_query_vec = drop_query_vec

def transform(self, inp):
assert 'query_vec' in inp.columns and 'qid' in inp.columns
assert 'docno' in inp.columns
pta.validate.result_frame(inp, extra_columns=['query_vec'])
docnos, config = self.flex_index.payload(return_dvecs=False)

res = {'qid': [], 'docid': [], 'score': []}
qcols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']
if not self.drop_query_vec:
qcols += ['query_vec']
all_results = pta.DataFrameBuilder(qcols + ['docno', 'score', 'rank'])

it = iter(inp.groupby('qid'))
if self.flex_index.verbose:
it = logger.pbar(it)
for qid, df in it:
qdata = {col: [df[col].iloc[0]] for col in qcols}
docids = docnos.inv[df['docno'].values]
lx_docids = docids
ext_docids = [docids]
for _ in range(self.hops):
docids = self.graph.edges_data[docids].reshape(-1)
Expand All @@ -40,40 +42,46 @@ def transform(self, inp):
else:
idxs = np.arange(scores.shape[0])
docids, scores = ext_docids[idxs], scores[idxs]
res['qid'].extend(itertools.repeat(qid, len(docids)))
res['docid'].append(docids)
res['score'].append(scores)
res['docid'] = np.concatenate(res['docid'])
res['score'] = np.concatenate(res['score'])
res['docno'] = docnos.fwd[res['docid']]
res = pd.DataFrame(res)
res = pt.model.add_ranks(res)
return res
idxs = np.argsort(-scores)
docids, scores = docids[idxs], scores[idxs]
all_results.extend(dict(
**qdata,
docno=docnos.fwd[docids],
score=scores,
rank=np.arange(len(scores)),
))
return all_results.to_df()


def _pre_ladr(self, k=16, hops=1, dense_scorer=None):
def _pre_ladr(self, k=16, hops=1, dense_scorer=None, drop_query_vec=False):
    """Construct a proactive (preemptive) LADR transformer over this index.

    Args:
        k: number of neighbours for the corpus graph to build, or an
            already-built corpus graph object (any non-int is used as-is).
        hops: number of graph hops used to expand the candidate set.
        dense_scorer: scorer for dense re-scoring; defaults to ``self.scorer()``.
        drop_query_vec: if True, drop the ``query_vec`` column from the output.

    Returns:
        A ``LadrPreemptive`` transformer bound to this index.
    """
    # An int k means "build/load a k-NN corpus graph"; otherwise assume a graph was passed.
    graph = self.corpus_graph(k) if isinstance(k, int) else k
    return LadrPreemptive(self, graph, hops=hops, dense_scorer=dense_scorer or self.scorer(), drop_query_vec=drop_query_vec)
FlexIndex.ladr = _pre_ladr  # TODO: remove this alias later
FlexIndex.pre_ladr = _pre_ladr

class LadrAdaptive(pt.Transformer):
def __init__(self, flex_index, graph, dense_scorer, depth=100, max_hops=None):
def __init__(self, flex_index, graph, dense_scorer, depth=100, max_hops=None, drop_query_vec=False):
    """Initialize an adaptive LADR transformer.

    Args:
        flex_index: FlexIndex providing docno lookups for re-scoring.
        graph: corpus graph used to iteratively expand the candidate set.
        dense_scorer: scorer used to compute dense query-document scores.
        depth: convergence depth for the adaptive expansion (default 100).
        max_hops: optional cap on the number of expansion iterations;
            None means no explicit cap (default None).
        drop_query_vec: if True, the ``query_vec`` column is omitted from the
            output frame (default False).
    """
    self.flex_index = flex_index
    self.graph = graph
    self.dense_scorer = dense_scorer
    self.depth = depth
    self.max_hops = max_hops
    self.drop_query_vec = drop_query_vec

def transform(self, inp):
assert 'query_vec' in inp.columns and 'qid' in inp.columns
assert 'docno' in inp.columns
pta.validate.result_frame(inp, extra_columns=['query_vec'])
docnos, config = self.flex_index.payload(return_dvecs=False)

res = {'qid': [], 'docid': [], 'score': []}
qcols = [col for col in inp.columns if col.startswith('q') and col != 'query_vec']
if not self.drop_query_vec:
qcols += ['query_vec']
all_results = pta.DataFrameBuilder(qcols + ['docno', 'score', 'rank'])

it = iter(inp.groupby('qid'))
if self.flex_index.verbose:
it = logger.pbar(it)
for qid, df in it:
qdata = {col: [df[col].iloc[0]] for col in qcols}
query_vecs = df['query_vec'].iloc[0].reshape(1, -1)
docids = np.unique(docnos.inv[df['docno'].values])
scores = self.dense_scorer.score(query_vecs, docids).reshape(-1)
Expand All @@ -98,17 +106,18 @@ def transform(self, inp):
idxs = np.argpartition(scores, -self.flex_index.num_results)[-self.flex_index.num_results:]
else:
idxs = np.arange(scores.shape[0])
res['qid'].extend(itertools.repeat(qid, len(idxs)))
res['docid'].append(docids[idxs])
res['score'].append(scores[idxs])
res['docid'] = np.concatenate(res['docid'])
res['score'] = np.concatenate(res['score'])
res['docno'] = docnos.fwd[res['docid']]
res = pd.DataFrame(res)
res = pt.model.add_ranks(res)
return res
docids, scores = docids[idxs], scores[idxs]
idxs = np.argsort(-scores)
docids, scores = docids[idxs], scores[idxs]
all_results.extend(dict(
**qdata,
docno=docnos.fwd[docids],
score=scores,
rank=np.arange(len(scores)),
))
return all_results.to_df()

def _ada_ladr(self, k=16, dense_scorer=None, depth=100, max_hops=None):
def _ada_ladr(self, k=16, dense_scorer=None, depth=100, max_hops=None, drop_query_vec=False):
    """Construct an adaptive LADR transformer over this index.

    Args:
        k: number of neighbours for the corpus graph to build, or an
            already-built corpus graph object (any non-int is used as-is).
        dense_scorer: scorer for dense re-scoring; defaults to ``self.scorer()``.
        depth: convergence depth for the adaptive expansion.
        max_hops: optional cap on expansion iterations; None means uncapped.
        drop_query_vec: if True, drop the ``query_vec`` column from the output.

    Returns:
        A ``LadrAdaptive`` transformer bound to this index.
    """
    # An int k means "build/load a k-NN corpus graph"; otherwise assume a graph was passed.
    graph = self.corpus_graph(k) if isinstance(k, int) else k
    return LadrAdaptive(self, graph, dense_scorer=dense_scorer or self.scorer(), depth=depth, max_hops=max_hops, drop_query_vec=drop_query_vec)
FlexIndex.ada_ladr = _ada_ladr
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
transformers
python-terrier>=0.11.0
pyterrier-alpha>=0.9.3
torch
numpy>=1.21.0, <2.0.0
npids
Expand Down

0 comments on commit 779798b

Please sign in to comment.