-
Notifications
You must be signed in to change notification settings - Fork 0
/
compatibility.py
139 lines (115 loc) · 4.6 KB
/
compatibility.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python3
"""Maximum similarity to an ideal ranking from files in TREC format
This code implements an evaluation metric called "compatibility", which
was developed and explored over three papers. Start with the first
(i.e. most recent).
1) Charles L. A. Clarke, Alexandra Vtyurina, and Mark D. Smucker. 2020.
Assessing top-k preferences
Under review. See: https://arxiv.org/abs/2007.11682
2) Charles L. A. Clarke, Mark D. Smucker, and Alexandra Vtyurina. 2020.
Offline evaluation by maximum similarity to an ideal ranking.
29th ACM Conference on Information and Knowledge Management.
3) Charles L. A. Clarke, Alexandra Vtyurina, and Mark D. Smucker. 2020.
Offline evaluation without gain.
ACM SIGIR International Conference on the Theory of Information Retrieval.
"""
import argparse
import sys
# Default persistence of 0.95, which is roughly equivalent to NSCG@20.
# Can be changed on the command line.
P = 0.95
# An additional normalization step was introduced in paper #1 (above)
# to handle short, truncated ideal results. I don't recommend changing
# it, so it's not an command line argument, but setting it to False
# is required to exactly reproduce the numbers in papers #2 and #3,
# as well as the un-normalized numbers in paper #1.
NORMALIZE = True
# Depth for RBO computation. There's probably no need to ever play with this.
DEPTH = 1000
def rbo(run, ideal, p):
run_set = set()
ideal_set = set()
score = 0.0
normalizer = 0.0
weight = 1.0
for i in range(DEPTH):
if i < len(run):
run_set.add(run[i])
if i < len(ideal):
ideal_set.add(ideal[i])
score += weight*len(ideal_set.intersection(run_set))/(i + 1)
normalizer += weight
weight *= p
return score/normalizer
def idealize(run, ideal, qrels):
rank = {}
for i in range(len(run)):
rank[run[i]] = i
ideal.sort(key=lambda docno: rank[docno] if docno in rank else len(run))
ideal.sort(key=lambda docno: qrels[docno], reverse=True)
return ideal
def main():
parser = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('-p', type=float, default=P, help='persistence')
parser.add_argument('qrels', type=str, help='TREC-style qrels')
parser.add_argument('run', type=str, help='TREC-style run')
args = parser.parse_args()
if args.p < 0.01 or args.p > 0.99:
print('Value of p = ' + str(args.p) + ' out of range [0.01,0.99]',
file=sys.stderr)
sys.exit(0)
ideal = {}
qrels = {}
with open(args.qrels) as qrelsf:
for line in qrelsf:
(topic, q0, docno, qrel) = line.rstrip().split()
qrel = float(qrel)
if qrel > 0.0:
if topic not in qrels:
ideal[topic] = []
qrels[topic] = {}
if docno in qrels[topic]:
if qrel > qrels[topic][docno]:
qrels[topic][docno] = qrel
else:
ideal[topic].append(docno)
qrels[topic][docno] = qrel
runid = ""
run = {}
scores = {}
with open(args.run) as runf:
for line in runf:
(topic, q0, docno, rank, score, runid) = line.rstrip().split()
if topic not in run:
run[topic] = []
scores[topic] = {}
run[topic].append(docno)
scores[topic][docno] = float(score)
for topic in run:
run[topic].sort()
run[topic].sort(key=lambda docno: scores[topic][docno], reverse=True)
if topic in ideal:
ideal[topic] = idealize(run[topic], ideal[topic], qrels[topic])
#print('runid', 'topic', 'compatibility', sep=',')
count = 0
total = 0.0
for topic in run:
if topic in ideal:
score = rbo(run[topic], ideal[topic], args.p)
if NORMALIZE:
best = rbo(ideal[topic], ideal[topic], args.p)
if best > 0.0:
score /= best
else:
score = best
count += 1
total += score
#print(runid, topic, score, sep=',')
print('compatibility', topic, "{:.4f}".format(score), sep='\t')
if count > 0:
print('compatibility', 'all', "{:.4f}".format(total/count), sep='\t')
else:
print('compatibility', 'all', "{:.4f}".format(0.0), sep='\t')
if __name__ == "__main__":
main()