evaluate.py
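"""
Evaluation script for an extractive question-answering model.

Loads n-best span predictions ("nbest_predictions_.json") produced by a trained
model, compares them against ground-truth answers in a SQuAD-style test JSON,
and reports AUPR as well as precision at 80% and 90% recall. A prediction
matches an answer when their word-level Jaccard similarity is at least
IOU_THRESH (0.5); for "Parties" questions, containing the answer as a
substring also counts as a match.
"""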
import os
import json
import pandas as pd
import numpy as np
from sklearn import metrics
IOU_THRESH = 0.5
def get_questions_from_csv():
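    """
    Read category_descriptions.csv and return a dict mapping each category
    name (title-cased) to its description.
    """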
    df = pd.read_csv("category_descriptions.csv")
    q_dict = {}
    for i in range(df.shape[0]):
        category = df.iloc[i, 0].split("Category: ")[1]
        description = df.iloc[i, 1].split("Description: ")[1]
        q_dict[category.title()] = description
    return q_dict

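# Category -> description mapping, loaded at import time (not used elsewhere in this script).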
qtype_dict = get_questions_from_csv()
def load_json(path):
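    """Load a JSON file from `path` and return the parsed object."""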
    with open(path, "r") as f:
        data = json.load(f)
    return data

def get_preds(nbest_preds_dict, conf=None):
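    """
    Convert n-best predictions into a dict mapping each question id to the list
    of non-empty predicted answer texts whose probability is greater than `conf`.
    """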
    results = {}
    for question_id in nbest_preds_dict:
        list_of_pred_dicts = nbest_preds_dict[question_id]
        preds = {}
        for pred_dict in list_of_pred_dicts:
            text = pred_dict["text"]
            prob = pred_dict["probability"]
            if text != "":  # don't count empty string as a prediction
                preds[text] = prob
        preds_list = [pred for pred in preds.keys() if preds[pred] > conf]
        results[question_id] = preds_list
    return results

def get_answers(test_json_dict):
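    """
    Extract ground-truth answers from a SQuAD-style test JSON dict, returning a
    dict mapping each question id to its list of answer texts.
    """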
    results = {}
    data = test_json_dict["data"]
    for contract in data:
        for para in contract["paragraphs"]:
            qas = para["qas"]
            for qa in qas:
                id = qa["id"]
                answers = qa["answers"]
                answers = [answers[i]["text"] for i in range(len(answers))]
                results[id] = answers
    return results

def get_jaccard(gt, pred):
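    """
    Compute the word-level Jaccard similarity between a ground-truth answer and
    a prediction after stripping basic punctuation, lowercasing, and replacing
    slashes with spaces.
    """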
    remove_tokens = [".", ",", ";", ":"]
    for token in remove_tokens:
        gt = gt.replace(token, "")
        pred = pred.replace(token, "")
    gt = gt.lower()
    pred = pred.lower()
    gt = gt.replace("/", " ")
    pred = pred.replace("/", " ")
    gt_words = set(gt.split(" "))
    pred_words = set(pred.split(" "))
    intersection = gt_words.intersection(pred_words)
    union = gt_words.union(pred_words)
    jaccard = len(intersection) / len(union)
    return jaccard

def compute_precision_recall(gt_dict, preds_dict, category=None):
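    """
    Compute precision and recall over all questions (optionally restricted to
    ids containing `category`). A prediction matches an answer when their
    Jaccard similarity is at least IOU_THRESH; for "Parties" questions, a
    prediction that contains the answer as a substring also counts. Returns
    np.nan for precision or recall when the corresponding denominator is zero.
    """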
    tp, fp, fn = 0, 0, 0
    for key in gt_dict:
        if category and category not in key:
            continue
        substr_ok = "Parties" in key
        answers = gt_dict[key]
        preds = preds_dict[key]
        # first check if answers is empty
        if len(answers) == 0:
            if len(preds) > 0:
                fp += len(preds)  # false positive for each one
        else:
            for ans in answers:
                assert len(ans) > 0
                # check if there is a match
                match_found = False
                for pred in preds:
                    if substr_ok:
                        is_match = get_jaccard(ans, pred) >= IOU_THRESH or ans in pred
                    else:
                        is_match = get_jaccard(ans, pred) >= IOU_THRESH
                    if is_match:
                        match_found = True
                if match_found:
                    tp += 1
                else:
                    fn += 1
            # now also get any fps by looping through preds
            for pred in preds:
                # Check if there's a match. If so, don't count it; if there's no match, it's a false positive.
                # (Note: true positives are counted in the loop over answers above, rather than here, so that
                # multiple predictions matched to the same answer are not double counted.)
                match_found = False
                for ans in answers:
                    assert len(ans) > 0
                    if substr_ok:
                        is_match = get_jaccard(ans, pred) >= IOU_THRESH or ans in pred
                    else:
                        is_match = get_jaccard(ans, pred) >= IOU_THRESH
                    if is_match:
                        match_found = True
                if not match_found:
                    fp += 1
    precision = tp / (tp + fp) if tp + fp > 0 else np.nan
    recall = tp / (tp + fn) if tp + fn > 0 else np.nan
    return precision, recall

def process_precisions(precision):
    """
    Processes precisions to ensure that precision and recall don't both get worse.
    Assumes the list precision is sorted in order of recalls.
    """
    precision_best = precision[::-1]
    for i in range(1, len(precision_best)):
        precision_best[i] = max(precision_best[i-1], precision_best[i])
    precision = precision_best[::-1]
    return precision

def get_prec_at_recall(precisions, recalls, confs, recall_thresh=0.9):
    """
    Assumes recalls are sorted in increasing order.
    """
    processed_precisions = process_precisions(precisions)
    prec_at_recall = 0
    for prec, recall, conf in zip(processed_precisions, recalls, confs):
        if recall >= recall_thresh:
            prec_at_recall = prec
            break
    return prec_at_recall, conf

def get_precisions_recalls(pred_dict, gt_dict, category=None):
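    """
    Sweep the confidence threshold from 0.99 down to 0 and compute
    precision/recall at each setting. Returns the precision and recall lists
    (seeded with the precision=1, recall=0 point) and the list of thresholds.
    """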
    precisions = [1]
    recalls = [0]
    confs = []
    for conf in list(np.arange(0.99, 0, -0.01)) + [0.001, 0]:
        conf_thresh_pred_dict = get_preds(pred_dict, conf)
        prec, recall = compute_precision_recall(gt_dict, conf_thresh_pred_dict, category=category)
        precisions.append(prec)
        recalls.append(recall)
        confs.append(conf)
    return precisions, recalls, confs

def get_aupr(precisions, recalls):
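    """
    Compute the area under the precision-recall curve using the interpolated
    (monotonically non-increasing) precisions. Returns 0 if the result is NaN.
    """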
    processed_precisions = process_precisions(precisions)
    aupr = metrics.auc(recalls, processed_precisions)
    if np.isnan(aupr):
        return 0
    return aupr

def get_results(model_path, gt_dict, verbose=False):
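    """
    Evaluate the model whose n-best predictions live under `model_path` against
    the ground-truth `gt_dict`. Returns a dict with the model name, AUPR, and
    precision at 80% and 90% recall.
    """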
    predictions_path = os.path.join(model_path, "nbest_predictions_.json")
    name = model_path.split("/")[-1]
    pred_dict = load_json(predictions_path)
    assert sorted(list(pred_dict.keys())) == sorted(list(gt_dict.keys()))
    precisions, recalls, confs = get_precisions_recalls(pred_dict, gt_dict)
    prec_at_90_recall, _ = get_prec_at_recall(precisions, recalls, confs, recall_thresh=0.9)
    prec_at_80_recall, _ = get_prec_at_recall(precisions, recalls, confs, recall_thresh=0.8)
    aupr = get_aupr(precisions, recalls)
    if verbose:
        print("AUPR: {:.3f}, Precision at 80% Recall: {:.3f}, Precision at 90% Recall: {:.3f}".format(aupr, prec_at_80_recall, prec_at_90_recall))
    # collect the metrics in a dict and return
    results = {"name": name, "aupr": aupr, "prec_at_80_recall": prec_at_80_recall, "prec_at_90_recall": prec_at_90_recall}
    return results

if __name__ == "__main__":
    test_json_path = "./data/test.json"
    model_path = "./trained_models/roberta-base"
    save_dir = "./results"
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    gt_dict = load_json(test_json_path)
    gt_dict = get_answers(gt_dict)
    results = get_results(model_path, gt_dict, verbose=True)

    save_path = os.path.join(save_dir, "{}.json".format(model_path.split("/")[-1]))
    with open(save_path, "w") as f:
        json.dump(results, f, indent=2)