# metrics.py
from sklearn.metrics import f1_score
import collections
import numpy as np
from generation_metrics import Metric
from t5.evaluation import metrics

T0_METRICS = {
    "BLEU": metrics.bleu,
    "ROUGE": metrics.rouge,
    "Span Squad": metrics.span_squad,
    "Squad": metrics.squad,
    "Trivia QA": metrics.trivia_qa,
    "Accuracy": metrics.accuracy,
    "Spearman Correlation": metrics.spearman_corrcoef,
    "Other": metrics.accuracy
}


def _metric_max_over_ground_truths(metric_fn, ground_truths, prediction):
    """Computes the maximum of the metric over all ground truths."""
    return max(
        [metric_fn(ground_truth, prediction) for ground_truth in ground_truths]
    )


def _str_em(target, prediction):
    """Exact string match between a single target and prediction."""
    return target == prediction


def _str_f1(target, prediction):
    """Computes token F1 score for a single target and prediction."""
    prediction_tokens = prediction.split()
    target_tokens = target.split()
    common = (collections.Counter(prediction_tokens) &
              collections.Counter(target_tokens))
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(target_tokens)
    f1 = 100 * (2 * precision * recall) / (precision + recall)
    return f1
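
# Example: _str_f1("the cat sat", "the cat") has 2 overlapping tokens, giving
# precision = 2/2 and recall = 2/3, so the returned token F1 is 80.0.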


def acc_single_ref(all_preds, all_labels):
    '''
    all_preds: List[Int]
    all_labels: List[Int]
    '''
    acc = 100 * np.mean([int(p == l) for p, l in zip(all_preds, all_labels)])
    return {"acc": acc}


def acc_multi_ref(all_preds, all_labels):
    '''
    all_preds: List[Int]
    all_labels: List[List[Int]]
    '''
    acc = 100 * np.mean([int(any([p == ll for ll in l])) for p, l in zip(all_preds, all_labels)])
    return {"acc": acc}


def f1_single_ref(all_preds, all_labels):
    '''
    all_preds: List[Int]
    all_labels: List[Int]
    '''
    f1_macro = 100 * f1_score(all_labels, all_preds, average="macro")
    return {"f1": f1_macro}


def str_em_single_ref(all_preds, all_labels):
    '''
    all_preds: List[str]
    all_labels: List[str]
    '''
    em = 100 * np.mean([_str_em(p, l) for p, l in zip(all_preds, all_labels)])
    return {"em": em}


def str_em_multi_ref(all_preds, all_labels):
    '''
    all_preds: List[str]
    all_labels: List[List[str]]
    '''
    em = 100 * np.mean([_metric_max_over_ground_truths(_str_em, l, p) for p, l in zip(all_preds, all_labels)])
    return {"em": em}


def str_f1_single_ref(all_preds, all_labels):
    '''
    all_preds: List[str]
    all_labels: List[str]
    '''
    f1 = 100 * np.mean([_str_f1(p, l) for p, l in zip(all_preds, all_labels)])
    return {"f1": f1}


def str_f1_multi_ref(all_preds, all_labels):
    '''
    all_preds: List[str]
    all_labels: List[List[str]]
    '''
    f1 = 100 * np.mean([_metric_max_over_ground_truths(_str_f1, l, p) for p, l in zip(all_preds, all_labels)])
    return {"f1": f1}


def rouge_single_ref(all_preds, all_labels):
    """Computes generation metrics via generation_metrics.Metric, one reference per example."""
    metric = Metric()
    for l, p in zip(all_labels, all_preds):
        metric.forword([l.split()], p.split())
    metric_res, *_ = metric.close()
    return metric_res


def rouge_multi_ref(all_preds, all_labels):
    """Computes generation metrics via generation_metrics.Metric, multiple references per example."""
    metric = Metric()
    for l, p in zip(all_labels, all_preds):
        metric.forword([ll.split() for ll in l], p.split())
    metric_res, *_ = metric.close()
    return metric_res


# Multi-rouge/multi-bleu. When there are multiple references, we want to get the
# rouge score that is highest. According to the authors, this is how it was done
# in the GEM paper.
# Source: https://github.com/google/BIG-bench/blob/main/bigbench/api/task_metrics.py
def rouge_fn(targets, predictions):
    """Computes ROUGE by taking the max ROUGE-N per reference and N."""
    # Following strategy from https://www.aclweb.org/anthology/W04-1013/.
    # Identify best reference per response and ROUGE type.
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    max_references = {rouge_type: [] for rouge_type in rouge_types}
    for targ_for_resp, resp in zip(targets, predictions):
        # Compute individual scores per example/ref pair.
        resp_scores = [metrics.rouge([t], [resp]) for t in targ_for_resp]
        # Find best scoring references for generated output and ROUGE type.
        for rouge_type in rouge_types:
            best_score_index = max(range(len(resp_scores)), key=lambda x: resp_scores[x][rouge_type])
            best_ref = targ_for_resp[best_score_index]
            # Add the reference to the new reference list.
            max_references[rouge_type].append(best_ref)
    # Compute metric for each of the reference lists for a ref type.
    results = {}
    for rouge_type in rouge_types:
        results[rouge_type] = metrics.rouge(max_references[rouge_type], predictions)[rouge_type]
    return results
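

if __name__ == "__main__":
    # Minimal smoke test on toy inputs; illustrative only, and it assumes the
    # t5 and generation_metrics imports at the top of the module resolve.
    preds = ["the cat sat", "a dog"]
    labels = [["the cat sat on the mat"], ["a dog", "the dog"]]
    print(str_em_multi_ref(preds, labels))       # {'em': 50.0}
    print(str_f1_multi_ref(preds, labels))       # best-reference token F1 per example, averaged
    print(acc_single_ref([1, 0, 2], [1, 1, 2]))  # {'acc': 66.66...}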