# evaluate.py
import os
import sys
from typing import List

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from bert_score import score
from sacrebleu.metrics import BLEU
from sentence_transformers import SentenceTransformer, util
GROUND_TRUTH_COLUMN = "Ground Truth"
PREDICTION_COLUMN = "Prediction"
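# The input CSV is expected to provide one row per sample with these two
# columns (a minimal sketch of the assumed schema; any extra columns are
# kept in the DataFrame and written back out by save_results):
#
#   Ground Truth,Prediction
#   "Who wrote it ? When ?","Who wrote it ?"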


def compute_corpus_bleu(ground_truths: List[str], predictions: List[str]):
    """Corpus-level BLEU-1..4 via NLTK, using whitespace tokenization."""
    assert len(ground_truths) == len(predictions)
    list_of_references, list_of_hypotheses = [], []
    for target, pred in zip(ground_truths, predictions):
        # Each hypothesis is scored against exactly one reference.
        list_of_references.append([target.strip().split()])
        list_of_hypotheses.append(pred.strip().split())
    # One weight tuple per n-gram order, so corpus_bleu returns a list
    # [BLEU-1, BLEU-2, BLEU-3, BLEU-4] instead of a single score.
    return nltk.translate.bleu_score.corpus_bleu(
        list_of_references, list_of_hypotheses,
        weights=[(1.0, 0.0, 0.0, 0.0),
                 (1. / 2., 1. / 2.),
                 (1. / 3., 1. / 3., 1. / 3.),
                 (1. / 4., 1. / 4., 1. / 4., 1. / 4.)])
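

# A minimal sanity check for compute_corpus_bleu (assumed inputs, not from
# the original repo): a prediction identical to its reference should score
# 1.0 on all four n-gram orders, provided the sentence has at least four
# tokens. Kept commented out so importing this module has no side effects.
#
#   b1, b2, b3, b4 = compute_corpus_bleu(
#       ["how are you today ?"], ["how are you today ?"])
#   # b1 == b2 == b3 == b4 == 1.0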


def compute_bert_score(ground_truths: List[str], predictions: List[str]):
    """Mean and per-pair cosine similarity of sentence embeddings.

    Note: despite the name, this is not the BERTScore metric (that is
    computed in main() via bert_score.score); it is the cosine similarity
    of SentenceTransformer embeddings.
    """
    assert len(ground_truths) == len(predictions)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    gt_embeddings = model.encode(ground_truths, convert_to_tensor=True)
    pred_embeddings = model.encode(predictions, convert_to_tensor=True)
    cosine_score = [util.cos_sim(gt_embed, pred_embed).item()
                    for gt_embed, pred_embed in zip(gt_embeddings, pred_embeddings)]
    return (sum(cosine_score) / len(ground_truths)), cosine_score
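

# Usage sketch (assumed inputs): one call yields both the aggregate score
# and a per-sample list, so a summary metric and a DataFrame column come
# from the same pass over the data.
#
#   mean_sim, per_pair = compute_bert_score(all_gt, all_pred)
#   # mean_sim is a float; per_pair is a list of len(all_gt) floats.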


def simple_count_number_of_questions(text: str) -> int:
    """Approximate the number of questions by counting '?' characters."""
    return text.count("?")


def save_results(df: pd.DataFrame, path: str):
    filename = os.path.join(path, "evaluation.csv")
    df.reset_index(inplace=True)
    df.to_csv(filename)
    print(f"File saved at: {filename}")


def main(path):
    df = pd.read_csv(path)
    all_gt = list(df[GROUND_TRUTH_COLUMN])
    all_pred = list(df[PREDICTION_COLUMN])

    # Sentence-level BLEU-1..4 for every ground-truth/prediction pair.
    sentence_bleu = []
    for sample_gt, sample_pred in zip(all_gt, all_pred):
        reference = sample_gt.strip().split()
        hypothesis = sample_pred.strip().split()
        bleu_scores = nltk.translate.bleu_score.sentence_bleu(
            [reference], hypothesis,
            weights=[(1.0, 0.0, 0.0, 0.0),
                     (1. / 2., 1. / 2.),
                     (1. / 3., 1. / 3., 1. / 3.),
                     (1. / 4., 1. / 4., 1. / 4., 1. / 4.)])
        sentence_bleu.append(bleu_scores)
    df["s_bleu1"], df["s_bleu2"], df["s_bleu3"], df["s_bleu4"] = zip(*sentence_bleu)

    # Compare the number of questions in references vs. predictions.
    df["gt_questions_count"] = df[GROUND_TRUTH_COLUMN].apply(simple_count_number_of_questions)
    df["prediction_questions_count"] = df[PREDICTION_COLUMN].apply(simple_count_number_of_questions)
    correct = ((df["gt_questions_count"] - df["prediction_questions_count"]) == 0).sum()
    total = len(df)
    print(f"Correct question numbers prediction: {correct} out of {total} or {correct / total}")

    sentence_bleu = np.array(sentence_bleu)
    print(f"Overall sentence BLEU score: {np.average(sentence_bleu, axis=0) * 100}")
    corpus_bleu = compute_corpus_bleu(all_gt, all_pred)
    print(f"Overall corpus BLEU score: {np.array(corpus_bleu) * 100}")

    overall_bert_score, bert_score = compute_bert_score(all_gt, all_pred)
    df["bert_score"] = bert_score
    print(f"Overall BERT score: {overall_bert_score}")

    # Official BERTScore (candidates first, references second).
    P, R, F1 = score(all_pred, all_gt, lang="en",
                     model_type="microsoft/deberta-xlarge-mnli", verbose=False)
    print(f"System level F1 score: {F1.mean():.10f}")

    # SacreBLEU corpus score with a single reference stream.
    sacre_bleu = BLEU()
    sacrebleu_scores = sacre_bleu.corpus_score(all_pred, [all_gt]).format()
    print(f"Sacrebleu results: {sacrebleu_scores}")
    print(sacre_bleu.get_signature())

    save_results(df, os.path.dirname(path))

    # Diagnostic plots: question-count histograms, plus BLEU-4 and embedding
    # similarity broken down by the number of ground-truth questions.
    fig, axs = plt.subplots(4, 1, figsize=(15, 8))
    sns.histplot(data=df, x="gt_questions_count", label="Ground truth questions",
                 color="green", alpha=0.2, ax=axs[0])
    sns.histplot(data=df, x="prediction_questions_count", label="Prediction questions",
                 color="blue", alpha=0.2, ax=axs[0])
    sns.boxplot(data=df, x="gt_questions_count", y="s_bleu4", ax=axs[1])
    sns.histplot(data=df, x="bert_score", ax=axs[2])
    sns.boxplot(data=df, x="gt_questions_count", y="bert_score", ax=axs[3])
    axs[0].legend()
    plt.savefig(os.path.join(os.path.dirname(path), "distribution.png"))


if __name__ == "__main__":
    main(sys.argv[1])
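

# Example invocation (hypothetical path; the CSV must contain the
# "Ground Truth" and "Prediction" columns defined above):
#
#   python evaluate.py results/predictions.csv
#
# evaluation.csv and distribution.png are written to the input file's directory.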