eval_ceval.py

from typing import Literal

from ...llms import BaseLLM
from ..base_evaluator import BaseEvaluator
from .dataset import CEvalDataset
from .utils import get_subject_mapping

# One question-answer block; the trailing "答案" line means "Answer: ...".
QA_TEMPLATE = """
{question}
A. {choice_a}
B. {choice_b}
C. {choice_c}
D. {choice_d}
答案:{answer}
"""

# The header translates to: "The following are single-choice questions from a
# Chinese {discipline} exam. Please select the correct answer."
PROMPT_TEMPLATE = """以下是中国关于{discipline}考试的单项选择题,请选出其中的正确答案。
{qa_examples}
{qa_test}"""

# The eight subjects that make up the C-Eval "Hard" subset.
CEVAL_HARD_DISCIPLINES = ",".join(
    [
        "advanced_mathematics",
        "discrete_mathematics",
        "probability_and_statistics",
        "college_chemistry",
        "college_physics",
        "high_school_mathematics",
        "high_school_chemistry",
        "high_school_physics",
    ]
)


class CEvalEvaluator(BaseEvaluator):
    def __init__(
        self,
        model: BaseLLM,
        num_batches: int = 1,
        output_dir: str = "./output",
        disciplines: str = CEVAL_HARD_DISCIPLINES,
        split: Literal["test", "val", "dev"] = "val",
        num_shots: int = 2,
    ):
        super().__init__(
            model,
            num_batches,
            output_dir,
            disciplines=disciplines,
            split=split,
            num_shots=num_shots,
        )
        self.split = split

        # ─── Get Valid Disciplines ────────────────────────────────────

        self.all_disciplines = set(get_subject_mapping().keys())
        if disciplines is None:
            self.disciplines = self.all_disciplines
        else:
            self.disciplines = set(disciplines.split(",")) & self.all_disciplines

        # ─── Load Examples For Few-shot Learning ──────────────────────

        if num_shots > 0:
            ds = CEvalDataset(self.disciplines, split=self.split)
            self.discipline_examples = ds.load_as_dict_of_discipline(num_shots)
        else:
            self.discipline_examples = {}

    def set_generation_configs(self) -> None:
        new_configs = {"max_new_tokens": 16, "do_sample": False}
        self.model.update_generation_configs(new_configs)

    def load_batched_dataset(self) -> list[list[dict]]:
        dataset = CEvalDataset(self.disciplines, split=self.split)
        batches = dataset.to_batched(self.num_batches)
        return batches

    def qa_prompt(self, examples: list[dict]) -> str:
        prompt = "".join(
            QA_TEMPLATE.format(
                question=example["question"],
                choice_a=example["A"],
                choice_b=example["B"],
                choice_c=example["C"],
                choice_d=example["D"],
                answer=example["answer"],
            )
            for example in examples
        )
        return prompt

    def scoring(self, data_point: dict) -> dict:
        discipline = data_point["type"]
        query = PROMPT_TEMPLATE.format(
            discipline=get_subject_mapping()[discipline][1],  # Get the Chinese name
            qa_examples=self.qa_prompt(self.discipline_examples[discipline]),
            qa_test=self.qa_prompt([data_point]),
        )
        query = query.strip()[:-1]  # Remove the answer to be predicted
        response = self.model.safe_request(query)
        answer = response.strip().split("\n")[0].strip()  # Get the first line
        return {
            "metrics": {
                "correct": answer == data_point["answer"],
            },
            "log": {
                "answer": answer,
                "response": response,
                "query": query,
            },
            "valid": answer != "",
        }

    def compute_overall(self, results: list[dict]) -> dict:
        return {
            "accuracy": sum(result["metrics"]["correct"] for result in results)
            / len(results),
            "num": len(results),
        }
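

# ─── Usage Sketch ─────────────────────────────────────────────────────
# Minimal sketch of how this evaluator might be driven, assuming a concrete
# BaseLLM implementation is available in the repo's llms package. The model
# class name and the evaluator entry point below are hypothetical placeholders
# used only to illustrate the expected call pattern; they are not part of this
# file's API.
#
# if __name__ == "__main__":
#     model = SomeChatModel("model/name/or/path")  # hypothetical BaseLLM subclass
#     evaluator = CEvalEvaluator(
#         model,
#         num_batches=4,
#         output_dir="./output",
#         disciplines=CEVAL_HARD_DISCIPLINES,
#         split="val",
#         num_shots=2,
#     )
#     evaluator.run()  # assumption: BaseEvaluator exposes a run()-style driver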