Commit 655d49c

update data adapter
Yunnglin committed Dec 19, 2024
1 parent 376afc8 commit 655d49c
Showing 7 changed files with 115 additions and 229 deletions.
evalscope/benchmarks/arc/__init__.py (5 changes: 0 additions & 5 deletions)
@@ -1,6 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/arc/arc_adapter.py (113 changes: 18 additions & 95 deletions)
@@ -3,40 +3,34 @@
import json
import os

from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import exact_match, weighted_mean
from evalscope.utils import ResponseParser, normalize_score
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models import MultiChoiceModelAdapter
from evalscope.utils import ResponseParser
from evalscope.utils.logger import get_logger

# flake8: noqa

logger = get_logger()

DATASET_ID = 'modelscope/ai2_arc'

# task_list = ['ARC-Easy', 'ARC-Challenge']
SUBSET_LIST = ['ARC-Challenge']


@Benchmark.register(
name='arc',
dataset_id='modelscope/ai2_arc',
model_adapter=MultiChoiceModelAdapter,
subset_list=['ARC-Easy', 'ARC-Challenge'],
metric_list=[WeightedAverageAccuracy],
few_shot_num=0,
train_split='train',
eval_split='test',
prompt_template='',
)
class ARCAdapter(DataAdapter):

choices = ['A', 'B', 'C', 'D']

def __init__(self,
subset_list: list = None,
metric_list: list = None,
few_shot_num: int = None,
train_split: str = 'train',
eval_split: str = 'test',
prompt_template: str = '',
**kwargs):

if subset_list is None:
subset_list = SUBSET_LIST

if metric_list is None:
metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]

def __init__(self, **kwargs):
few_shot_num = kwargs.get('few_shot_num', None)
if few_shot_num is None:
# Use 0-shot by default
logger.info(f'Set 0-shot examples by system for ARC.')
@@ -45,14 +39,7 @@ def __init__(self,
if few_shot_num != 0:
logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')

super().__init__(
subset_list=subset_list,
metric_list=metric_list,
few_shot_num=few_shot_num,
train_split=train_split,
eval_split=eval_split,
prompt_template=prompt_template,
**kwargs)
super().__init__(**kwargs)

def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
"""
@@ -158,70 +145,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st
def match(self, gold: str, pred: str) -> float:
return exact_match(gold=gold, pred=pred)

def compute_metric(self, review_res_list: list) -> float:
"""
Compute evaluation result by specific metric.
Args:
review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
Returns:
The metric score.
"""
items = [(score, 1.0) for score in review_res_list]
return weighted_mean(items)

def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
"""
Generate the report for the model output.
Args:
subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
report_name: The user-defined report name.
Returns: A dict of metric calculation results. The format is like:
{
"name":"ARC",
"metric":"WeightedAverageAccuracy",
"score":0.3389,
"category":[
{
"name":"DEFAULT",
"score":0.4128,
"subset":[
{
"name":"ARC-Easy",
"score":0.5632
},
{
"name":"ARC-Challenge",
"score":0.3157
}
]
}
],
"total_num":7800
}
"""
total_num: int = sum([num for _, num in subset_score_map.values()])
weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
weighted_avg_acc = normalize_score(score=weighted_avg_acc)
cate_avg_list = [{
'name': subset_name,
'score': normalize_score(score=score)
} for subset_name, (score, _) in subset_score_map.items()]

category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

res_map = dict(
name=report_name or 'arc',
metric=self.metric_list[0]['name'],
score=weighted_avg_acc,
category=[category_d],
total_num=total_num)

return res_map

@classmethod
def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

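Taken together, the arc_adapter.py changes replace the module-level DATASET_ID/SUBSET_LIST constants and the long __init__ signature with a single @Benchmark.register(...) call that carries the dataset metadata, while the four methods named in the new DataAdapter docstring (gen_prompt, get_gold_answer, parse_pred_result, match) remain the subclass's job. As a rough illustration of the pattern (not part of this commit), a hypothetical multiple-choice benchmark could be wired up the same way; the benchmark name, dataset id, raw-field names, and prompt-dict shape below are assumptions, not evalscope's documented contract.

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models import MultiChoiceModelAdapter


@Benchmark.register(
    name='my_mcq',                        # hypothetical benchmark name
    dataset_id='my-org/my_mcq_dataset',   # hypothetical dataset id
    model_adapter=MultiChoiceModelAdapter,
    subset_list=['default'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template='',
)
class MyMCQAdapter(DataAdapter):

    choices = ['A', 'B', 'C', 'D']

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # Assumed raw-field names ('question', 'choices') and returned dict shape.
        lines = [input_d['question']]
        lines += [f'{label}. {text}' for label, text in zip(self.choices, input_d['choices'])]
        return {'data': ['\n'.join(lines)], 'multi_choices': self.choices}

    def get_gold_answer(self, input_d: dict) -> str:
        # Assumed gold-label field.
        return input_d['answerKey']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        # Naive option extraction for the sketch; a real adapter would likely use ResponseParser.
        for token in result.strip().split():
            letter = token.strip('().:').upper()
            if letter in self.choices:
                return letter
        return ''

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)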
evalscope/benchmarks/benchmark.py (2 changes: 1 addition & 1 deletion)
@@ -17,7 +17,7 @@ class BenchmarkMeta:
data_adapter: 'DataAdapter'
model_adapter: BaseModelAdapter
subset_list: List[str] = field(default_factory=list)
metric_list: List[Dict] = field(default_factory=list)
metric_list: List[dict] = field(default_factory=list)
few_shot_num: int = 0
few_shot_random: bool = False
train_split: Optional[str] = None
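The change from List[Dict] to List[dict] is cosmetic, but it is worth noting what those dicts look like: elsewhere in this commit DataAdapter reads metric_list[0]['name'] and metric_list[0]['object'], and the old ARC adapter built its entry as {'name': 'WeightedAverageAccuracy', 'object': weighted_mean}. A hedged sketch of a custom entry (my_mean is a made-up metric function, not an evalscope API):

def my_mean(items):
    # items is a list of (score, weight) pairs, the shape compute_metric builds below.
    total_weight = sum(weight for _, weight in items)
    return sum(score * weight for score, weight in items) / total_weight if total_weight else 0.0


custom_metric = {'name': 'MyMean', 'object': my_mean}  # matches the keys DataAdapter indexes
# e.g. metric_list=[custom_metric] when registering a benchmark or building a BenchmarkMeta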
evalscope/benchmarks/data_adapter.py (151 changes: 91 additions & 60 deletions)
@@ -6,6 +6,7 @@
from typing import Any, Optional

from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, HubType
from evalscope.utils import normalize_score
from evalscope.utils.logger import get_logger

logger = get_logger()
@@ -22,6 +23,11 @@ def __init__(self,
prompt_template: str = '',
**kwargs):
"""
Data Adapter for the benchmark. You need to implement the following methods:
- gen_prompt
- get_gold_answer
- parse_pred_result
- match
Args:
subset_list: list of subset names for the dataset.
metric_list: list, the metric list to evaluate the model on specific benchmark.
@@ -141,6 +147,91 @@ def gen_prompts(self, data_dict: dict) -> dict:

return res_dict

def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
"""
Generate report for the evaluation results for all subsets.
Args:
subset_score_map: The subset-score map.
e.g. {subset_name: (score, num)}
report_name: str, the user-defined report name. Default: None
Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
Here is a format example for ARC-Challenge:
{
"name":"ARC-Challenge",
"metric":"WeightedAverageAccuracy",
"score": 0.3389,
"category":[
{
"name":"DEFAULT",
"score": 0.3389,
"subset":[
{
"name":"ARC-Challenge",
"score": 0.3389,
"num": 100
},
]
}
],
"total_num":100
}
"""
total_num: int = sum([num for _, num in subset_score_map.values()])
weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
weighted_avg_acc = normalize_score(score=weighted_avg_acc)
cate_avg_list = [{
'name': subset_name,
'score': normalize_score(score=score),
'num': num
} for subset_name, (score, num) in subset_score_map.items()]

category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

res_map = dict(
name=report_name or 'DEFAULT',
metric=self.metric_list[0]['name'],
score=weighted_avg_acc,
category=[category_d],
total_num=total_num)

return res_map

def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):

if k > len(data_list):
k = len(data_list)
if few_shot_random:
return random.sample(data_list, k)
else:
return data_list[:k]

def compute_metric(self, review_res_list: list) -> Any:
"""
Compute evaluation result by specific metrics.
Args:
review_res_list: list, the review result list, each item of which is match result for gold and pred.
Attributes:
DataAdapter.metric_func_map: metric_name -> metric_func mapping,
e.g. {'WeightedAverageAccuracy': weighted_average_acc}
Returns:
Metric results.
"""
if len(self.metric_list) == 0:
raise ValueError('No metric list found for the benchmark.')
elif len(self.metric_list) == 1:
# review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
items = [(score, 1.0) for score in review_res_list]
return self.metric_list[0]['object'](items)
else:
raise ValueError('Please implement the compute_metric method for multiple metrics.')

@abstractmethod
def gen_prompt(self, *args, **kwargs) -> Any:
"""
@@ -203,63 +294,3 @@ def match(self, gold: Any, pred: Any) -> Any:
The match result. Usually a score (float) for chat/multiple-choice-questions.
"""
raise NotImplementedError

@abstractmethod
def compute_metric(self, review_res_list: list) -> Any:
"""
Compute evaluation result by specific metrics.
Args:
review_res_list: list, the review result list, each item of which is match result for gold and pred.
Attributes:
DataAdapter.metric_func_map: metric_name -> metric_func mapping,
e.g. {'WeightedAverageAccuracy': weighted_average_acc}
Returns:
Metric results.
"""
raise NotImplementedError

def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
"""
Generate report for the evaluation results for all subsets.
Args:
subset_score_map: The subset-score map.
e.g. {subset_name: (score, num)}
report_name: str, the user-defined report name. Default: None
Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
Here is a format example for ARC-Challenge:
{
"name":"ARC-Challenge",
"metric":"WeightedAverageAccuracy",
"score": 0.3389,
"category":[
{
"name":"DEFAULT",
"score": 0.3389,
"subset":[
{
"name":"ARC-Challenge",
"score": 0.3389
},
]
}
],
"total_num":100
}
"""
raise NotImplementedError

def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):

if k > len(data_list):
k = len(data_list)
if few_shot_random:
return random.sample(data_list, k)
else:
return data_list[:k]
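To make the gen_report bookkeeping concrete, here is a standalone sketch of the aggregation it performs: a sample-count-weighted average over subsets, with rounding standing in for evalscope's normalize_score (assumed here to simply round the score). The subset accuracies come from the docstring example; the sample counts are illustrative.

subset_score_map = {
    'ARC-Easy': (0.5632, 2376),        # (accuracy, sample count); counts are illustrative
    'ARC-Challenge': (0.3157, 1172),
}

total_num = sum(num for _, num in subset_score_map.values())
weighted_avg = sum(score * num for score, num in subset_score_map.values()) / total_num

report = {
    'name': 'arc',
    'metric': 'WeightedAverageAccuracy',
    'score': round(weighted_avg, 4),
    'category': [{
        'name': 'DEFAULT',
        'score': round(weighted_avg, 4),
        'subset': [{'name': name, 'score': round(score, 4), 'num': num}
                   for name, (score, num) in subset_score_map.items()],
    }],
    'total_num': total_num,
}

print(report['score'])  # 0.4814 with the counts above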
evalscope/benchmarks/gsm8k/__init__.py (2 changes: 0 additions & 2 deletions)
@@ -1,3 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter