diff --git a/evalscope/benchmarks/arc/__init__.py b/evalscope/benchmarks/arc/__init__.py
index 8b7d5dc..b937315 100644
--- a/evalscope/benchmarks/arc/__init__.py
+++ b/evalscope/benchmarks/arc/__init__.py
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
-from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
diff --git a/evalscope/benchmarks/arc/arc_adapter.py b/evalscope/benchmarks/arc/arc_adapter.py
index 46b1f6a..e00cf78 100644
--- a/evalscope/benchmarks/arc/arc_adapter.py
+++ b/evalscope/benchmarks/arc/arc_adapter.py
@@ -3,40 +3,34 @@ import json
 import os
 
-from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import ResponseParser, normalize_score
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import WeightedAverageAccuracy, exact_match
+from evalscope.models import MultiChoiceModelAdapter
+from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
 logger = get_logger()
 
-DATASET_ID = 'modelscope/ai2_arc'
-
-# task_list = ['ARC-Easy', 'ARC-Challenge']
-SUBSET_LIST = ['ARC-Challenge']
-
 
+@Benchmark.register(
+    name='arc',
+    dataset_id='modelscope/ai2_arc',
+    model_adapter=MultiChoiceModelAdapter,
+    subset_list=['ARC-Easy', 'ARC-Challenge'],
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=0,
+    train_split='train',
+    eval_split='test',
+    prompt_template='',
+)
 class ARCAdapter(DataAdapter):
 
     choices = ['A', 'B', 'C', 'D']
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'test',
-                 prompt_template: str = '',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
+    def __init__(self, **kwargs):
+        few_shot_num = kwargs.get('few_shot_num', None)
         if few_shot_num is None:
             # Use 0-shot by default
             logger.info(f'Set 0-shot examples by system for ARC.')
@@ -45,14 +39,7 @@ def __init__(self,
         if few_shot_num != 0:
             logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            prompt_template=prompt_template,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         """
@@ -158,70 +145,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"ARC",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.4128,
-                    "subset":[
-                        {
-                            "name":"ARC-Easy",
-                            "score":0.5632
-                        },
-                        {
-                            "name":"ARC-Challenge",
-                            "score":0.3157
-                        }
-                    ]
-                }
-            ],
-            "total_num":7800
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'arc',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
diff --git a/evalscope/benchmarks/benchmark.py b/evalscope/benchmarks/benchmark.py
index 7985536..3011392 100644
--- a/evalscope/benchmarks/benchmark.py
+++ b/evalscope/benchmarks/benchmark.py
@@ -17,7 +17,7 @@ class BenchmarkMeta:
     data_adapter: 'DataAdapter'
     model_adapter: BaseModelAdapter
    subset_list: List[str] = field(default_factory=list)
-    metric_list: List[Dict] = field(default_factory=list)
+    metric_list: List[dict] = field(default_factory=list)
     few_shot_num: int = 0
     few_shot_random: bool = False
     train_split: Optional[str] = None
diff --git a/evalscope/benchmarks/data_adapter.py b/evalscope/benchmarks/data_adapter.py
index fc1e6b4..da3a72e 100644
--- a/evalscope/benchmarks/data_adapter.py
+++ b/evalscope/benchmarks/data_adapter.py
@@ -6,6 +6,7 @@ from typing import Any, Optional
 
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, HubType
+from evalscope.utils import normalize_score
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -22,6 +23,11 @@ def __init__(self,
                  prompt_template: str = '',
                  **kwargs):
         """
+        Data Adapter for the benchmark. You need to implement the following methods:
+            - gen_prompt
+            - get_gold_answer
+            - parse_pred_result
+            - match
         Args:
             subset_list: list of subset names for the dataset.
             metric_list: list, the metric list to evaluate the model on specific benchmark.
@@ -141,6 +147,91 @@ def gen_prompts(self, data_dict: dict) -> dict:
 
         return res_dict
 
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        """
+        Generate report for the evaluation results for all subsets.
+
+        Args:
+            subset_score_map: The subset-score map.
+                e.g. {subset_name: (score, num)}
+
+            report_name: str, the user-defined report name. Default: None
+
+        Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
+
+            Here is a format example for ARC-Challenge:
+            {
+                "name":"ARC-Challenge",
+                "metric":"WeightedAverageAccuracy",
+                "score": 0.3389,
+                "category":[
+                    {
+                        "name":"DEFAULT",
+                        "score": 0.3389,
+                        "subset":[
+                            {
+                                "name":"ARC-Challenge",
+                                "score": 0.3389,
+                                "num": 100
+                            },
+                        ]
+                    }
+                ],
+                "total_num":100
+            }
+        """
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
+        cate_avg_list = [{
+            'name': subset_name,
+            'score': normalize_score(score=score),
+            'num': num
+        } for subset_name, (score, num) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
+
+        res_map = dict(
+            name=report_name or 'DEFAULT',
+            metric=self.metric_list[0]['name'],
+            score=weighted_avg_acc,
+            category=[category_d],
+            total_num=total_num)
+
+        return res_map
+
+    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
+
+        if k > len(data_list):
+            k = len(data_list)
+        if few_shot_random:
+            return random.sample(data_list, k)
+        else:
+            return data_list[:k]
+
+    def compute_metric(self, review_res_list: list) -> Any:
+        """
+        Compute evaluation result by specific metrics.
+
+        Args:
+            review_res_list: list, the review result list, each item of which is match result for gold and pred.
+
+        Attributes:
+            DataAdapter.metric_func_map: metric_name -> metric_func mapping,
+                e.g. {'WeightedAverageAccuracy': weighted_average_acc}
+
+        Returns:
+            Metric results.
+        """
+        if len(self.metric_list) == 0:
+            raise ValueError('No metric list found for the benchmark.')
+        elif len(self.metric_list) == 1:
+            # review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+            items = [(score, 1.0) for score in review_res_list]
+            return self.metric_list[0]['object'](items)
+        else:
+            raise ValueError('Please implement the compute_metric method for multiple metrics.')
+
     @abstractmethod
     def gen_prompt(self, *args, **kwargs) -> Any:
         """
@@ -203,63 +294,3 @@ def match(self, gold: Any, pred: Any) -> Any:
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
         raise NotImplementedError
-
-    @abstractmethod
-    def compute_metric(self, review_res_list: list) -> Any:
-        """
-        Compute evaluation result by specific metrics.
-
-        Args:
-            review_res_list: list, the review result list, each item of which is match result for gold and pred.
-
-        Attributes:
-            DataAdapter.metric_func_map: metric_name -> metric_func mapping,
-                e.g. {'WeightedAverageAccuracy': weighted_average_acc}
-
-        Returns:
-            Metric results.
-        """
-        raise NotImplementedError
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation results for all subsets.
-
-        Args:
-            subset_score_map: The subset-score map.
-                e.g. {subset_name: (score, num)}
-
-            report_name: str, the user-defined report name. Default: None
-
-        Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
-
-            Here is a format example for ARC-Challenge:
-            {
-                "name":"ARC-Challenge",
-                "metric":"WeightedAverageAccuracy",
-                "score": 0.3389,
-                "category":[
-                    {
-                        "name":"DEFAULT",
-                        "score": 0.3389,
-                        "subset":[
-                            {
-                                "name":"ARC-Challenge",
-                                "score": 0.3389
-                            },
-                        ]
-                    }
-                ],
-                "total_num":100
-            }
-        """
-        raise NotImplementedError
-
-    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
-
-        if k > len(data_list):
-            k = len(data_list)
-        if few_shot_random:
-            return random.sample(data_list, k)
-        else:
-            return data_list[:k]
diff --git a/evalscope/benchmarks/gsm8k/__init__.py b/evalscope/benchmarks/gsm8k/__init__.py
index bf63ba4..b937315 100644
--- a/evalscope/benchmarks/gsm8k/__init__.py
+++ b/evalscope/benchmarks/gsm8k/__init__.py
@@ -1,3 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter
diff --git a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py
index d0d830b..5aa67e9 100644
--- a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py
+++ b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -6,9 +6,8 @@ import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics.metrics import weighted_mean
+from evalscope.metrics import WeightedAverageAccuracy
 from evalscope.models import ChatGenerationModelAdapter
-from evalscope.utils import normalize_score
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -19,10 +18,7 @@
     name='gsm8k',
     dataset_id='modelscope/gsm8k',
     subset_list=['main'],
-    metric_list=[{
-        'name': 'WeightedAverageAccuracy',
-        'object': weighted_mean
-    }],
+    metric_list=[WeightedAverageAccuracy],
     few_shot_num=4,
     train_split='train',
     eval_split='test',
@@ -126,66 +122,6 @@ def number_equal(gold_ans, pred_ans):
 
         return number_equal(gold_ans=gold, pred_ans=pred)
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name. Default: None
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"GSM8K",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.5632,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.5632,
-                    "subset":[
-                        {
-                            "name":"main",
-                            "score":0.5632
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'gsm8k',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, few_shot_list: list, use_fewshot: bool = True) -> str:
         if use_fewshot:
diff --git a/evalscope/metrics/__init__.py b/evalscope/metrics/__init__.py
index b937315..7c7ff37 100644
--- a/evalscope/metrics/__init__.py
+++ b/evalscope/metrics/__init__.py
@@ -1 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from evalscope.metrics.metrics import exact_match, weighted_mean
+
+WeightedAverageAccuracy = {'name': 'WeightedAverageAccuracy', 'object': weighted_mean}
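
Reviewer note: the snippet below is a minimal sketch of how a benchmark plugs into the refactored base class after this change. Only the imports and the decorator fields mirror what this diff introduces; the adapter class, benchmark name, dataset id, and hook bodies are hypothetical placeholders and are not part of the patch.

    # Hypothetical adapter, for illustration only; names and hook bodies are placeholders.
    from evalscope.benchmarks import Benchmark, DataAdapter
    from evalscope.metrics import WeightedAverageAccuracy, exact_match
    from evalscope.models import ChatGenerationModelAdapter


    @Benchmark.register(
        name='demo_qa',                       # hypothetical benchmark name
        dataset_id='modelscope/demo_qa',      # hypothetical dataset id
        model_adapter=ChatGenerationModelAdapter,
        subset_list=['main'],
        metric_list=[WeightedAverageAccuracy],
        few_shot_num=0,
        train_split='train',
        eval_split='test',
        prompt_template='',
    )
    class DemoQAAdapter(DataAdapter):
        """Only the four abstract hooks are left to implement; gen_report,
        compute_metric and get_fewshot_examples are now inherited from DataAdapter."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)

        def gen_prompt(self, input_d: dict, *args, **kwargs) -> dict:
            # Placeholder prompt construction.
            return {'data': [input_d['question']]}

        def get_gold_answer(self, input_d: dict) -> str:
            return input_d['answer']

        def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
            return result.strip()

        def match(self, gold: str, pred: str) -> float:
            return exact_match(gold=gold, pred=pred)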