Commit 655d49c

update data adapter
Yunnglin committed Dec 19, 2024
1 parent 376afc8 commit 655d49c
Showing 7 changed files with 115 additions and 229 deletions.
evalscope/benchmarks/arc/__init__.py (5 changes: 0 additions & 5 deletions)
@@ -1,6 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.benchmarks.arc.arc_adapter import DATASET_ID, SUBSET_LIST
from evalscope.benchmarks.arc.arc_adapter import ARCAdapter
from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/arc/arc_adapter.py (113 changes: 18 additions & 95 deletions)
@@ -3,40 +3,34 @@
import json
import os

from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import exact_match, weighted_mean
from evalscope.utils import ResponseParser, normalize_score
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models import MultiChoiceModelAdapter
from evalscope.utils import ResponseParser
from evalscope.utils.logger import get_logger

# flake8: noqa

logger = get_logger()

DATASET_ID = 'modelscope/ai2_arc'

# task_list = ['ARC-Easy', 'ARC-Challenge']
SUBSET_LIST = ['ARC-Challenge']


@Benchmark.register(
name='arc',
dataset_id='modelscope/ai2_arc',
model_adapter=MultiChoiceModelAdapter,
subset_list=['ARC-Easy', 'ARC-Challenge'],
metric_list=[WeightedAverageAccuracy],
few_shot_num=0,
train_split='train',
eval_split='test',
prompt_template='',
)
class ARCAdapter(DataAdapter):

choices = ['A', 'B', 'C', 'D']

def __init__(self,
subset_list: list = None,
metric_list: list = None,
few_shot_num: int = None,
train_split: str = 'train',
eval_split: str = 'test',
prompt_template: str = '',
**kwargs):

if subset_list is None:
subset_list = SUBSET_LIST

if metric_list is None:
metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]

def __init__(self, **kwargs):
few_shot_num = kwargs.get('few_shot_num', None)
if few_shot_num is None:
# Use 0-shot by default
logger.info(f'Set 0-shot examples by system for ARC.')
@@ -45,14 +39,7 @@ def __init__(self,
if few_shot_num != 0:
logger.warning(f'few_shot_num is recommended to set 0 for ARC, got {few_shot_num}.')

super().__init__(
subset_list=subset_list,
metric_list=metric_list,
few_shot_num=few_shot_num,
train_split=train_split,
eval_split=eval_split,
prompt_template=prompt_template,
**kwargs)
super().__init__(**kwargs)

def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
"""
@@ -158,70 +145,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: st
def match(self, gold: str, pred: str) -> float:
return exact_match(gold=gold, pred=pred)

def compute_metric(self, review_res_list: list) -> float:
"""
Compute evaluation result by specific metric.
Args:
review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
Returns:
The metric score.
"""
items = [(score, 1.0) for score in review_res_list]
return weighted_mean(items)

def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
"""
Generate the report for the model output.
Args:
subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
report_name: The user-defined report name.
Returns: A dict of metric calculation results. The format is like:
{
"name":"ARC",
"metric":"WeightedAverageAccuracy",
"score":0.3389,
"category":[
{
"name":"DEFAULT",
"score":0.4128,
"subset":[
{
"name":"ARC-Easy",
"score":0.5632
},
{
"name":"ARC-Challenge",
"score":0.3157
}
]
}
],
"total_num":7800
}
"""
total_num: int = sum([num for _, num in subset_score_map.values()])
weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
weighted_avg_acc = normalize_score(score=weighted_avg_acc)
cate_avg_list = [{
'name': subset_name,
'score': normalize_score(score=score)
} for subset_name, (score, _) in subset_score_map.items()]

category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

res_map = dict(
name=report_name or 'arc',
metric=self.metric_list[0]['name'],
score=weighted_avg_acc,
category=[category_d],
total_num=total_num)

return res_map

@classmethod
def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

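Taken together, the arc_adapter.py changes replace the module-level DATASET_ID/SUBSET_LIST constants and the long __init__ signature with a single @Benchmark.register(...) call that carries the dataset metadata, while the four methods named in the new DataAdapter docstring (gen_prompt, get_gold_answer, parse_pred_result, match) remain the subclass's job. As a rough illustration of the pattern (not part of this commit), a hypothetical multiple-choice benchmark could be wired up the same way; the benchmark name, dataset id, raw-field names, and prompt-dict shape below are assumptions, not evalscope's documented contract.

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models import MultiChoiceModelAdapter


@Benchmark.register(
    name='my_mcq',                        # hypothetical benchmark name
    dataset_id='my-org/my_mcq_dataset',   # hypothetical dataset id
    model_adapter=MultiChoiceModelAdapter,
    subset_list=['default'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template='',
)
class MyMCQAdapter(DataAdapter):

    choices = ['A', 'B', 'C', 'D']

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # Assumed raw-field names ('question', 'choices') and returned dict shape.
        lines = [input_d['question']]
        lines += [f'{label}. {text}' for label, text in zip(self.choices, input_d['choices'])]
        return {'data': ['\n'.join(lines)], 'multi_choices': self.choices}

    def get_gold_answer(self, input_d: dict) -> str:
        # Assumed gold-label field.
        return input_d['answerKey']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        # Naive option extraction for the sketch; a real adapter would likely use ResponseParser.
        for token in result.strip().split():
            letter = token.strip('().:').upper()
            if letter in self.choices:
                return letter
        return ''

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)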
evalscope/benchmarks/benchmark.py (2 changes: 1 addition & 1 deletion)
@@ -17,7 +17,7 @@ class BenchmarkMeta:
data_adapter: 'DataAdapter'
model_adapter: BaseModelAdapter
subset_list: List[str] = field(default_factory=list)
metric_list: List[Dict] = field(default_factory=list)
metric_list: List[dict] = field(default_factory=list)
few_shot_num: int = 0
few_shot_random: bool = False
train_split: Optional[str] = None
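The change from List[Dict] to List[dict] is cosmetic, but it is worth noting what those dicts look like: elsewhere in this commit DataAdapter reads metric_list[0]['name'] and metric_list[0]['object'], and the old ARC adapter built its entry as {'name': 'WeightedAverageAccuracy', 'object': weighted_mean}. A hedged sketch of a custom entry (my_mean is a made-up metric function, not an evalscope API):

def my_mean(items):
    # items is a list of (score, weight) pairs, the shape compute_metric builds below.
    total_weight = sum(weight for _, weight in items)
    return sum(score * weight for score, weight in items) / total_weight if total_weight else 0.0


custom_metric = {'name': 'MyMean', 'object': my_mean}  # matches the keys DataAdapter indexes
# e.g. metric_list=[custom_metric] when registering a benchmark or building a BenchmarkMeta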
evalscope/benchmarks/data_adapter.py (151 changes: 91 additions & 60 deletions)
@@ -6,6 +6,7 @@
from typing import Any, Optional

from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, HubType
from evalscope.utils import normalize_score
from evalscope.utils.logger import get_logger

logger = get_logger()
@@ -22,6 +23,11 @@ def __init__(self,
prompt_template: str = '',
**kwargs):
"""
Data Adapter for the benchmark. You need to implement the following methods:
- gen_prompt
- get_gold_answer
- parse_pred_result
- match
Args:
subset_list: list of subset names for the dataset.
metric_list: list, the metric list to evaluate the model on specific benchmark.
@@ -141,6 +147,91 @@ def gen_prompts(self, data_dict: dict) -> dict:

return res_dict

def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
"""
Generate report for the evaluation results for all subsets.
Args:
subset_score_map: The subset-score map.
e.g. {subset_name: (score, num)}
report_name: str, the user-defined report name. Default: None
Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
Here is a format example for ARC-Challenge:
{
"name":"ARC-Challenge",
"metric":"WeightedAverageAccuracy",
"score": 0.3389,
"category":[
{
"name":"DEFAULT",
"score": 0.3389,
"subset":[
{
"name":"ARC-Challenge",
"score": 0.3389,
"num": 100
},
]
}
],
"total_num":100
}
"""
total_num: int = sum([num for _, num in subset_score_map.values()])
weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
weighted_avg_acc = normalize_score(score=weighted_avg_acc)
cate_avg_list = [{
'name': subset_name,
'score': normalize_score(score=score),
'num': num
} for subset_name, (score, num) in subset_score_map.items()]

category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

res_map = dict(
name=report_name or 'DEFAULT',
metric=self.metric_list[0]['name'],
score=weighted_avg_acc,
category=[category_d],
total_num=total_num)

return res_map

def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):

if k > len(data_list):
k = len(data_list)
if few_shot_random:
return random.sample(data_list, k)
else:
return data_list[:k]

def compute_metric(self, review_res_list: list) -> Any:
"""
Compute evaluation result by specific metrics.
Args:
review_res_list: list, the review result list, each item of which is match result for gold and pred.
Attributes:
DataAdapter.metric_func_map: metric_name -> metric_func mapping,
e.g. {'WeightedAverageAccuracy': weighted_average_acc}
Returns:
Metric results.
"""
if len(self.metric_list) == 0:
raise ValueError('No metric list found for the benchmark.')
elif len(self.metric_list) == 1:
# review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
items = [(score, 1.0) for score in review_res_list]
return self.metric_list[0]['object'](items)
else:
raise ValueError('Please implement the compute_metric method for multiple metrics.')

@abstractmethod
def gen_prompt(self, *args, **kwargs) -> Any:
"""
@@ -203,63 +294,3 @@ def match(self, gold: Any, pred: Any) -> Any:
The match result. Usually a score (float) for chat/multiple-choice-questions.
"""
raise NotImplementedError

@abstractmethod
def compute_metric(self, review_res_list: list) -> Any:
"""
Compute evaluation result by specific metrics.
Args:
review_res_list: list, the review result list, each item of which is match result for gold and pred.
Attributes:
DataAdapter.metric_func_map: metric_name -> metric_func mapping,
e.g. {'WeightedAverageAccuracy': weighted_average_acc}
Returns:
Metric results.
"""
raise NotImplementedError

def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
"""
Generate report for the evaluation results for all subsets.
Args:
subset_score_map: The subset-score map.
e.g. {subset_name: (score, num)}
report_name: str, the user-defined report name. Default: None
Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
Here is a format example for ARC-Challenge:
{
"name":"ARC-Challenge",
"metric":"WeightedAverageAccuracy",
"score": 0.3389,
"category":[
{
"name":"DEFAULT",
"score": 0.3389,
"subset":[
{
"name":"ARC-Challenge",
"score": 0.3389
},
]
}
],
"total_num":100
}
"""
raise NotImplementedError

def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):

if k > len(data_list):
k = len(data_list)
if few_shot_random:
return random.sample(data_list, k)
else:
return data_list[:k]
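To make the gen_report bookkeeping concrete, here is a standalone sketch of the aggregation it performs: a sample-count-weighted average over subsets, with rounding standing in for evalscope's normalize_score (assumed here to simply round the score). The subset accuracies come from the docstring example; the sample counts are illustrative.

subset_score_map = {
    'ARC-Easy': (0.5632, 2376),        # (accuracy, sample count); counts are illustrative
    'ARC-Challenge': (0.3157, 1172),
}

total_num = sum(num for _, num in subset_score_map.values())
weighted_avg = sum(score * num for score, num in subset_score_map.values()) / total_num

report = {
    'name': 'arc',
    'metric': 'WeightedAverageAccuracy',
    'score': round(weighted_avg, 4),
    'category': [{
        'name': 'DEFAULT',
        'score': round(weighted_avg, 4),
        'subset': [{'name': name, 'score': round(score, 4), 'num': num}
                   for name, (score, num) in subset_score_map.items()],
    }],
    'total_num': total_num,
}

print(report['score'])  # 0.4814 with the counts above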
evalscope/benchmarks/gsm8k/__init__.py (2 changes: 0 additions & 2 deletions)
@@ -1,3 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter