fix humaneval for native eval (#248)
* fix humaneval

* update doc
Yunnglin authored Dec 17, 2024
1 parent d76e35f commit e8ea98b
Showing 9 changed files with 268 additions and 249 deletions.
28 changes: 15 additions & 13 deletions docs/en/get_started/supported_dataset.md
@@ -5,19 +5,21 @@
The framework currently supports the following datasets. If the dataset you need is not in the list, please submit an issue, or use the [OpenCompass backend](../user_guides/backend/opencompass_backend.md) for evaluation, or use the [VLMEvalKit backend](../user_guides/backend/vlmevalkit_backend.md) for multi-modal model evaluation.
```

| Dataset Name | Link | Status | Note |
|--------------------|----------------------------------------------------------------------------------------|--------|------|
| `mmlu` | [mmlu](https://modelscope.cn/datasets/modelscope/mmlu/summary) | Active | |
| `ceval` | [ceval](https://modelscope.cn/datasets/modelscope/ceval-exam/summary) | Active | |
| `gsm8k` | [gsm8k](https://modelscope.cn/datasets/modelscope/gsm8k/summary) | Active | |
| `arc` | [arc](https://modelscope.cn/datasets/modelscope/ai2_arc/summary) | Active | |
| `hellaswag` | [hellaswag](https://modelscope.cn/datasets/modelscope/hellaswag/summary) | Active | |
| `truthful_qa` | [truthful_qa](https://modelscope.cn/datasets/modelscope/truthful_qa/summary) | Active | |
| `competition_math` | [competition_math](https://modelscope.cn/datasets/modelscope/competition_math/summary) | Active | |
| `humaneval` | [humaneval](https://modelscope.cn/datasets/modelscope/humaneval/summary) | Active | |
| `bbh` | [bbh](https://modelscope.cn/datasets/modelscope/bbh/summary) | Active | |
| `race` | [race](https://modelscope.cn/datasets/modelscope/race/summary) | Active | |
| `trivia_qa` | [trivia_qa](https://modelscope.cn/datasets/modelscope/trivia_qa/summary) | To be integrated | |
The following is the table translated into English:

| Name | Link | Notes |
|-----------------|------------------------------------------------------------------------------------------|-------|
| `mmlu` | [mmlu](https://modelscope.cn/datasets/modelscope/mmlu/summary) | |
| `ceval` | [ceval](https://modelscope.cn/datasets/modelscope/ceval-exam/summary) | |
| `gsm8k` | [gsm8k](https://modelscope.cn/datasets/modelscope/gsm8k/summary) | |
| `arc` | [arc](https://modelscope.cn/datasets/modelscope/ai2_arc/summary) | |
| `hellaswag` | [hellaswag](https://modelscope.cn/datasets/modelscope/hellaswag/summary) | |
| `truthful_qa` | [truthful_qa](https://modelscope.cn/datasets/modelscope/truthful_qa/summary) | |
| `competition_math` | [competition_math](https://modelscope.cn/datasets/modelscope/competition_math/summary) | |
| `humaneval`     | [humaneval](https://modelscope.cn/datasets/modelscope/humaneval/summary) | Requires installing [humaneval](https://github.com/openai/human-eval/tree/master#installation). Because evaluation executes model-generated code, running it in a sandboxed environment (e.g. Docker) is recommended; see the sketch after this table. |
| `bbh` | [bbh](https://modelscope.cn/datasets/modelscope/bbh/summary) | |
| `race` | [race](https://modelscope.cn/datasets/modelscope/race/summary) | |
| `trivia_qa` | [trivia_qa](https://modelscope.cn/datasets/modelscope/trivia_qa/summary) | |
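
A minimal pre-flight sketch for the `humaneval` entry above. It assumes the `human-eval` package from the linked repository is installed (e.g. `git clone https://github.com/openai/human-eval && pip install -e human-eval`) and that code execution has been enabled in `human_eval/execution.py`:

```python
# Sketch only: verify the human-eval dependency before launching the benchmark.
# Model-generated code will be executed, so run this inside a Docker (or similar) sandbox.
from human_eval.data import read_problems

problems = read_problems()  # HumanEval tasks keyed by task_id, e.g. 'HumanEval/0'
print(f'Loaded {len(problems)} problems')  # the dataset ships 164 tasks
```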

## 2. OpenCompass Backend
Refer to the [detailed explanation](https://github.com/open-compass/opencompass#-dataset-support)
29 changes: 16 additions & 13 deletions docs/zh/get_started/supported_dataset.md
@@ -6,19 +6,22 @@
The framework natively supports the following datasets. If the dataset you need is not in the list, please submit an issue, or use the [OpenCompass backend](../user_guides/backend/opencompass_backend.md) for language model evaluation, or the [VLMEvalKit backend](../user_guides/backend/vlmevalkit_backend.md) for multi-modal model evaluation.
```

| Name | Link | Status | Notes |
|--------------------|----------------------------------------------------------------------------------------|--------|------|
| `mmlu` | [mmlu](https://modelscope.cn/datasets/modelscope/mmlu/summary) | Active | |
| `ceval` | [ceval](https://modelscope.cn/datasets/modelscope/ceval-exam/summary) | Active | |
| `gsm8k` | [gsm8k](https://modelscope.cn/datasets/modelscope/gsm8k/summary) | Active | |
| `arc` | [arc](https://modelscope.cn/datasets/modelscope/ai2_arc/summary) | Active | |
| `hellaswag` | [hellaswag](https://modelscope.cn/datasets/modelscope/hellaswag/summary) | Active | |
| `truthful_qa` | [truthful_qa](https://modelscope.cn/datasets/modelscope/truthful_qa/summary) | Active | |
| `competition_math` | [competition_math](https://modelscope.cn/datasets/modelscope/competition_math/summary) | Active | |
| `humaneval` | [humaneval](https://modelscope.cn/datasets/modelscope/humaneval/summary) | Active | |
| `bbh` | [bbh](https://modelscope.cn/datasets/modelscope/bbh/summary) | Active | |
| `race` | [race](https://modelscope.cn/datasets/modelscope/race/summary) | Active | |
| `trivia_qa` | [trivia_qa](https://modelscope.cn/datasets/modelscope/trivia_qa/summary) | Active | |
The following is the table with the "Status" column removed:

| Name | Link | Notes |
|-------------|----------------------------------------------------------------------------------------|------|
| `mmlu` | [mmlu](https://modelscope.cn/datasets/modelscope/mmlu/summary) | |
| `ceval` | [ceval](https://modelscope.cn/datasets/modelscope/ceval-exam/summary) | |
| `gsm8k` | [gsm8k](https://modelscope.cn/datasets/modelscope/gsm8k/summary) | |
| `arc` | [arc](https://modelscope.cn/datasets/modelscope/ai2_arc/summary) | |
| `hellaswag` | [hellaswag](https://modelscope.cn/datasets/modelscope/hellaswag/summary) | |
| `truthful_qa` | [truthful_qa](https://modelscope.cn/datasets/modelscope/truthful_qa/summary) | |
| `competition_math` | [competition_math](https://modelscope.cn/datasets/modelscope/competition_math/summary) | |
| `humaneval` | [humaneval](https://modelscope.cn/datasets/modelscope/humaneval/summary) | Requires installing [humaneval](https://github.com/openai/human-eval/tree/master#installation). Because evaluation executes generated code, running it in a sandboxed environment (e.g. Docker) is recommended. |
| `bbh` | [bbh](https://modelscope.cn/datasets/modelscope/bbh/summary) | |
| `race` | [race](https://modelscope.cn/datasets/modelscope/race/summary) | |
| `trivia_qa` | [trivia_qa](https://modelscope.cn/datasets/modelscope/trivia_qa/summary) | |


## 2. Datasets Supported by the OpenCompass Backend

200 changes: 193 additions & 7 deletions evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -1,20 +1,206 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import json
import os
import re
from tqdm import tqdm
from typing import List

# flake8: noqa
from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import weighted_mean
from evalscope.tools.combine_reports import gen_table
from evalscope.utils import normalize_score
from evalscope.utils.logger import get_logger

logger = get_logger()

DATASET_ID = 'modelscope/humaneval'
SUBSET_LIST = ['openai_humaneval']

# Note: ONLY FOR CLASS IMPORT, No implementation here.

# Example:
# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"}
# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa


class HumanevalAdapter:
class HumanevalAdapter(DataAdapter):
    """
    A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
    """

    def __init__(self):
        ...
    def __init__(self,
                 subset_list: list = None,
                 metric_list: list = None,
                 few_shot_num: int = None,
                 train_split: str = None,
                 eval_split: str = 'test',
                 prompt_template: str = 'Complete the following python code:\n',
                 **kwargs):
        try:
            from human_eval.data import stream_jsonl, write_jsonl
            from human_eval.evaluation import check_correctness
        except ImportError:
            raise ImportError('Please install human_eval: '
                              'https://github.com/openai/human-eval/tree/master#installation , '
                              'Note that you need to enable the execution code in the human_eval/execution.py first.')

        if subset_list is None:
            subset_list = SUBSET_LIST

        if metric_list is None:
            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]

        self.k = [1]
        self.num_workers = 4
        self.timeout = 4.0
        self.outputs = kwargs.get('outputs', None)

        self.read_problems_func = stream_jsonl
        self.write_jsonl_func = write_jsonl
        self.eval_func = check_correctness

        super().__init__(
            subset_list=subset_list,
            metric_list=metric_list,
            few_shot_num=few_shot_num,
            train_split=train_split,
            eval_split=eval_split,
            prompt_template=prompt_template,
            **kwargs)
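
    # Illustrative note (not part of the diff): constructing `HumanevalAdapter()` with the
    # defaults above configures pass@k with k=[1] (i.e. pass@1), 4 worker processes and a
    # 4-second per-sample timeout, and binds the human-eval helpers stream_jsonl,
    # write_jsonl and check_correctness to read_problems_func, write_jsonl_func and
    # eval_func respectively; it raises ImportError if human-eval is not installed.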

    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
        data_dict = {}
        for subset_name in subset_list:
            data_dict[subset_name] = {}
            # [{'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...]
            data_dict[subset_name][self.eval_split] = [task for task in self.read_problems_func(dataset_name_or_path)]

        return data_dict

    def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
        """
        Generate prompt for the model.
        Args:
            input_d (dict): The raw input. A single data format of the Humaneval:
            {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
        """
        full_prompt = input_d['prompt']
        full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt

        return {'data': [full_prompt]}
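
    # Example (comment-only, hypothetical record): for an input whose 'prompt' field is
    #     'def add(a, b):\n    """Return a + b."""\n'
    # gen_prompt returns
    #     {'data': ['Complete the following python code:\n\ndef add(a, b):\n    """Return a + b."""\n']}
    # i.e. the prompt template, a blank line, then the unmodified function signature and docstring.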

    def get_answers(self, infer_cfg: dict) -> List[dict]:
        ans_list: list = []
        system_prompt: str = ''
        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
            prompt: str = system_prompt + data_d['prompt']
            inputs: dict = {'data': [prompt]}

            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)

            pred_ans: str = pred_res['choices'][0]['message']['content']
            pred_ans = self._postprocess(pred_ans)

            ans_list.append({'task_id': task_id, 'completion': pred_ans})

        return ans_list

    def eval(self, infer_cfg: dict, **kwargs):

        # predict
        ans_list: list = self.get_answers(infer_cfg)
        ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')

        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
        logger.info('** Dump predictions successfully.')

        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
        results = self.eval_func(
            sample_file=ans_out_file,
            k=self.k,
            n_workers=self.num_workers,
            timeout=self.timeout,
            problem_file=self.problem_file)

        # output: report
        report_map: dict = self.gen_report(results=results)
        report_dir: str = self.outputs_structure.reports_dir
        report_file: str = os.path.join(report_dir, 'human_eval_report.json')

        with open(report_file, 'w') as f:
            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
        # logger.info(f'** Dump report to {report_file} \n')
        logger.info('** Dump report \n')

        try:
            # Make table
            report_table: str = gen_table([report_dir])
            logger.info(f'** Report table: \n {report_table} \n')
        except Exception:
            logger.error('Failed to generate report table.')

    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
        total_num: int = sum([num for _, num in subset_score_map.values()])
        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
        cate_avg_list = [{
            'name': subset_name,
            'score': normalize_score(score=score)
        } for subset_name, (score, _) in subset_score_map.items()]

        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

        res_map = dict(
            name=report_name or 'HumanEval',
            metric='pass@1',
            score=weighted_avg_acc,
            category=[category_d],
            total_num=total_num)

        return res_map
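
    # Worked example (comment-only, hypothetical numbers): with
    #     subset_score_map = {'openai_humaneval': (0.3354, 164)}
    # total_num is 164 and weighted_avg_acc is normalize_score(0.3354), so the report resembles
    #     {'name': 'HumanEval', 'metric': 'pass@1', 'score': <normalized 0.3354>,
    #      'category': [{'name': 'DEFAULT', 'score': ..., 'subset': [...]}], 'total_num': 164}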

    @classmethod
    def _postprocess(cls, text: str) -> str:
        if '```' in text:
            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
            if len(blocks) == 0:
                text = text.split('```')[1]  # fall back to default strategy
            else:
                text = blocks[0]  # fetch the first code block
                if not text.startswith('\n'):  # in case starting with ```python
                    text = text[max(text.find('\n') + 1, 0):]
        if text.strip().startswith('from') or text.strip().startswith('import'):
            def_idx = text.find('def')
            if def_idx != -1:
                text = text[max(text.find('\n', def_idx) + 1, 0):]
        text = text.split('\n\n')[0]
        if text.strip().startswith('def'):
            text = '\n'.join(text.split('\n')[1:])
        if not text.startswith('    '):
            if text.startswith(' '):
                text = '    ' + text.lstrip()
            else:
                text = '\n'.join(['    ' + line for line in text.split('\n')])
        return text
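
    # Illustrative behaviour of _postprocess (comment-only example):
    #     _postprocess('```python\ndef add(a, b):\n    return a + b\n```')
    # returns
    #     '    return a + b\n'
    # i.e. a fenced model reply is reduced to an indented function body that can be appended
    # to the original HumanEval prompt before the unit tests run.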

    def compute_metric(self, review_res_list: list) -> float:
        """
        Compute evaluation result by specific metric.
        Args:
            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
        Returns:
            The metric score.
        """
        items = [(score, 1.0) for score in review_res_list]
        return weighted_mean(items)

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        return self._postprocess(result)

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d

    def match(self, gold: str, pred: str) -> float:
        res = self.eval_func(gold, pred, self.timeout)
        return float(res['passed'])
1 change: 0 additions & 1 deletion evalscope/evaluator/__init__.py
@@ -1,4 +1,3 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.evaluator.evaluator import Evaluator
from evalscope.evaluator.humaneval_evaluator import HumanevalEvaluator
2 changes: 1 addition & 1 deletion evalscope/evaluator/evaluator.py
@@ -57,7 +57,7 @@ def __init__(self,
                 **kwargs):

        self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep))
        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
        self.model_name = overall_task_cfg.model_id
        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
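
        # Effect of the added .split('.')[0] (illustrative): a local path such as
        # '/data/HumanEval.jsonl' now resolves to dataset_name 'HumanEval' rather than
        # 'HumanEval.jsonl', so custom_task_name no longer carries the file extension.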

