unify opencompass (#242)
* unify opencompass

* move utils

* fix union

* unify vlmeval and update doc

* update
Yunnglin authored Dec 16, 2024
1 parent bffa15b commit b03d9e3
Showing 41 changed files with 266 additions and 282 deletions.
18 changes: 10 additions & 8 deletions docs/en/user_guides/backend/vlmevalkit_backend.md
@@ -184,6 +184,7 @@ Create configuration files:
::::{tab-set}
:::{tab-item} YAML Configuration File
```yaml
work_dir: outputs
eval_backend: VLMEvalKit
eval_config:
model:
@@ -198,15 +199,15 @@ eval_config:
- ChartQA_TEST
mode: all
limit: 20
reuse: true
work_dir: outputs
reuse: false
nproc: 16
```
:::
:::{tab-item} Python Dictionary
```python
task_cfg_dict = {
'work_dir': 'outputs',
'eval_backend': 'VLMEvalKit',
'eval_config':
{'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
@@ -219,8 +220,7 @@ task_cfg_dict = {
'temperature': 0.0,
'type': 'qwen-vl-chat'}
],
'reuse': True,
'work_dir': 'output'}}
'reuse': False}}
```
:::
::::
@@ -232,6 +232,7 @@ This method does not involve starting a model service; instead, it directly conf
::::{tab-set}
:::{tab-item} YAML Configuration File
```yaml
work_dir: outputs
eval_backend: VLMEvalKit
eval_config:
model:
@@ -242,7 +243,7 @@ eval_config:
- ChartQA_TEST
mode: all
limit: 20
reuse: true
reuse: false
work_dir: outputs
nproc: 16
```
@@ -251,6 +252,7 @@
:::{tab-item} Python Dictionary
```python
task_cfg_dict = {
'work_dir': 'outputs',
'eval_backend': 'VLMEvalKit',
'eval_config':
{'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
@@ -260,13 +262,13 @@ task_cfg_dict = {
{'name': 'qwen_chat',
'model_path': 'models/Qwen-VL-Chat'}
],
'reuse': True,
'work_dir': 'outputs'}}
'reuse': False}}
```
:::
::::
## Parameters
- `work_dir`: A string specifying the directory where evaluation results, logs, and summaries are saved. The default value is `outputs`.
- `eval_backend`: Default value is `VLMEvalKit`, indicating the use of the VLMEvalKit evaluation backend.
- `eval_config`: A dictionary containing the following fields:
- `data`: A list referencing the [currently supported datasets](#2-data-preparation).
@@ -291,7 +293,7 @@
- `limit`: Integer indicating the number of evaluation samples; default value is `None`, meaning all examples will be run.
- `reuse`: Boolean indicating whether to reuse previous evaluation results; if not, all temporary evaluation files will be deleted.
```{note}
For `ms-vlmeval>=0.0.11`, the parameter `rerun` has been renamed to `reuse`, with a default value of `False`.
For `ms-vlmeval>=0.0.11`, the parameter `rerun` has been renamed to `reuse`, with a default value of `False`. When set to `True`, you need to add `use_cache` in the `task_cfg_dict` to specify the cache directory to be used.
```
- `work_dir`: String specifying the directory to save evaluation results, logs, and summaries; default value is `outputs`.
- `nproc`: Integer indicating the number of API calls in parallel.
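To illustrate the note above: with `reuse: True`, a `use_cache` entry pointing at an earlier run's output directory must also be provided in `task_cfg_dict`. A minimal sketch (the cache path below is hypothetical; the other fields mirror the examples earlier in this file):
```python
task_cfg_dict = {
    'work_dir': 'outputs',
    'eval_backend': 'VLMEvalKit',
    'eval_config': {
        'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
        'limit': 20,
        'mode': 'all',
        'model': [{'name': 'qwen_chat', 'model_path': 'models/Qwen-VL-Chat'}],
        'nproc': 16,
        'reuse': True,                           # reuse previous evaluation results
        'use_cache': 'outputs/20241216_120000',  # hypothetical cache directory from an earlier run
    },
}
```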
1 change: 1 addition & 0 deletions docs/en/user_guides/stress_test/parameters.md
@@ -12,6 +12,7 @@ Execute `evalscope perf --help` to get a full parameter description:
- Select `local` to use local files as models and perform inference using transformers. `--model` should be the model file path or model_id, which will be automatically downloaded from modelscope, e.g., `Qwen/Qwen2.5-0.5B-Instruct`.
- Select `local_vllm` to use local files as models and start the vllm inference service. `--model` should be the model file path or model_id, which will be automatically downloaded from modelscope, e.g., `Qwen/Qwen2.5-0.5B-Instruct`.
- You can also use a custom API, refer to [Custom API Guide](./custom.md#custom-request-api).
- `--port`: The port for the local inference service, defaulting to 8877. This is only applicable to `local` and `local_vllm`.
- `--attn-implementation`: Attention implementation method, default is None, optional [flash_attention_2|eager|sdpa], only effective when `api` is `local`.
- `--api-key`: API key, optional.
- `--debug`: Output debug information.
19 changes: 10 additions & 9 deletions docs/zh/user_guides/backend/vlmevalkit_backend.md
@@ -194,6 +194,7 @@ ollama create llava -f ./Modelfile
::::{tab-set}
:::{tab-item} YAML Configuration File
```yaml
work_dir: outputs
eval_backend: VLMEvalKit
eval_config:
model:
@@ -208,8 +209,7 @@ eval_config:
- ChartQA_TEST
mode: all
limit: 20
reuse: true
work_dir: outputs
reuse: false
nproc: 16
```
:::
@@ -218,6 +218,7 @@ eval_config:
```python
task_cfg_dict = {
'work_dir': 'outputs',
'eval_backend': 'VLMEvalKit',
'eval_config':
{'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
@@ -230,8 +231,7 @@ task_cfg_dict = {
'temperature': 0.0,
'type': 'qwen-vl-chat'}
],
'reuse': True,
'work_dir': 'output'}}
'reuse': False}}
```
:::
::::
@@ -248,6 +248,7 @@ task_cfg_dict = {
```{code-block} yaml
:caption: eval_openai_api.yaml

work_dir: outputs
eval_backend: VLMEvalKit
eval_config:
model:
@@ -258,7 +259,7 @@ eval_config:
- ChartQA_TEST
mode: all
limit: 20
reuse: true
reuse: false
work_dir: outputs
nproc: 16
```
@@ -268,6 +269,7 @@ eval_config:
```python
task_cfg_dict = {
'work_dir': 'outputs',
'eval_backend': 'VLMEvalKit',
'eval_config':
{'data': ['SEEDBench_IMG', 'ChartQA_TEST'],
@@ -277,15 +279,15 @@ task_cfg_dict = {
{'name': 'qwen_chat',
'model_path': 'models/Qwen-VL-Chat'}
],
'reuse': True,
'work_dir': 'outputs'}}
'reuse': False}}
```
:::
::::
### Parameter Description
- `eval_backend`: Default value is `VLMEvalKit`, indicating the use of the VLMEvalKit evaluation backend.
- `work_dir`: String specifying the directory to save evaluation results, logs, and summaries; default value is `outputs`.
- `eval_config`: A dictionary containing the following fields:
- `data`: A list referencing the [currently supported datasets](#2-数据准备).
- `model`: A list of dictionaries; each dictionary can specify the following fields:
@@ -307,9 +309,8 @@ task_cfg_dict = {
- `limit`: Integer indicating the number of evaluation samples; default value is `None`, meaning all examples will be run.
- `reuse`: Boolean indicating whether to reuse previous evaluation results; if not, all temporary evaluation files will be deleted.
```{note}
For `ms-vlmeval>=0.0.11`, the parameter `rerun` has been renamed to `reuse`, with a default value of `False`.
For `ms-vlmeval>=0.0.11`, the parameter `rerun` has been renamed to `reuse`, with a default value of `False`. When set to `True`, you need to add `use_cache` in the `task_cfg_dict` to specify the cache directory to be used.
```
- `work_dir`: String specifying the directory to save evaluation results, logs, and summaries; default value is `outputs`.
- `nproc`: Integer indicating the number of parallel API calls.
- `nframe`: Integer indicating the number of video frames for video datasets; default value is `8`.
- `fps`: Integer indicating the frame rate for video datasets; default value is `-1`, meaning `nframe` is used; if set to a value greater than 0, `fps` is used to compute the number of video frames.
1 change: 1 addition & 0 deletions docs/zh/user_guides/stress_test/parameters.md
@@ -13,6 +13,7 @@
- Select `local` to use local files as the model and run inference with transformers. `--model` should be the model file path or a model_id, which will be automatically downloaded from modelscope, e.g., `Qwen/Qwen2.5-0.5B-Instruct`.
- Select `local_vllm` to use local files as the model and start the vllm inference service. `--model` should be the model file path or a model_id, which will be automatically downloaded from modelscope, e.g., `Qwen/Qwen2.5-0.5B-Instruct`.
- You can also use a custom API; refer to the [Custom API Guide](./custom.md/#自定义请求-api).
- `--port`: The port for the local inference service; defaults to 8877. Only applicable to `local` and `local_vllm`.
- `--attn-implementation`: Attention implementation; defaults to None. Options: [flash_attention_2|eager|sdpa]; only effective when `api` is `local`.
- `--api-key`: API key, optional.
- `--debug`: Output debug information.
2 changes: 1 addition & 1 deletion evalscope/backend/base.py
@@ -2,7 +2,7 @@
from typing import Union

from evalscope.config import TaskConfig
from evalscope.utils import yaml_to_dict
from evalscope.utils.io_utils import yaml_to_dict


class BackendManager:
4 changes: 2 additions & 2 deletions evalscope/backend/rag_eval/utils/clip.py
@@ -4,7 +4,7 @@
from langchain_core.embeddings import Embeddings
from PIL import Image
from transformers import AutoModel, AutoProcessor
from typing import List
from typing import List, Union

from evalscope.backend.rag_eval.utils.tools import PIL_to_base64, download_model
from evalscope.constants import HubType
@@ -86,7 +86,7 @@ def __init__(
self.transform = self.processor.image_processor
self.tokenizer = self.processor.tokenizer

def encode_text(self, batch_texts: List[str] | List[List[str]]):
def encode_text(self, batch_texts: Union[List[str], List[List[str]]]):
if isinstance(batch_texts[0], list):
batch_texts = [text for _, texts in enumerate(batch_texts) for text in texts]
# Ensure that the input texts are within the token limit
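The signature change above swaps the PEP 604 `X | Y` union syntax, which is only accepted at runtime on Python 3.10+, for `typing.Union`, presumably to keep the module importable on older interpreters. A standalone sketch of the same pattern:
```python
from typing import List, Union


def encode_text(batch_texts: Union[List[str], List[List[str]]]) -> List[str]:
    """Flatten a possibly nested batch of texts, mirroring the guard used in clip.py."""
    if batch_texts and isinstance(batch_texts[0], list):
        batch_texts = [text for texts in batch_texts for text in texts]
    return list(batch_texts)
```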
2 changes: 1 addition & 1 deletion evalscope/backend/rag_eval/utils/embedding.py
@@ -80,7 +80,7 @@ def encode_queries(self, queries: List[str], **kwargs) -> list[torch.Tensor]:
"""Embed query text. Compact mteb."""
raise NotImplementedError

def encode_corpus(self, corpus: List[str] | List[Dict[str, str]], **kwargs) -> list[torch.Tensor]:
def encode_corpus(self, corpus: Union[List[str], List[Dict[str, str]]], **kwargs) -> list[torch.Tensor]:
"""Embed search docs . Compact mteb."""
raise NotImplementedError

2 changes: 1 addition & 1 deletion evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -8,7 +8,7 @@
from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
from evalscope.utils import jsonl_to_list
from evalscope.utils.io_utils import jsonl_to_list
from evalscope.utils.logger import get_logger

logger = get_logger()
3 changes: 2 additions & 1 deletion evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -6,7 +6,8 @@

from evalscope.benchmarks import DataAdapter
from evalscope.metrics.metrics import exact_match, weighted_mean
from evalscope.utils import jsonl_to_list, normalize_score
from evalscope.utils import normalize_score
from evalscope.utils.io_utils import jsonl_to_list
from evalscope.utils.logger import get_logger

# flake8: noqa
3 changes: 2 additions & 1 deletion evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -5,7 +5,8 @@

from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import exact_match, weighted_mean
from evalscope.utils import jsonl_to_list, normalize_score
from evalscope.utils import normalize_score
from evalscope.utils.io_utils import jsonl_to_list
from evalscope.utils.logger import get_logger

# flake8: noqa
3 changes: 2 additions & 1 deletion evalscope/benchmarks/race/race_adapter.py
@@ -5,7 +5,8 @@

from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import exact_match, weighted_mean
from evalscope.utils import jsonl_to_list, normalize_score
from evalscope.utils import normalize_score
from evalscope.utils.io_utils import jsonl_to_list
from evalscope.utils.logger import get_logger

# flake8: noqa
3 changes: 2 additions & 1 deletion evalscope/config.py
@@ -9,7 +9,8 @@

from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
from evalscope.models.custom import CustomModel
from evalscope.utils import dict_to_yaml, gen_hash, json_to_dict, yaml_to_dict
from evalscope.utils import gen_hash
from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
from evalscope.utils.logger import get_logger

logger = get_logger()
27 changes: 0 additions & 27 deletions evalscope/constants.py
@@ -76,33 +76,6 @@ class ArenaMode:
PAIRWISE_BASELINE = 'pairwise_baseline'


class OutputsStructure:
LOGS_DIR = 'logs'
PREDICTIONS_DIR = 'predictions'
REVIEWS_DIR = 'reviews'
REPORTS_DIR = 'reports'
CONFIGS_DIR = 'configs'

def __init__(self, outputs_dir: str, is_make: bool = True):
self.outputs_dir = outputs_dir
self.logs_dir = os.path.join(outputs_dir, OutputsStructure.LOGS_DIR)
self.predictions_dir = os.path.join(outputs_dir, OutputsStructure.PREDICTIONS_DIR)
self.reviews_dir = os.path.join(outputs_dir, OutputsStructure.REVIEWS_DIR)
self.reports_dir = os.path.join(outputs_dir, OutputsStructure.REPORTS_DIR)
self.configs_dir = os.path.join(outputs_dir, OutputsStructure.CONFIGS_DIR)

if is_make:
self.create_directories()

def create_directories(self):
os.makedirs(self.outputs_dir, exist_ok=True)
os.makedirs(self.logs_dir, exist_ok=True)
os.makedirs(self.predictions_dir, exist_ok=True)
os.makedirs(self.reviews_dir, exist_ok=True)
os.makedirs(self.reports_dir, exist_ok=True)
os.makedirs(self.configs_dir, exist_ok=True)


class AnswerKeys:
ANSWER_ID = 'answer_id'
RAW_INPUT = 'raw_input'
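`OutputsStructure` is not deleted outright: as the evaluator diffs below show, it now lives in `evalscope.utils.io_utils` alongside the file I/O helpers consolidated by this commit, while general utilities stay in `evalscope.utils`. The resulting import layout, as reflected in the changed files:
```python
# File I/O helpers and the output-directory layout class now come from io_utils
from evalscope.utils.io_utils import (OutputsStructure, dict_to_yaml, dump_jsonl_data, json_to_dict,
                                      jsonl_to_list, yaml_to_dict)

# General-purpose helpers remain importable from the package root
from evalscope.utils import dict_torch_dtype_to_str, gen_hash, normalize_score
```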
5 changes: 3 additions & 2 deletions evalscope/evaluator/evaluator.py
@@ -11,10 +11,11 @@
from evalscope.benchmarks import DataAdapter
from evalscope.config import TaskConfig
from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
OutputsStructure, ReviewKeys)
ReviewKeys)
from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
from evalscope.tools.combine_reports import gen_table
from evalscope.utils import dict_torch_dtype_to_str, dump_jsonl_data, gen_hash, jsonl_to_list
from evalscope.utils import dict_torch_dtype_to_str, gen_hash
from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
from evalscope.utils.logger import get_logger

logger = get_logger()
2 changes: 1 addition & 1 deletion evalscope/evaluator/humaneval_evaluator.py
@@ -4,11 +4,11 @@
from tqdm import tqdm
from typing import List, Optional

from evalscope.constants import OutputsStructure
from evalscope.evaluator.evaluator import logger
from evalscope.models.model_adapter import BaseModelAdapter
from evalscope.tools.combine_reports import gen_table
from evalscope.utils import normalize_score
from evalscope.utils.io_utils import OutputsStructure


class HumanevalEvaluator(object):
2 changes: 1 addition & 1 deletion evalscope/evaluator/rating_eval.py
@@ -5,8 +5,8 @@
from typing import List, Union

from evalscope.constants import MetricMembers
from evalscope.utils import jsonl_to_list
from evalscope.utils.arena_utils import compute_elo
from evalscope.utils.io_utils import jsonl_to_list
from evalscope.utils.logger import get_logger

logger = get_logger()
3 changes: 2 additions & 1 deletion evalscope/evaluator/reviewer/auto_reviewer.py
@@ -12,8 +12,9 @@

from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
from evalscope.models.openai_model import OpenAIModel
from evalscope.utils import completion_parsers, dump_jsonl_data, jsonl_to_list, random_seeded_choice
from evalscope.utils import completion_parsers, random_seeded_choice
from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
from evalscope.utils.logger import get_logger

logger = get_logger()
3 changes: 2 additions & 1 deletion evalscope/perf/arguments.py
@@ -16,7 +16,7 @@ class Arguments:
attn_implementation: Optional[str] = None # Attention implementation, only for local inference
api: str = 'openai' # API to be used (default: 'openai')
tokenizer_path: Optional[str] = None # Path to the tokenizer
port: str = '8877' # Port number for the local API server
port: int = 8877 # Port number for the local API server

# Connection settings
url: str = 'http://127.0.0.1:8877/v1/chat/completions' # URL for the API connection
@@ -138,6 +138,7 @@ def add_argument(parser: argparse.ArgumentParser):

# Connection settings
parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
parser.add_argument('--headers', nargs='+', dest='headers', action=ParseKVAction, help='Extra HTTP headers')
parser.add_argument('--api-key', type=str, required=False, default='EMPTY', help='The API key for authentication')
parser.add_argument('--connect-timeout', type=int, default=120, help='The network connection timeout')
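The new `--port` flag matches the `port: int = 8877` dataclass field above and follows the usual argparse-into-dataclass pattern. A trimmed, self-contained sketch of that pattern (a hypothetical standalone parser, not the project's actual CLI wiring):
```python
import argparse
from dataclasses import dataclass


@dataclass
class PerfArgs:
    api: str = 'openai'   # 'openai', 'local', or 'local_vllm'
    port: int = 8877      # port for the local inference service (local / local_vllm only)
    url: str = 'http://127.0.0.1:8877/v1/chat/completions'


def parse_perf_args(argv=None) -> PerfArgs:
    parser = argparse.ArgumentParser()
    parser.add_argument('--api', type=str, default='openai')
    parser.add_argument('--port', type=int, default=8877, help='The port for local inference')
    parser.add_argument('--url', type=str, default='http://127.0.0.1:8877/v1/chat/completions')
    return PerfArgs(**vars(parser.parse_args(argv)))


if __name__ == '__main__':
    args = parse_perf_args(['--api', 'local_vllm', '--port', '8000'])
    print(args)
```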
7 changes: 2 additions & 5 deletions evalscope/perf/main.py
@@ -8,7 +8,7 @@
from evalscope.perf.benchmark import benchmark
from evalscope.perf.utils.db_util import get_output_path
from evalscope.perf.utils.handler import add_signal_handlers
from evalscope.utils.logger import get_logger
from evalscope.utils.logger import configure_logging, get_logger
from evalscope.utils.utils import seed_everything

logger = get_logger()
@@ -23,10 +23,7 @@ def run_perf_benchmark(args):

# Setup logger and output
args.outputs_dir = get_output_path(args)
get_logger(log_file=os.path.join(args.outputs_dir, 'benchmark.log'), force=True)

if args.debug:
get_logger(log_level=logging.DEBUG, force=True)
configure_logging(args.debug, os.path.join(args.outputs_dir, 'benchmark.log'))

logger.info('Starting benchmark...')
logger.info(args)
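`configure_logging` replaces the two `get_logger(...)` calls removed above; its body is not part of this diff. A plausible sketch, assuming it only combines the previous behavior (file logging, plus DEBUG level when `--debug` is set) behind a single call:
```python
import logging

from evalscope.utils.logger import get_logger


def configure_logging(debug: bool, log_file: str) -> None:
    """Hypothetical sketch: send logs to a file and raise verbosity when debug is enabled."""
    log_level = logging.DEBUG if debug else logging.INFO
    get_logger(log_file=log_file, log_level=log_level, force=True)
```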