Refactor setup and add UTs (#85)

* add opencompass cmd parser * fix dry-run * update * refactor for backend & add backend module * add example for oc * update * update readme and example * temp * update opencompass parser * add opencompass cli and chat_medium collection * update cli and eval_api * add OpenCompassArguments for cli * update eval_api for models * update version * add check end for ms-opencompass * add backend args merge and datasets filter * add models and api meta template * assert models * add tmpfile for task config * fix datasets in self.args.config * update * pop dataset_name key in datasets * set meta_template to None for mmlu, ceval, ... * fix models path * update test task * update eval task * update test tasks * add debug info * add example for swift eval * add download data * update * update import for eval_datasets * add json config for toolbench eval * add entry task in run.py; update example_eval_swift_openai_api * add yaml and json config * update example * update gsm8k * update example * update summarizer * support opencompass backend in Summarizer * update summarizer * update example * update example; set dataset_dir in config.py * update version * update args name * update example * set mmlu to 0-shot for swift * add api key for openai api * update eval datasets * add limit * add limit example * update datasets * update readme for oc backend * update example * update readme for oc backend * add readme for en * fix eval_config assertion * fix eval_backend assertion * fix eval_backend assertion * fix eval_backend assertion * add ut for swift-eval * add test run all * update tests * add logger for ut * update * update pypi source * add debug * update * fix swift deploy subprocess * update * add check service * update * update * fix check swift server * update terminate process * update * update example * update example * update example * fix eval_backend checking * update version * update UTs and examples * refactor setup * add eval_backend and eval_config in config.py * 
fix pr issues
modelscope · Jul 28, 2024 · 1056a56 · 1056a56
1 parent d15017a
commit 1056a56
Show file tree

Hide file tree

Showing 19 changed files with 218 additions and 39 deletions.
diff --git a/examples/example_eval_swift_openai_api.py b/examples/example_eval_swift_openai_api.py
@@ -45,11 +45,12 @@ def run_swift_eval():
         eval_backend='OpenCompass',
         eval_config={'datasets': ['mmlu', 'ceval', 'ARC_c', 'gsm8k'],
                      'models': [
-                         {'path': 'llama3-8b-instruct', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'},
-                         {'path': 'llama3-8b', 'is_chat': False, 'key': 'EMPTY', 'openai_api_base': 'http://127.0.0.1:8001/v1/completions'}
+                         {'path': 'llama3-8b-instruct', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions', 'batch_size': 100},
+                         {'path': 'llama3-8b', 'is_chat': False, 'key': 'EMPTY', 'openai_api_base': 'http://127.0.0.1:8001/v1/completions', 'batch_size': 100}
                      ],
                      'work_dir': 'outputs/llama3_eval_result',
-                     'limit': 5,  # Could be int/float/str, e.g. 5 or 5.0 or `[10:20]`, default to None, it means run all examples
+                     # Could be int/float/str, e.g. 5 or 5.0 or `[10:20]`; defaults to None, which means all examples are run
+                     'limit': 10,
                      },
     )
 

diff --git a/examples/example_eval_vlm_local.py b/examples/example_eval_vlm_local.py
@@ -2,7 +2,7 @@
 
 """
 1. Installation
-eval-scope: pip install llmuses[vlmeval]>=0.4.0
+eval-scope: pip install 'llmuses[vlmeval]>=0.4.3'
 
 2. Deploy judge model
 
@@ -15,6 +15,7 @@
 
 logger = get_logger()
 
+
 def run_swift_eval():
 
     # List all datasets

diff --git a/examples/example_eval_vlm_swift.py b/examples/example_eval_vlm_swift.py
@@ -2,7 +2,7 @@
 
 """
 1. Installation
-eval-scope: pip install llmuses[vlmeval]>=0.4.0
+eval-scope: pip install 'llmuses[vlmeval]>=0.4.3'
 
 2. Deploy judge model
 

diff --git a/llmuses/cli/cli.py b/llmuses/cli/cli.py
@@ -2,6 +2,8 @@
 
 import argparse
 from llmuses.cli.start_perf import PerfBenchCMD
+
+
 def run_cmd():
     parser = argparse.ArgumentParser(
         'LLMUses Command Line tool', usage='llmuses <command> [<args>]')
@@ -17,7 +19,8 @@ def run_cmd():
 
     cmd = args.func(args)
     cmd.execute()
-# --url 'http://11.122.132.12:8000/v1/chat/completions' --parallel 1 --model 'qwen' --dataset 'datasets/LongAlpaca-12k.jsonl'  --log-every-n-query 1 --read-timeout=120  --parser 'openai.longalpaca_12k_qwen.py' -n 10 --max-prompt-length 128000 --tokenizer-path ''
+    # --url 'http://11.122.132.12:8000/v1/chat/completions' --parallel 1 --model 'qwen' --dataset 'datasets/LongAlpaca-12k.jsonl'  --log-every-n-query 1 --read-timeout=120  --parser 'openai.longalpaca_12k_qwen.py' -n 10 --max-prompt-length 128000 --tokenizer-path ''
+
 
 if __name__ == '__main__':
-    run_cmd()
+    run_cmd()
diff --git a/llmuses/config.py b/llmuses/config.py
@@ -47,6 +47,8 @@ class TaskConfig:
     dataset_hub: str = 'ModelScope'
     dataset_dir: str = DEFAULT_ROOT_CACHE_DIR
     limit: int = None
+    eval_backend: str = 'Native'
+    eval_config: dict = field(default_factory=dict)
 
     # def __post_init__(self):
     #     self.registry_tasks = {

diff --git a/llmuses/run.py b/llmuses/run.py
@@ -188,11 +188,16 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig]]) -> Union[
         raise ValueError('** Args: Please provide a valid task config. **')
 
     # Check and run evaluation backend
-    if task_cfg.get('eval_backend') != EvalBackend.NATIVE.value:
-        eval_backend = task_cfg.get('eval_backend')
-        eval_config: Union[str, dict] = task_cfg.get('eval_config')
+    if task_cfg.get('eval_backend') is None:
+        task_cfg['eval_backend'] = EvalBackend.NATIVE.value
 
-        assert eval_config, f'Please provide the eval task config for evaluation backend {eval_backend}'
+    eval_backend = task_cfg.get('eval_backend')
+    eval_config: Union[str, dict] = task_cfg.get('eval_config')
+
+    if eval_backend != EvalBackend.NATIVE.value:
+
+        if eval_config is None:
+            logger.warning(f'Got eval_backend {eval_backend}, but eval_config is not provided.')
 
         if eval_backend == EvalBackend.OPEN_COMPASS.value:
             from llmuses.backend.opencompass import OpenCompassBackendManager

diff --git a/llmuses/version.py b/llmuses/version.py
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-__version__ = '0.4.0'
-__release_datetime__ = '2024-06-27 08:00:00'
+__version__ = '0.4.3'
+__release_datetime__ = '2024-07-28 08:00:00'
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1 @@
+requirements/framework.txt
diff --git a/requirements/requirements.txt → requirements/framework.txt b/requirements/requirements.txt → requirements/framework.txt
@@ -1,4 +1,4 @@
-# pip3 install --no-dependencies -r requirements.txt
+# pip3 install --no-dependencies -r requirements/framework.txt
 torch
 absl-py
 #auto-gptq
@@ -29,7 +29,7 @@ simple-ddl-parser
 tabulate
 tiktoken
 tqdm
-transformers
+transformers>=4.33,<4.43
 transformers_stream_generator
 jieba
 rouge-chinese

diff --git a/requirements/requirements_inner.txt → requirements/inner.txt b/requirements/requirements_inner.txt → requirements/inner.txt
@@ -22,5 +22,5 @@ seaborn
 simple-ddl-parser
 streamlit
 tqdm
-transformers
+transformers>=4.33,<4.43
 transformers_stream_generator
diff --git a/requirements/opencompass.txt b/requirements/opencompass.txt
@@ -0,0 +1 @@
+ms-opencompass
diff --git a/requirements/vlmeval.txt b/requirements/vlmeval.txt
@@ -0,0 +1 @@
+ms-vlmeval
diff --git a/setup.py b/setup.py
@@ -126,22 +126,30 @@ def pack_resource():
 
     proj_dir = root_dir + 'llmuses/'
     shutil.copytree('llmuses', proj_dir)
-    shutil.copy('requirements/requirements.txt', 'package/requirements.txt')
+    shutil.copytree('requirements', root_dir + 'requirements')
+    shutil.copy('requirements.txt', root_dir + 'requirements.txt')
     # shutil.copy('./MANIFEST.in', 'package/MANIFEST.in')
-    shutil.copy('./README.md', 'package/README.md')
+    shutil.copy('./README.md', root_dir + 'README.md')
 
 
 if __name__ == '__main__':
-    print('Usage: python3 setup.py bdist_wheel')
+    print('Usage: python3 setup.py bdist_wheel or pip3 install .[opencompass] for test')
 
     pack_resource()
     os.chdir('package')
-    install_requires, deps_link = parse_requirements('requirements.txt')
+    install_requires, deps_link = parse_requirements('requirements/framework.txt')
+
+    extra_requires = {}
+    all_requires = []
+    extra_requires['opencompass'], _ = parse_requirements('requirements/opencompass.txt')
+    extra_requires['vlmeval'], _ = parse_requirements('requirements/vlmeval.txt')
+    extra_requires['inner'], _ = parse_requirements('requirements/inner.txt')
+
+    all_requires.extend(install_requires)
+    all_requires.extend(extra_requires['opencompass'])
+    all_requires.extend(extra_requires['vlmeval'])
+    extra_requires['all'] = all_requires
 
-    extras_requires={}
-    extras_requires['opencompass']='ms-opencompass'
-    extras_requires['vlmeval']='ms-vlmeval'
-
     setup(
         name='llmuses',
         version=get_version(),
@@ -161,17 +169,16 @@ def pack_resource():
             'License :: OSI Approved :: Apache Software License',
             'Operating System :: OS Independent',
             'Programming Language :: Python :: 3',
-            'Programming Language :: Python :: 3.7',
             'Programming Language :: Python :: 3.8',
             'Programming Language :: Python :: 3.9',
             'Programming Language :: Python :: 3.10',
         ],
-        python_requires='>=3.7',
+        python_requires='>=3.8',
         zip_safe=False,
         install_requires=install_requires,
         entry_points={  
             'console_scripts': ['llmuses=llmuses.cli.cli:run_cmd']
         },
         dependency_links=deps_link,
-        extras_require=extras_requires
+        extras_require=extra_requires,
     )
diff --git a/tests/cli/test_run.py b/tests/cli/test_run.py
@@ -11,6 +11,7 @@
 class TestRun(unittest.TestCase):
 
     def setUp(self) -> None:
+        logger.info(f'Init env for llmuses native run UTs ...\n')
         self._check_env('llmuses')
 
     def tearDown(self) -> None:
@@ -36,6 +37,7 @@ def test_run_simple_eval(self):
                      f'--datasets {datasets} ' \
                      f'--limit {limit}'
 
+        logger.info(f'Start to run command: {cmd_simple}')
         run_res = subprocess.run(cmd_simple, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
         assert run_res.returncode == 0, f'Failed to run command: {cmd_simple}'
@@ -58,6 +60,7 @@ def test_run_eval_with_args(self):
                         f'--generation-config do_sample=false,temperature=0.0 ' \
                         f"""--dataset-args \'{dataset_args}\' """
 
+        logger.info(f'Start to run command: {cmd_with_args}')
         run_res = subprocess.run(cmd_with_args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
         assert run_res.returncode == 0, f'Failed to run command: {cmd_with_args}'

diff --git a/tests/swift/__init__.py b/tests/swift/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
diff --git a/tests/swift/test_run_swift_eval.py b/tests/swift/test_run_swift_eval.py
@@ -0,0 +1,143 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import json
+import time
+import requests
+import subprocess
+import unittest
+
+from llmuses.backend.opencompass import OpenCompassBackendManager
+from llmuses.run import run_task
+from llmuses.summarizer import Summarizer
+from llmuses.utils import test_level_list, is_module_installed
+
+from llmuses.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+DEFAULT_CHAT_MODEL_URL = 'http://127.0.0.1:8000/v1/chat/completions'
+DEFAULT_BASE_MODEL_URL = 'http://127.0.0.1:8001/v1/completions'
+
+
+class TestRunSwiftEval(unittest.TestCase):
+
+    def setUp(self) -> None:
+        logger.info(f'Init env for swift-eval UTs ...\n')
+
+        self.model_name = 'llama3-8b-instruct'
+        assert is_module_installed('llmuses'), 'Please install `llmuses` from pypi or source code.'
+
+        logger.warning('Note: installing ms-opencompass ...')
+        subprocess.run('pip3 install ms-opencompass -U', shell=True, check=True)
+
+        logger.warning('Note: installing ms-swift ...')
+        subprocess.run('pip3 install ms-swift -U', shell=True, check=True)
+
+        logger.warning('vllm not installed, use native swift deploy service instead.')
+
+        logger.info(f'\nStaring run swift deploy ...')
+        self.process_swift_deploy = subprocess.Popen(f'swift deploy --model_type {self.model_name}',
+                                                     text=True, shell=True,
+                                                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+        self.all_datasets = OpenCompassBackendManager.list_datasets()
+        assert len(self.all_datasets) > 0, f'Failed to list datasets from OpenCompass backend: {self.all_datasets}'
+
+    def tearDown(self) -> None:
+        # Stop the swift deploy model service
+        logger.warning(f'\nStopping swift deploy ...')
+        self.process_swift_deploy.terminate()
+        self.process_swift_deploy.wait()
+        logger.info(f'Process swift-deploy terminated successfully.')
+
+    @staticmethod
+    def find_and_kill_pid(pids: list):
+        if len(pids) > 0:
+            for pid in pids:
+                subprocess.run(["kill", str(pid)])
+                logger.warning(f"Killed process {pid}.")
+        else:
+            logger.info(f"No pids found.")
+
+    @staticmethod
+    def find_and_kill_service(service_name):
+        try:
+            # find pid
+            result = subprocess.run(
+                ["ps", "-ef"], stdout=subprocess.PIPE, text=True
+            )
+
+            lines = result.stdout.splitlines()
+            pids = []
+            for line in lines:
+                if service_name in line and "grep" not in line:
+                    parts = line.split()
+                    pid = parts[1]
+                    pids.append(pid)
+
+            if not pids:
+                logger.info(f"No process found for {service_name}.")
+            else:
+                for pid in pids:
+                    subprocess.run(["kill", pid])
+                    logger.warning(f"Killed process {pid} for service {service_name}.")
+        except Exception as e:
+            logger.error(f"An error occurred: {e}")
+
+    @staticmethod
+    def check_service_status(url: str, data: dict, retries: int = 20, delay: int = 10):
+        for i in range(retries):
+            try:
+                logger.info(f"Attempt {i + 1}: Checking service at {url} ...")
+                response = requests.post(url,
+                                         data=json.dumps(data),
+                                         headers={'Content-Type': 'application/json'},
+                                         timeout=30)
+                if response.status_code == 200:
+                    logger.info(f"Service at {url} is available !\n\n")
+                    return True
+                else:
+                    logger.info(f"Service at {url} returned status code {response.status_code}.")
+            except requests.exceptions.RequestException as e:
+                logger.info(f"Attempt {i + 1}: An error occurred: {e}")
+
+            time.sleep(delay)
+
+        logger.info(f"Service at {url} is not available after {retries} retries.")
+        return False
+
+    @unittest.skipUnless(1 in test_level_list(), 'skip test in current test level')
+    def test_run_task(self):
+        # Prepare the config
+        task_cfg = dict(
+            eval_backend='OpenCompass',
+            eval_config={'datasets': ['mmlu', 'ceval', 'ARC_c', 'gsm8k'],
+                         'models': [
+                             {'path': 'llama3-8b-instruct',
+                              'openai_api_base': DEFAULT_CHAT_MODEL_URL,
+                              'batch_size': 8},
+                         ],
+                         'work_dir': 'outputs/llama3_eval_result',
+                         'reuse': None,      # string, `latest` or timestamp, e.g. `20230516_144254`, default to None
+                         'limit': '[2:5]',   # string or int or float, e.g. `[2:5]`, 5, 5.0, default to None, it means run all examples
+                         },
+        )
+
+        # Check the service status
+        data = {'model': self.model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
+        assert self.check_service_status(DEFAULT_CHAT_MODEL_URL, data=data), f'Failed to check service status: {DEFAULT_CHAT_MODEL_URL}'
+
+        # Submit the task
+        logger.info(f'Start to run UT with cfg: {task_cfg}')
+        run_task(task_cfg=task_cfg)
+
+        # Get the final report with summarizer
+        report_list = Summarizer.get_report_from_cfg(task_cfg)
+        logger.info(f'>>The report list:\n{report_list}')
+
+        assert len(report_list) > 0, f'Failed to get report list: {report_list}'
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_run_all.py b/tests/test_run_all.py
@@ -0,0 +1,12 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import subprocess
+
+if __name__ == '__main__':
+    cmd = f'TEST_LEVEL_LIST=0,1 python3 -m unittest discover .'
+    run_res = subprocess.run(cmd, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    if run_res.returncode == 0:
+        print(f'>>test_run_all stdout: {run_res.stdout}')
+    else:
+        print(f'>>test_run_all stderr: {run_res.stderr}')
diff --git a/tests/vlm/__init__.py b/tests/vlm/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# Copyright (c) Alibaba, Inc. and its affiliates.