Refactor setup and add UTs (#85)
* add opencompass cmd parser

* fix dry-run

* update

* refactor for backend & add backend module

* add example for oc

* update

* update readme and example

* temp

* update opencompass parser

* add opencompass cli and chat_medium collection

* update cli and eval_api

* add OpenCompassArguments for cli

* update eval_api for models

* update version

* add check end for ms-opencompass

* add backend args merge and datasets filter

* add models and api meta template

* assert models

* add tmpfile for task config

* fix datasets in self.args.config

* update

* pop dataset_name key in datasets

* set meta_template to None for mmlu, ceval, ...

* fix models path

* update test task

* update eval task

* update test tasks

* add debug info

* add example for swift eval

* add download data

* update

* update import for eval_datasets

* add json config for toolbench eval

* add entry task in run.py; update example_eval_swift_openai_api

* add yaml and json config

* update example

* update gsm8k

* update example

* update summarizer

* support opencompass backend in Summarizer

* update summarizer

* update example

* update example; set dataset_dir in config.py

* update version

* update args name

* update example

* set mmlu to 0-shot for swift

* add api key for openai api

* update eval datasets

* add limit

* add limit example

* update datasets

* update readme for oc backend

* update example

* update readme for oc backend

* add readme for en

* fix eval_config assertion

* fix eval_backend assertion

* fix eval_backend assertion

* fix eval_backend assertion

* add ut for swift-eval

* add test run all

* update tests

* add logger for ut

* update

* update pypi source

* add debug

* update

* fix swift deploy subprocess

* update

* add check service

* update

* update

* fix check swift server

* update terminate process

* update

* update example

* update example

* update example

* fix eval_backend checking

* update version

* update UTs and examples

* refactor setup

* add eval_backend and eval_config in config.py

* fix pr issues
wangxingjun778 authored Jul 28, 2024
1 parent d15017a commit 1056a56
Showing 19 changed files with 218 additions and 39 deletions.
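In short, this commit makes the evaluation backend pluggable: TaskConfig gains eval_backend/eval_config fields, run.py dispatches on them, and setup.py ships each backend as a pip extra. A minimal end-to-end sketch, condensed from the examples and tests below (the model path and URL are the sample values used in this diff, not requirements):

from llmuses.run import run_task
from llmuses.summarizer import Summarizer

task_cfg = dict(
    eval_backend='OpenCompass',          # or 'Native' (the default)
    eval_config={
        'datasets': ['mmlu', 'gsm8k'],
        'models': [{'path': 'llama3-8b-instruct',
                    'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions',
                    'batch_size': 8}],
        'work_dir': 'outputs/llama3_eval_result',
        'limit': 5,                      # only evaluate the first 5 examples per dataset
    },
)

run_task(task_cfg=task_cfg)
report_list = Summarizer.get_report_from_cfg(task_cfg)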
7 changes: 4 additions & 3 deletions examples/example_eval_swift_openai_api.py
@@ -45,11 +45,12 @@ def run_swift_eval():
         eval_backend='OpenCompass',
         eval_config={'datasets': ['mmlu', 'ceval', 'ARC_c', 'gsm8k'],
                      'models': [
-                        {'path': 'llama3-8b-instruct', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'},
-                        {'path': 'llama3-8b', 'is_chat': False, 'key': 'EMPTY', 'openai_api_base': 'http://127.0.0.1:8001/v1/completions'}
+                        {'path': 'llama3-8b-instruct', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions', 'batch_size': 100},
+                        {'path': 'llama3-8b', 'is_chat': False, 'key': 'EMPTY', 'openai_api_base': 'http://127.0.0.1:8001/v1/completions', 'batch_size': 100}
                      ],
                      'work_dir': 'outputs/llama3_eval_result',
-                     'limit': 5, # Could be int/float/str, e.g. 5 or 5.0 or `[10:20]`, default to None, it means run all examples
+                     # Could be int/float/str, e.g. 5 or 5.0 or `[10:20]`, default to None, it means run all examples
+                     'limit': 10,
                      },
     )

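As a side note on the limit knob above: the inline comment admits several value shapes. A hedged sketch of the accepted forms (the precise slicing semantics live in the OpenCompass backend and are inferred from the comments in this diff):

eval_config['limit'] = None       # default: evaluate every example
eval_config['limit'] = 5          # int: cap the number of examples per dataset
eval_config['limit'] = 5.0        # float: also accepted per the comment; exact meaning is backend-defined
eval_config['limit'] = '[10:20]'  # str: a slice-style range of examples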
3 changes: 2 additions & 1 deletion examples/example_eval_vlm_local.py
@@ -2,7 +2,7 @@
 
 """
 1. Installation
-eval-scope: pip install llmuses[vlmeval]>=0.4.0
+eval-scope: pip install llmuses[vlmeval]>=0.4.3
 2. Deploy judge model
@@ -15,6 +15,7 @@
 
 logger = get_logger()
 
+
 def run_swift_eval():
 
     # List all datasets
2 changes: 1 addition & 1 deletion examples/example_eval_vlm_swift.py
@@ -2,7 +2,7 @@
 
 """
 1. Installation
-eval-scope: pip install llmuses[vlmeval]>=0.4.0
+eval-scope: pip install llmuses[vlmeval]>=0.4.3
 2. Deploy judge model
7 changes: 5 additions & 2 deletions llmuses/cli/cli.py
@@ -2,6 +2,8 @@
 
 import argparse
 from llmuses.cli.start_perf import PerfBenchCMD
+
+
 def run_cmd():
     parser = argparse.ArgumentParser(
         'LLMUses Command Line tool', usage='llmuses <command> [<args>]')
@@ -17,7 +19,8 @@ def run_cmd():
 
     cmd = args.func(args)
     cmd.execute()
-    # --url 'http://11.122.132.12:8000/v1/chat/completions' --parallel 1 --model 'qwen' --dataset 'datasets/LongAlpaca-12k.jsonl' --log-every-n-query 1 --read-timeout=120 --parser 'openai.longalpaca_12k_qwen.py' -n 10 --max-prompt-length 128000 --tokenizer-path ''
+    # --url 'http://11.122.132.12:8000/v1/chat/completions' --parallel 1 --model 'qwen' --dataset 'datasets/LongAlpaca-12k.jsonl' --log-every-n-query 1 --read-timeout=120 --parser 'openai.longalpaca_12k_qwen.py' -n 10 --max-prompt-length 128000 --tokenizer-path ''
+
 
 if __name__ == '__main__':
-    run_cmd()
+    run_cmd()
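For orientation, setup.py registers this parser as the llmuses console script, and the comment preserved above records a sample perf-benchmark invocation. The subcommand name wired up by PerfBenchCMD is not visible in this hunk, so `perf` below is an assumption; the flags are taken from the comment, pointed at a local service:

llmuses perf --url 'http://127.0.0.1:8000/v1/chat/completions' --parallel 1 --model 'qwen' \
    --dataset 'datasets/LongAlpaca-12k.jsonl' --log-every-n-query 1 --read-timeout=120 -n 10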
2 changes: 2 additions & 0 deletions llmuses/config.py
@@ -47,6 +47,8 @@ class TaskConfig:
     dataset_hub: str = 'ModelScope'
     dataset_dir: str = DEFAULT_ROOT_CACHE_DIR
     limit: int = None
+    eval_backend: str = 'Native'
+    eval_config: dict = field(default_factory=dict)
 
     # def __post_init__(self):
     #     self.registry_tasks = {
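A brief aside on the eval_config default: dataclasses reject bare mutable defaults, which is why the new field goes through field(default_factory=dict). A standalone illustration (plain Python behaviour, not project code):

from dataclasses import dataclass, field

@dataclass
class Cfg:
    # `eval_config: dict = {}` would raise ValueError at class-definition time
    eval_config: dict = field(default_factory=dict)  # fresh dict per instance

a, b = Cfg(), Cfg()
a.eval_config['datasets'] = ['gsm8k']
assert b.eval_config == {}  # instances do not share the dict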
13 changes: 9 additions & 4 deletions llmuses/run.py
@@ -188,11 +188,16 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig]]) -> Union[
         raise ValueError('** Args: Please provide a valid task config. **')
 
     # Check and run evaluation backend
-    if task_cfg.get('eval_backend') != EvalBackend.NATIVE.value:
-        eval_backend = task_cfg.get('eval_backend')
-        eval_config: Union[str, dict] = task_cfg.get('eval_config')
+    if task_cfg.get('eval_backend') is None:
+        task_cfg['eval_backend'] = EvalBackend.NATIVE.value
 
-        assert eval_config, f'Please provide the eval task config for evaluation backend {eval_backend}'
+    eval_backend = task_cfg.get('eval_backend')
+    eval_config: Union[str, dict] = task_cfg.get('eval_config')
+
+    if eval_backend != EvalBackend.NATIVE.value:
+
+        if eval_config is None:
+            logger.warning(f'Got eval_backend {eval_backend}, but eval_config is not provided.')
 
         if eval_backend == EvalBackend.OPEN_COMPASS.value:
             from llmuses.backend.opencompass import OpenCompassBackendManager
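For reference, the backend names used throughout ('Native' in config.py, 'OpenCompass' in the examples) correspond to the EvalBackend members referenced above. A plausible shape of that enum, inferred from the member names and string values visible in this diff (the actual definition is not shown here):

from enum import Enum

class EvalBackend(Enum):
    NATIVE = 'Native'             # matches TaskConfig's default
    OPEN_COMPASS = 'OpenCompass'  # matches eval_backend in the examples and UTs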
4 changes: 2 additions & 2 deletions llmuses/version.py
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-__version__ = '0.4.0'
-__release_datetime__ = '2024-06-27 08:00:00'
+__version__ = '0.4.3'
+__release_datetime__ = '2024-07-28 08:00:00'
1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
requirements/framework.txt
4 changes: 2 additions & 2 deletions requirements/requirements.txt → requirements/framework.txt
@@ -1,4 +1,4 @@
-# pip3 install --no-dependencies -r requirements.txt
+# pip3 install --no-dependencies -r requirements/framework.txt
 torch
 absl-py
 #auto-gptq
@@ -29,7 +29,7 @@ simple-ddl-parser
 tabulate
 tiktoken
 tqdm
-transformers
+transformers>=4.33,<4.43
 transformers_stream_generator
 jieba
 rouge-chinese
2 changes: 1 addition & 1 deletion
@@ -22,5 +22,5 @@ seaborn
 simple-ddl-parser
 streamlit
 tqdm
-transformers
+transformers>=4.33,<4.43
 transformers_stream_generator
1 change: 1 addition & 0 deletions requirements/opencompass.txt
@@ -0,0 +1 @@
ms-opencompass
1 change: 1 addition & 0 deletions requirements/vlmeval.txt
@@ -0,0 +1 @@
ms-vlmeval
29 changes: 18 additions & 11 deletions setup.py
@@ -126,22 +126,30 @@ def pack_resource():
 
     proj_dir = root_dir + 'llmuses/'
     shutil.copytree('llmuses', proj_dir)
-    shutil.copy('requirements/requirements.txt', 'package/requirements.txt')
+    shutil.copytree('requirements', root_dir + 'requirements')
+    shutil.copy('requirements.txt', root_dir + 'requirements.txt')
     # shutil.copy('./MANIFEST.in', 'package/MANIFEST.in')
-    shutil.copy('./README.md', 'package/README.md')
+    shutil.copy('./README.md', root_dir + 'README.md')
 
 
 if __name__ == '__main__':
-    print('Usage: python3 setup.py bdist_wheel')
+    print('Usage: python3 setup.py bdist_wheel or pip3 install .[opencompass] for test')
 
     pack_resource()
     os.chdir('package')
-    install_requires, deps_link = parse_requirements('requirements.txt')
+    install_requires, deps_link = parse_requirements('requirements/framework.txt')
+
+    extra_requires = {}
+    all_requires = []
+    extra_requires['opencompass'], _ = parse_requirements('requirements/opencompass.txt')
+    extra_requires['vlmeval'], _ = parse_requirements('requirements/vlmeval.txt')
+    extra_requires['inner'], _ = parse_requirements('requirements/inner.txt')
+
+    all_requires.extend(install_requires)
+    all_requires.extend(extra_requires['opencompass'])
+    all_requires.extend(extra_requires['vlmeval'])
+    extra_requires['all'] = all_requires
 
-    extras_requires={}
-    extras_requires['opencompass']='ms-opencompass'
-    extras_requires['vlmeval']='ms-vlmeval'
-
     setup(
         name='llmuses',
         version=get_version(),
@@ -161,17 +169,16 @@ def pack_resource():
             'License :: OSI Approved :: Apache Software License',
             'Operating System :: OS Independent',
             'Programming Language :: Python :: 3',
-            'Programming Language :: Python :: 3.7',
             'Programming Language :: Python :: 3.8',
             'Programming Language :: Python :: 3.9',
             'Programming Language :: Python :: 3.10',
         ],
-        python_requires='>=3.7',
+        python_requires='>=3.8',
         zip_safe=False,
         install_requires=install_requires,
         entry_points={
             'console_scripts': ['llmuses=llmuses.cli.cli:run_cmd']
         },
         dependency_links=deps_link,
-        extras_require=extras_requires
+        extras_require=extra_requires,
     )
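With extras now parsed from the per-backend requirement files, installation presumably follows the usual pip extras pattern (sketch; 'all' aggregates core plus the two public backends, while 'inner' stays a separate extra per the code above):

pip3 install llmuses                 # core deps from requirements/framework.txt
pip3 install 'llmuses[opencompass]'  # adds ms-opencompass
pip3 install 'llmuses[vlmeval]'      # adds ms-vlmeval
pip3 install 'llmuses[all]'          # core + opencompass + vlmeval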
3 changes: 3 additions & 0 deletions tests/cli/test_run.py
@@ -11,6 +11,7 @@
 class TestRun(unittest.TestCase):
 
     def setUp(self) -> None:
+        logger.info(f'Init env for llmuses native run UTs ...\n')
         self._check_env('llmuses')
 
     def tearDown(self) -> None:
@@ -36,6 +37,7 @@ def test_run_simple_eval(self):
                      f'--datasets {datasets} ' \
                      f'--limit {limit}'
 
+        logger.info(f'Start to run command: {cmd_simple}')
         run_res = subprocess.run(cmd_simple, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
         assert run_res.returncode == 0, f'Failed to run command: {cmd_simple}'
@@ -58,6 +60,7 @@ def test_run_eval_with_args(self):
                        f'--generation-config do_sample=false,temperature=0.0 ' \
                        f"""--dataset-args \'{dataset_args}\' """
 
+        logger.info(f'Start to run command: {cmd_with_args}')
         run_res = subprocess.run(cmd_with_args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
         assert run_res.returncode == 0, f'Failed to run command: {cmd_with_args}'
1 change: 1 addition & 0 deletions tests/swift/__init__.py
@@ -0,0 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
143 changes: 143 additions & 0 deletions tests/swift/test_run_swift_eval.py
@@ -0,0 +1,143 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
import json
import time
import requests
import subprocess
import unittest

from llmuses.backend.opencompass import OpenCompassBackendManager
from llmuses.run import run_task
from llmuses.summarizer import Summarizer
from llmuses.utils import test_level_list, is_module_installed

from llmuses.utils.logger import get_logger

logger = get_logger(__name__)

DEFAULT_CHAT_MODEL_URL = 'http://127.0.0.1:8000/v1/chat/completions'
DEFAULT_BASE_MODEL_URL = 'http://127.0.0.1:8001/v1/completions'


class TestRunSwiftEval(unittest.TestCase):

    def setUp(self) -> None:
        logger.info(f'Init env for swift-eval UTs ...\n')

        self.model_name = 'llama3-8b-instruct'
        assert is_module_installed('llmuses'), 'Please install `llmuses` from pypi or source code.'

        logger.warning('Note: installing ms-opencompass ...')
        subprocess.run('pip3 install ms-opencompass -U', shell=True, check=True)

        logger.warning('Note: installing ms-swift ...')
        subprocess.run('pip3 install ms-swift -U', shell=True, check=True)

        logger.warning('vllm not installed, use native swift deploy service instead.')

        logger.info(f'\nStarting to run swift deploy ...')
        self.process_swift_deploy = subprocess.Popen(f'swift deploy --model_type {self.model_name}',
                                                     text=True, shell=True,
                                                     stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        self.all_datasets = OpenCompassBackendManager.list_datasets()
        assert len(self.all_datasets) > 0, f'Failed to list datasets from OpenCompass backend: {self.all_datasets}'

    def tearDown(self) -> None:
        # Stop the swift deploy model service
        logger.warning(f'\nStopping swift deploy ...')
        self.process_swift_deploy.terminate()
        self.process_swift_deploy.wait()
        logger.info(f'Process swift-deploy terminated successfully.')

    @staticmethod
    def find_and_kill_pid(pids: list):
        if len(pids) > 0:
            for pid in pids:
                subprocess.run(["kill", str(pid)])
                logger.warning(f"Killed process {pid}.")
        else:
            logger.info(f"No pids found.")

    @staticmethod
    def find_and_kill_service(service_name):
        try:
            # find pid
            result = subprocess.run(
                ["ps", "-ef"], stdout=subprocess.PIPE, text=True
            )

            lines = result.stdout.splitlines()
            pids = []
            for line in lines:
                if service_name in line and "grep" not in line:
                    parts = line.split()
                    pid = parts[1]
                    pids.append(pid)

            if not pids:
                logger.info(f"No process found for {service_name}.")
            else:
                for pid in pids:
                    subprocess.run(["kill", pid])
                    logger.warning(f"Killed process {pid} for service {service_name}.")
        except Exception as e:
            logger.error(f"An error occurred: {e}")

    @staticmethod
    def check_service_status(url: str, data: dict, retries: int = 20, delay: int = 10):
        for i in range(retries):
            try:
                logger.info(f"Attempt {i + 1}: Checking service at {url} ...")
                response = requests.post(url,
                                         data=json.dumps(data),
                                         headers={'Content-Type': 'application/json'},
                                         timeout=30)
                if response.status_code == 200:
                    logger.info(f"Service at {url} is available !\n\n")
                    return True
                else:
                    logger.info(f"Service at {url} returned status code {response.status_code}.")
            except requests.exceptions.RequestException as e:
                logger.info(f"Attempt {i + 1}: An error occurred: {e}")

            time.sleep(delay)

        logger.info(f"Service at {url} is not available after {retries} retries.")
        return False

    @unittest.skipUnless(1 in test_level_list(), 'skip test in current test level')
    def test_run_task(self):
        # Prepare the config
        task_cfg = dict(
            eval_backend='OpenCompass',
            eval_config={'datasets': ['mmlu', 'ceval', 'ARC_c', 'gsm8k'],
                         'models': [
                             {'path': 'llama3-8b-instruct',
                              'openai_api_base': DEFAULT_CHAT_MODEL_URL,
                              'batch_size': 8},
                         ],
                         'work_dir': 'outputs/llama3_eval_result',
                         'reuse': None,  # string, `latest` or timestamp, e.g. `20230516_144254`, default to None
                         'limit': '[2:5]',  # string or int or float, e.g. `[2:5]`, 5, 5.0, default to None, it means run all examples
                         },
        )

        # Check the service status
        data = {'model': self.model_name, 'messages': [{'role': 'user', 'content': 'who are you?'}]}
        assert self.check_service_status(DEFAULT_CHAT_MODEL_URL, data=data), f'Failed to check service status: {DEFAULT_CHAT_MODEL_URL}'

        # Submit the task
        logger.info(f'Start to run UT with cfg: {task_cfg}')
        run_task(task_cfg=task_cfg)

        # Get the final report with summarizer
        report_list = Summarizer.get_report_from_cfg(task_cfg)
        logger.info(f'>>The report list:\n{report_list}')

        assert len(report_list) > 0, f'Failed to get report list: {report_list}'


if __name__ == '__main__':
    unittest.main()
12 changes: 12 additions & 0 deletions tests/test_run_all.py
@@ -0,0 +1,12 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import subprocess

if __name__ == '__main__':
    cmd = f'TEST_LEVEL_LIST=0,1 python3 -m unittest discover .'
    run_res = subprocess.run(cmd, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if run_res.returncode == 0:
        print(f'>>test_run_all stdout: {run_res.stdout}')
    else:
        print(f'>>test_run_all stderr: {run_res.stderr}')
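The TEST_LEVEL_LIST variable above ties into the skipUnless guards in the suites; a guess at the convention, assuming test_level_list simply parses that env var (its real implementation lives in llmuses.utils and is not shown in this diff):

import os

def test_level_list():
    # Hypothetical sketch: comma-separated levels, defaulting to level 0 only
    return [int(x) for x in os.environ.get('TEST_LEVEL_LIST', '0').split(',')]

# TEST_LEVEL_LIST=0,1 thus enables tests guarded by
# @unittest.skipUnless(1 in test_level_list(), ...)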
1 change: 1 addition & 0 deletions tests/vlm/__init__.py
@@ -0,0 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.