diff --git a/evalscope/perf/benchmark.py b/evalscope/perf/benchmark.py
index fecd815..0db50e6 100644
--- a/evalscope/perf/benchmark.py
+++ b/evalscope/perf/benchmark.py
@@ -157,7 +157,7 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
     while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
         try:
             # Attempt to get benchmark data from the queue with a timeout
-            benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=1)
+            benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
             benchmark_data_queue.task_done()
         except asyncio.TimeoutError:
             # If timeout, continue to the next iteration
diff --git a/evalscope/perf/plugin/api/openai_api.py b/evalscope/perf/plugin/api/openai_api.py
index c451313..e7629d2 100644
--- a/evalscope/perf/plugin/api/openai_api.py
+++ b/evalscope/perf/plugin/api/openai_api.py
@@ -96,19 +96,21 @@ def __compose_query_from_parameter(self, payload: Dict, param: Arguments):
 
     def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
         """Parser responses and return number of request and response tokens.
-        One response for non-stream, multiple responses for stream.
+        Only one response for non-stream, multiple responses for stream.
         """
-        delta_contents = {}
-        input_tokens = None
-        output_tokens = None
+        # when stream, the last response is the full usage
+        # when non-stream, the last response is the first response
+        last_response_js = json.loads(responses[-1])
+        if 'usage' in last_response_js and last_response_js['usage']:
+            input_tokens = last_response_js['usage']['prompt_tokens']
+            output_tokens = last_response_js['usage']['completion_tokens']
+            return input_tokens, output_tokens
+
+        # no usage information in the response, parse the response to get the tokens
+        delta_contents = {}
         for response in responses:
             js = json.loads(response)
-            if 'usage' in js and js['usage']:
-                input_tokens = js['usage']['prompt_tokens']
-                output_tokens = js['usage']['completion_tokens']
-                return input_tokens, output_tokens
-
             if 'object' in js:
                 self.__process_response_object(js, delta_contents)
             else:
diff --git a/tests/perf/test_perf.py b/tests/perf/test_perf.py
index 35e7cee..e672092 100644
--- a/tests/perf/test_perf.py
+++ b/tests/perf/test_perf.py
@@ -19,13 +19,13 @@ def tearDown(self) -> None:
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_perf(self):
         task_cfg = {
-            'url': 'http://127.0.0.1:8000/v1/chat/completions',
+            'url': 'http://127.0.0.1:8001/v1/chat/completions',
             'parallel': 1,
             'model': 'qwen2.5',
             'number': 15,
             'api': 'openai',
             'dataset': 'openqa',
-            'stream': True,
+            # 'stream': True,
             'debug': True,
         }
         run_perf_benchmark(task_cfg)
@@ -47,7 +47,7 @@ def test_run_perf_stream(self):
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_perf_speed_benchmark(self):
         task_cfg = {
-            'url': 'http://127.0.0.1:8801/v1/completions',
+            'url': 'http://127.0.0.1:8001/v1/completions',
             'parallel': 1,
             'model': 'qwen2.5',
             'api': 'openai',
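
For reviewers, here is a minimal standalone sketch of the usage-first token accounting that the `openai_api.py` hunk introduces: check the last response for a `usage` block and only fall back to accumulating delta contents when it is absent. The helper name `parse_token_usage` and the sample payloads are hypothetical, used only to illustrate the control flow under the assumption of OpenAI-style JSON responses.

```python
import json


def parse_token_usage(responses):
    """Prefer the 'usage' block of the last response; otherwise signal a fallback."""
    last = json.loads(responses[-1])
    if last.get('usage'):
        # Both non-stream responses and the final stream chunk report usage here.
        return last['usage']['prompt_tokens'], last['usage']['completion_tokens']
    # No usage reported: the real parser would tokenize the accumulated delta contents.
    return None, None


# Non-stream: a single response carrying usage directly (hypothetical payload).
non_stream = [json.dumps({'usage': {'prompt_tokens': 12, 'completion_tokens': 34}})]
# Stream: intermediate delta chunks without usage, final chunk with the full usage.
stream = [
    json.dumps({'choices': [{'delta': {'content': 'Hello'}}]}),
    json.dumps({'usage': {'prompt_tokens': 12, 'completion_tokens': 5}}),
]

print(parse_token_usage(non_stream))  # (12, 34)
print(parse_token_usage(stream))      # (12, 5)
```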