diff --git a/docs/en/user_guides/stress_test/quick_start.md b/docs/en/user_guides/stress_test/quick_start.md index a0e1807..1b20993 100644 --- a/docs/en/user_guides/stress_test/quick_start.md +++ b/docs/en/user_guides/stress_test/quick_start.md @@ -63,74 +63,83 @@ Parameter Descriptions: ### Output Results ```text Benchmarking summary: -+----------------------------------------------+------------------------------------------------+ -| key | Value | -+==============================================+================================================+ -| Time taken for tests (seconds) | 7.539 | -+----------------------------------------------+------------------------------------------------+ -| Number of concurrency | 1 | -+----------------------------------------------+------------------------------------------------+ -| Total requests | 15 | -+----------------------------------------------+------------------------------------------------+ -| Succeeded requests | 15 | -+----------------------------------------------+------------------------------------------------+ -| Failed requests | 0 | -+----------------------------------------------+------------------------------------------------+ -| Average QPS | 1.99 | -+----------------------------------------------+------------------------------------------------+ -| Average latency | 0.492 | -+----------------------------------------------+------------------------------------------------+ -| Average time to first token | 0.026 | -+----------------------------------------------+------------------------------------------------+ -| Throughput (average output tokens per second) | 334.006 | -+----------------------------------------------+------------------------------------------------+ -| Average time per output token | 0.00299 | -+----------------------------------------------+------------------------------------------------+ -| Average package per request | 167.867 | 
-+----------------------------------------------+------------------------------------------------+ -| Average package latency | 0.003 | -+----------------------------------------------+------------------------------------------------+ -| Average input tokens per request | 40.133 | -+----------------------------------------------+------------------------------------------------+ -| Average output tokens per request | 167.867 | -+----------------------------------------------+------------------------------------------------+ -| Expected number of requests | 15 | -+----------------------------------------------+------------------------------------------------+ -| Result DB path | ./outputs/qwen2.5_benchmark_20241107_201413.db | -+----------------------------------------------+------------------------------------------------+ ++-----------------------------------+-----------------------------------------------------+ +| Key | Value | ++===================================+=====================================================+ +| Time taken for tests (s) | 10.739 | ++-----------------------------------+-----------------------------------------------------+ +| Number of concurrency | 1 | ++-----------------------------------+-----------------------------------------------------+ +| Total requests | 15 | ++-----------------------------------+-----------------------------------------------------+ +| Succeed requests | 15 | ++-----------------------------------+-----------------------------------------------------+ +| Failed requests | 0 | ++-----------------------------------+-----------------------------------------------------+ +| Throughput(average tokens/s) | 324.059 | ++-----------------------------------+-----------------------------------------------------+ +| Average QPS | 1.397 | ++-----------------------------------+-----------------------------------------------------+ +| Average latency (s) | 0.696 | 
++-----------------------------------+-----------------------------------------------------+ +| Average time to first token (s) | 0.029 | ++-----------------------------------+-----------------------------------------------------+ +| Average time per output token (s) | 0.00309 | ++-----------------------------------+-----------------------------------------------------+ +| Average input tokens per request | 50.133 | ++-----------------------------------+-----------------------------------------------------+ +| Average output tokens per request | 232.0 | ++-----------------------------------+-----------------------------------------------------+ +| Average package latency (s) | 0.003 | ++-----------------------------------+-----------------------------------------------------+ +| Average package per request | 232.0 | ++-----------------------------------+-----------------------------------------------------+ +| Expected number of requests | 15 | ++-----------------------------------+-----------------------------------------------------+ +| Result DB path | ./outputs/20241216_194204/qwen2.5/benchmark_data.db | ++-----------------------------------+-----------------------------------------------------+ Percentile results: -+------------+---------------------+---------+ -| Percentile | First Chunk Latency | Latency | -+------------+---------------------+---------+ -| 10% | 0.0178 | 0.1577 | -| 25% | 0.0183 | 0.2358 | -| 50% | 0.0199 | 0.4311 | -| 66% | 0.0218 | 0.6317 | -| 75% | 0.0429 | 0.7121 | -| 80% | 0.0432 | 0.7957 | -| 90% | 0.0432 | 0.9153 | -| 95% | 0.0433 | 0.9897 | -| 98% | 0.0433 | 0.9897 | -| 99% | 0.0433 | 0.9897 | -+------------+---------------------+---------+ ++------------+----------+----------+-------------+--------------+---------------+----------------------+ +| Percentile | TTFT (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Throughput(tokens/s) | 
++------------+----------+----------+-------------+--------------+---------------+----------------------+ +| 10% | 0.0202 | 0.0027 | 0.1846 | 41 | 50 | 270.8324 | +| 25% | 0.0209 | 0.0028 | 0.2861 | 44 | 83 | 290.0714 | +| 50% | 0.0233 | 0.0028 | 0.7293 | 49 | 250 | 335.644 | +| 66% | 0.0267 | 0.0029 | 0.9052 | 50 | 308 | 340.2603 | +| 75% | 0.0437 | 0.0029 | 0.9683 | 53 | 325 | 341.947 | +| 80% | 0.0438 | 0.003 | 1.0799 | 58 | 376 | 342.7985 | +| 90% | 0.0439 | 0.0032 | 1.2474 | 62 | 424 | 345.5268 | +| 95% | 0.0463 | 0.0033 | 1.3038 | 66 | 431 | 348.1648 | +| 98% | 0.0463 | 0.0035 | 1.3038 | 66 | 431 | 348.1648 | +| 99% | 0.0463 | 0.0037 | 1.3038 | 66 | 431 | 348.1648 | ++------------+----------+----------+-------------+--------------+---------------+----------------------+ ``` ### Metric Descriptions -| **Metric** | **Description** | **Value** | -|------------------------------------------|---------------------------------|-----------------| -| Total requests | Total number of requests | 15 | -| Succeeded requests | Number of successful requests | 15 | -| Failed requests | Number of failed requests | 0 | -| Average QPS | Average requests per second | 1.99 | -| Average latency | Average latency for all requests| 0.492 | -| Throughput (average output tokens per second) | Average output tokens per second| 334.006 | -| Average time to first token | Average delay for the first token| 0.026 | -| Average input tokens per request | Average number of input tokens per request | 40.133 | -| Average output tokens per request | Average number of output tokens per request | 167.867 | -| Average time per output token | Average time for each output token | 0.00299 | -| Average package per request | Average number of packages per request | 167.867 | -| Average package latency | Average latency for each package| 0.003 | -| Percentile of time to first token (p10, ..., p99) | Percentiles for the first token latency | | -| Percentile of request latency (p10, ..., p99) | Percentiles 
for request latency | | \ No newline at end of file +| Metric | Description | +|--------------------------------------|-------------------------------------------------------------------------------------------| +| Time taken for tests (s) | Time used for tests (seconds) | +| Number of concurrency | Number of concurrent requests | +| Total requests | Total number of requests | +| Succeed requests | Number of successful requests | +| Failed requests | Number of failed requests | +| Throughput(average tokens/s) | Throughput (average number of tokens processed per second) | +| Average QPS | Average number of queries per second (Queries Per Second) | +| Average latency (s) | Average latency time (seconds) | +| Average time to first token (s) | Average time to first token (seconds) | +| Average time per output token (s) | Average time per output token (seconds) | +| Average input tokens per request | Average number of input tokens per request | +| Average output tokens per request | Average number of output tokens per request | +| Average package latency (s) | Average package latency time (seconds) | +| Average package per request | Average number of packages per request | +| Expected number of requests | Expected number of requests | +| Result DB path | Result database path | +| **Percentile** | **Data is divided into 100 equal parts, and the nth percentile indicates that n% of the data points are below this value** | +| TTFT (s) | Time to First Token, the time to generate the first token | +| TPOT (s) | Time Per Output Token, the time to generate each output token | +| Latency (s) | Latency time, the time between request and response | +| Input tokens | Number of input tokens | +| Output tokens | Number of output tokens | +| Throughput(tokens/s) | Throughput, the number of tokens processed per second | \ No newline at end of file diff --git a/docs/zh/user_guides/stress_test/quick_start.md b/docs/zh/user_guides/stress_test/quick_start.md index 97d9fc3..7c53516 100644 
--- a/docs/zh/user_guides/stress_test/quick_start.md +++ b/docs/zh/user_guides/stress_test/quick_start.md @@ -64,74 +64,83 @@ run_perf_benchmark(task_cfg) ### 输出结果 ```text Benchmarking summary: -+----------------------------------------------+------------------------------------------------+ -| key | Value | -+==============================================+================================================+ -| Time taken for tests (senconds) | 7.539 | -+----------------------------------------------+------------------------------------------------+ -| Number of concurrency | 1 | -+----------------------------------------------+------------------------------------------------+ -| Total requests | 15 | -+----------------------------------------------+------------------------------------------------+ -| Succeed requests | 15 | -+----------------------------------------------+------------------------------------------------+ -| Failed requests | 0 | -+----------------------------------------------+------------------------------------------------+ -| Average QPS | 1.99 | -+----------------------------------------------+------------------------------------------------+ -| Average latency | 0.492 | -+----------------------------------------------+------------------------------------------------+ -| Average time to first token | 0.026 | -+----------------------------------------------+------------------------------------------------+ -| Throughput(average output tokens per second) | 334.006 | -+----------------------------------------------+------------------------------------------------+ -| Average time per output token | 0.00299 | -+----------------------------------------------+------------------------------------------------+ -| Average package per request | 167.867 | -+----------------------------------------------+------------------------------------------------+ -| Average package latency | 0.003 | 
-+----------------------------------------------+------------------------------------------------+ -| Average input tokens per request | 40.133 | -+----------------------------------------------+------------------------------------------------+ -| Average output tokens per request | 167.867 | -+----------------------------------------------+------------------------------------------------+ -| Expected number of requests | 15 | -+----------------------------------------------+------------------------------------------------+ -| Result DB path | ./outputs/qwen2.5_benchmark_20241107_201413.db | -+----------------------------------------------+------------------------------------------------+ ++-----------------------------------+-----------------------------------------------------+ +| Key | Value | ++===================================+=====================================================+ +| Time taken for tests (s) | 10.739 | ++-----------------------------------+-----------------------------------------------------+ +| Number of concurrency | 1 | ++-----------------------------------+-----------------------------------------------------+ +| Total requests | 15 | ++-----------------------------------+-----------------------------------------------------+ +| Succeed requests | 15 | ++-----------------------------------+-----------------------------------------------------+ +| Failed requests | 0 | ++-----------------------------------+-----------------------------------------------------+ +| Throughput(average tokens/s) | 324.059 | ++-----------------------------------+-----------------------------------------------------+ +| Average QPS | 1.397 | ++-----------------------------------+-----------------------------------------------------+ +| Average latency (s) | 0.696 | ++-----------------------------------+-----------------------------------------------------+ +| Average time to first token (s) | 0.029 | 
++-----------------------------------+-----------------------------------------------------+ +| Average time per output token (s) | 0.00309 | ++-----------------------------------+-----------------------------------------------------+ +| Average input tokens per request | 50.133 | ++-----------------------------------+-----------------------------------------------------+ +| Average output tokens per request | 232.0 | ++-----------------------------------+-----------------------------------------------------+ +| Average package latency (s) | 0.003 | ++-----------------------------------+-----------------------------------------------------+ +| Average package per request | 232.0 | ++-----------------------------------+-----------------------------------------------------+ +| Expected number of requests | 15 | ++-----------------------------------+-----------------------------------------------------+ +| Result DB path | ./outputs/20241216_194204/qwen2.5/benchmark_data.db | ++-----------------------------------+-----------------------------------------------------+ Percentile results: -+------------+---------------------+---------+ -| Percentile | First Chunk Latency | Latency | -+------------+---------------------+---------+ -| 10% | 0.0178 | 0.1577 | -| 25% | 0.0183 | 0.2358 | -| 50% | 0.0199 | 0.4311 | -| 66% | 0.0218 | 0.6317 | -| 75% | 0.0429 | 0.7121 | -| 80% | 0.0432 | 0.7957 | -| 90% | 0.0432 | 0.9153 | -| 95% | 0.0433 | 0.9897 | -| 98% | 0.0433 | 0.9897 | -| 99% | 0.0433 | 0.9897 | -+------------+---------------------+---------+ ++------------+----------+----------+-------------+--------------+---------------+----------------------+ +| Percentile | TTFT (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Throughput(tokens/s) | ++------------+----------+----------+-------------+--------------+---------------+----------------------+ +| 10% | 0.0202 | 0.0027 | 0.1846 | 41 | 50 | 270.8324 | +| 25% | 0.0209 | 0.0028 | 0.2861 | 44 | 83 | 290.0714 | +| 
50% | 0.0233 | 0.0028 | 0.7293 | 49 | 250 | 335.644 | +| 66% | 0.0267 | 0.0029 | 0.9052 | 50 | 308 | 340.2603 | +| 75% | 0.0437 | 0.0029 | 0.9683 | 53 | 325 | 341.947 | +| 80% | 0.0438 | 0.003 | 1.0799 | 58 | 376 | 342.7985 | +| 90% | 0.0439 | 0.0032 | 1.2474 | 62 | 424 | 345.5268 | +| 95% | 0.0463 | 0.0033 | 1.3038 | 66 | 431 | 348.1648 | +| 98% | 0.0463 | 0.0035 | 1.3038 | 66 | 431 | 348.1648 | +| 99% | 0.0463 | 0.0037 | 1.3038 | 66 | 431 | 348.1648 | ++------------+----------+----------+-------------+--------------+---------------+----------------------+ ``` ### 指标说明 -| **指标** | **说明** | **数值** | -|------------------------------------------|-----------------------------|-----------------| -| Total requests | 总请求数 | 15 | -| Succeeded requests | 成功请求数 | 15 | -| Failed requests | 失败请求数 | 0 | -| Average QPS | 每秒平均请求数 | 1.99 | -| Average latency | 所有请求的平均延迟 | 0.492 | -| Throughput(average output tokens per second) | 每秒输出token数量 | 334.006 | -| Average time to first token | 首token的平均延时 | 0.026 | -| Average input tokens per request | 每个请求的平均输入token数量 | 40.133 | -| Average output tokens per request | 每个请求的平均输出token数量 | 167.867 | -| Average time per output token | 输出每个token的平均时间 | 0.00299 | -| Average package per request | 每个请求的平均包数 | 167.867 | -| Average package latency | 每个包的平均延迟 | 0.003 | -| Percentile of time to first token (p10, ..., p99) | 首token延时百分位 | | -| Percentile of request latency (p10, ..., p99) | 请求延迟的百分位 | | +| 指标 | 说明 | +|------------------------------------|---------------------------------------------------------------------------------------| +| Time taken for tests (s) | 测试所用的时间(秒) | +| Number of concurrency | 并发数量 | +| Total requests | 总请求数 | +| Succeed requests | 成功的请求数 | +| Failed requests | 失败的请求数 | +| Throughput(average tokens/s) | 吞吐量(平均每秒处理的token数) | +| Average QPS | 平均每秒请求数(Queries Per Second) | +| Average latency (s) | 平均延迟时间(秒) | +| Average time to first token (s) | 平均首次token时间(秒) | +| Average time per output token (s) | 平均每个输出token的时间(秒) | 
+| Average input tokens per request | 每个请求的平均输入token数 | +| Average output tokens per request | 每个请求的平均输出token数 | +| Average package latency (s) | 平均包延迟时间(秒) | +| Average package per request | 每个请求的平均包数 | +| Expected number of requests | 预期的请求数 | +| Result DB path | 结果数据库路径 | +| **Percentile** | **数据被分为100个相等部分,第n百分位表示n%的数据点在此值之下** | +| TTFT (s) | Time to First Token,首次生成token的时间 | +| TPOT (s) | Time Per Output Token,生成每个输出token的时间 | +| Latency (s) | 延迟时间,指请求到响应之间的时间 | +| Input tokens | 输入的token数量 | +| Output tokens | 输出的token数量 | +| Throughput(tokens/s) | 吞吐量,指每秒处理token的数量 | \ No newline at end of file diff --git a/evalscope/perf/utils/benchmark_util.py b/evalscope/perf/utils/benchmark_util.py index 50dfaf7..60c1676 100644 --- a/evalscope/perf/utils/benchmark_util.py +++ b/evalscope/perf/utils/benchmark_util.py @@ -116,19 +116,19 @@ def calculate_averages(self): def create_message(self, default_ndigits=3): message = { - 'Time taken for tests (senconds)': round(self.total_time, default_ndigits), + 'Time taken for tests (s)': round(self.total_time, default_ndigits), 'Number of concurrency': self.concurrency, 'Total requests': int(self.n_total_queries), 'Succeed requests': self.n_succeed_queries, 'Failed requests': self.n_failed_queries, + 'Throughput(average tokens/s)': round(self.avg_token_per_seconds, default_ndigits), 'Average QPS': round(self.qps, default_ndigits), 'Average latency (s)': round(self.avg_latency, default_ndigits), 'Average time to first token (s)': round(self.avg_first_chunk_latency, default_ndigits), 'Average time per output token (s)': round(self.avg_time_per_token, 5), - 'Average package latency (s)': round(self.avg_chunk_time, default_ndigits), - 'Average package per request': round(self.n_avg_chunks, default_ndigits), - 'Throughput(average output tokens per second)': round(self.avg_token_per_seconds, default_ndigits), 'Average input tokens per request': round(self.avg_prompt_tokens, default_ndigits), 'Average output tokens per request': 
round(self.avg_completion_tokens, default_ndigits), + 'Average package latency (s)': round(self.avg_chunk_time, default_ndigits), + 'Average package per request': round(self.n_avg_chunks, default_ndigits), } return message diff --git a/evalscope/perf/utils/db_util.py b/evalscope/perf/utils/db_util.py index f96d982..521f54f 100644 --- a/evalscope/perf/utils/db_util.py +++ b/evalscope/perf/utils/db_util.py @@ -6,6 +6,7 @@ import sys from datetime import datetime from tabulate import tabulate +from typing import Dict, List from evalscope.perf.arguments import Arguments from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics @@ -107,46 +108,87 @@ def get_result_db_path(args: Arguments): return result_db_path -def get_percentile_results(result_db_path: str): +def calculate_percentiles(data: List[float], percentiles: List[int]) -> Dict[int, float]: + """ + Calculate the percentiles for a specific list of data. - def percentile_results(rows, index, percentiles): - results = {} - n_success_queries = len(rows) - for percentile in percentiles: + :param data: List of values for a specific metric. + :param percentiles: List of percentiles to calculate. + :return: Dictionary of calculated percentiles. + """ + results = {} + n_success_queries = len(data) + data.sort() + for percentile in percentiles: + try: idx = int(n_success_queries * percentile / 100) - row = rows[idx] - value = row[index] if row[index] is not None else float('inf') + value = data[idx] if data[idx] is not None else float('nan') results[percentile] = round(value, 4) - return results + except IndexError: + results[percentile] = float('nan') + return results + + +def get_percentile_results(result_db_path: str) -> Dict[str, List[float]]: + """ + Compute and return quantiles for various metrics from the database results. + + :param result_db_path: Path to the SQLite database file. + :return: Dictionary of percentiles for various metrics. 
+ """ + + def inter_token_latencies(chunk_times_json: str) -> List[float]: + try: + chunk_times = json.loads(chunk_times_json) + return [t2 - t1 for t1, t2 in zip(chunk_times[:-1], chunk_times[1:])] + except (json.JSONDecodeError, TypeError) as e: + logger.error(f'Error parsing chunk times: {e}') + return [] query_sql = ('SELECT start_time, chunk_times, success, completed_time, latency, first_chunk_latency, ' 'n_chunks, chunk_time, prompt_tokens, completion_tokens ' - 'FROM result WHERE success=1 ORDER BY first_chunk_latency ASC') + 'FROM result WHERE success=1') + percentiles = [10, 25, 50, 66, 75, 80, 90, 95, 98, 99] with sqlite3.connect(result_db_path) as con: rows = con.execute(query_sql).fetchall() - if len(rows) <= len(percentiles): + if len(rows) < len(percentiles): logger.info('Too little data to calculate quantiles!') return {} - # Calculate percentiles for first chunk latency and latency - first_chunk_latency_index = 5 - latency_index = 4 + # Define index variables for columns + CHUNK_TIMES_INDEX = 1 + LATENCY_INDEX = 4 + FIRST_CHUNK_LATENCY_INDEX = 5 + PROMPT_TOKENS_INDEX = 8 + COMPLETION_TOKENS_INDEX = 9 + + # Prepare data for each metric + inter_token_latencies_all = [] + for row in rows: + inter_token_latencies_all.extend(inter_token_latencies(row[CHUNK_TIMES_INDEX])) + + metrics = { + 'TTFT (s)': [row[FIRST_CHUNK_LATENCY_INDEX] for row in rows], + 'TPOT (s)': + inter_token_latencies_all, + 'Latency (s)': [row[LATENCY_INDEX] for row in rows], + 'Input tokens': [row[PROMPT_TOKENS_INDEX] for row in rows], + 'Output tokens': [row[COMPLETION_TOKENS_INDEX] for row in rows], + 'Throughput(tokens/s)': + [(row[COMPLETION_TOKENS_INDEX] / row[LATENCY_INDEX]) if row[LATENCY_INDEX] > 0 else float('nan') + for row in rows] + } - first_chunk_latency_results = percentile_results(rows, first_chunk_latency_index, percentiles) - rows.sort(key=lambda x: x[latency_index]) - latency_results = percentile_results(rows, latency_index, percentiles) + # Calculate percentiles 
for each metric + results = {'Percentile': [f'{p}%' for p in percentiles]} + for metric_name, data in metrics.items(): + metric_percentiles = calculate_percentiles(data, percentiles) + results[metric_name] = [metric_percentiles[p] for p in percentiles] - # Prepare data for tabulation - # Throughput = number of output tokens per second across all concurrency requests - # Latency = TTFT + (TPOT) * (the number of tokens to be generated) - return { - 'Percentile': [f'{p}%' for p in percentiles], - 'Time to first token (s)': [first_chunk_latency_results[p] for p in percentiles], - 'Latency (s)': [latency_results[p] for p in percentiles] - } + return results def summary_result(args: Arguments, metrics: BenchmarkMetrics, expected_number_of_queries: int, result_db_path: str): diff --git a/tests/perf/test_perf.py b/tests/perf/test_perf.py index b59674d..35e7cee 100644 --- a/tests/perf/test_perf.py +++ b/tests/perf/test_perf.py @@ -25,6 +25,7 @@ def test_run_perf(self): 'number': 15, 'api': 'openai', 'dataset': 'openqa', + 'stream': True, 'debug': True, } run_perf_benchmark(task_cfg)