add more details when printing
Dawnfz-Lenfeng committed Aug 7, 2024
1 parent f4bf869 commit a6c62fa
Showing 2 changed files with 121 additions and 48 deletions.
benchmark/benchmark_latency.py (9 changes: 8 additions & 1 deletion)
@@ -28,8 +28,15 @@

 class LatencyBenchmarkRunner(BenchmarkRunner):
     async def _run(self):
-        for request in self.input_requests:
+        total_requests = len(self.input_requests)
+        for i, request in enumerate(self.input_requests):
             await self.send_request(request)
+            remaining = total_requests - (i + 1)
+            print(
+                f"\rProcessed {i + 1}/{total_requests} requests, {remaining} remaining.",
+                end="",
+            )
+        print("")


 def main(args: argparse.Namespace):
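
The progress line added above uses the carriage-return trick: "\r" moves the cursor back to the start of the line and end="" suppresses the newline, so each update overwrites the previous one in place, and the final print("") ends the line once the loop is done. A minimal standalone sketch of the same pattern (the loop bound and the sleep are invented for illustration):

import time

total = 5
for i in range(total):
    time.sleep(0.2)  # stand-in for awaiting a request
    remaining = total - (i + 1)
    # "\r" returns to the start of the line; end="" keeps updates on one line
    print(f"\rProcessed {i + 1}/{total} requests, {remaining} remaining.", end="")
print("")  # move to a fresh line after the loop
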
benchmark/benchmark_runner.py (160 changes: 113 additions & 47 deletions)
@@ -38,7 +38,7 @@ def remove_prefix(text: str, prefix: str) -> str:

 @dataclass
 class RequestOutput:
-    success: bool = True
+    success: bool = False
     prompt_len: int = 0
     completion_tokens: int = 0
     latency: float = 0.0
@@ -164,8 +164,8 @@ async def send_request(self, request: tuple, warming_up: bool = False):
                 output.completion_tokens = data["usage"]["completion_tokens"]
             else:
                 resp = await response.json()
-                latency = time.perf_counter() - st
-                output.latency = latency
+                output.latency = time.perf_counter() - st
+                output.success = True
                 output.completion_tokens = resp["usage"]["completion_tokens"]
         except Exception:
             output.success = False
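
Together with the RequestOutput change above (success now defaults to False), the idea is that a request only counts as successful once its response has actually been received and parsed; any exception leaves success at False. A rough standalone sketch of that flow, where fetch() is a hypothetical stand-in for the real HTTP call:

import time
from dataclasses import dataclass


@dataclass
class Output:
    success: bool = False  # pessimistic default, as in the diff
    latency: float = 0.0
    completion_tokens: int = 0


def measure(fetch) -> Output:
    out = Output()
    st = time.perf_counter()
    try:
        resp = fetch()  # hypothetical request call returning a parsed JSON dict
        out.latency = time.perf_counter() - st
        out.completion_tokens = resp["usage"]["completion_tokens"]
        out.success = True  # only set after the response was parsed
    except Exception:
        out.success = False
    return out


# e.g. measure(lambda: {"usage": {"completion_tokens": 42}}).success -> True
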
@@ -193,7 +193,8 @@ def print_stats(self):
                 total_input += output.prompt_len
                 if output.completion_tokens > 1:
                     tpots.append(
-                        (output.latency - output.ttft) / (output.completion_tokens - 1)
+                        (output.latency - output.ttft)
+                        / (output.completion_tokens - 1)
                     )
                 itls += output.itl
                 ttfts.append(output.ttft)
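
The TPOT value appended above deliberately excludes the first token: the decode time is the total latency minus the TTFT, spread over completion_tokens - 1 tokens, which is why the completion_tokens > 1 guard is needed. A small worked example with invented numbers:

latency = 2.0  # total request latency in seconds (invented)
ttft = 0.5  # time to first token in seconds (invented)
completion_tokens = 16

if completion_tokens > 1:
    tpot = (latency - ttft) / (completion_tokens - 1)  # (2.0 - 0.5) / 15 = 0.1 s
    print(f"TPOT: {tpot * 1000:.1f} ms")  # prints: TPOT: 100.0 ms
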
@@ -214,20 +215,20 @@ def print_stats(self):
             input_throughput = total_input / total_time if total_time > 0 else 0
             output_throughput = total_output / total_time if total_time > 0 else 0

-            mean_ttft = np.mean(ttfts) if ttfts else 0
-            median_ttft = np.median(ttfts) if ttfts else 0
-            std_ttft = np.std(ttfts) if ttfts else 0
-            p99_ttft = np.percentile(ttfts, 99) if ttfts else 0
+            mean_ttft = np.mean(ttfts) * 1000 if ttfts else 0
+            median_ttft = np.median(ttfts) * 1000 if ttfts else 0
+            std_ttft = np.std(ttfts) * 1000 if ttfts else 0
+            p99_ttft = np.percentile(ttfts, 99) * 1000 if ttfts else 0

-            mean_tpot = np.mean(tpots) if tpots else 0
-            median_tpot = np.median(tpots) if tpots else 0
-            std_tpot = np.std(tpots) if tpots else 0
-            p99_tpot = np.percentile(tpots, 99) if tpots else 0
+            mean_tpot = np.mean(tpots) * 1000 if tpots else 0
+            median_tpot = np.median(tpots) * 1000 if tpots else 0
+            std_tpot = np.std(tpots) * 1000 if tpots else 0
+            p99_tpot = np.percentile(tpots, 99) * 1000 if tpots else 0

-            mean_itl = np.mean(itls) if itls else 0
-            median_itl = np.median(itls) if itls else 0
-            std_itl = np.std(itls) if itls else 0
-            p99_itl = np.percentile(itls, 99) if itls else 0
+            mean_itl = np.mean(itls) * 1000 if itls else 0
+            median_itl = np.median(itls) * 1000 if itls else 0
+            std_itl = np.std(itls) * 1000 if itls else 0
+            p99_itl = np.percentile(itls, 99) * 1000 if itls else 0

             # Print benchmark results
             print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
@@ -236,7 +237,9 @@ def print_stats(self):
print("{:<40} {:<10}".format("Total input tokens:", total_input))
print("{:<40} {:<10}".format("Total generated tokens:", total_output))
print(
"{:<40} {:<10.2f}".format("Request throughput (req/s):", request_throughput)
"{:<40} {:<10.2f}".format(
"Request throughput (req/s):", request_throughput
)
)
print(
"{:<40} {:<10.2f}".format(
@@ -250,51 +253,114 @@ def print_stats(self):
             )

             print("{s:{c}^{n}}".format(s="Time to First Token", n=50, c="-"))
-            print("{:<40} {:<10.4f}".format("Mean TTFT (s):", mean_ttft))
-            print("{:<40} {:<10.4f}".format("Median TTFT (s):", median_ttft))
-            print("{:<40} {:<10.4f}".format("Std TTFT (s):", std_ttft))
-            print("{:<40} {:<10.4f}".format("P99 TTFT (s):", p99_ttft))
+            print("{:<40} {:<10.4f}".format("Mean TTFT (ms):", mean_ttft))
+            print("{:<40} {:<10.4f}".format("Median TTFT (ms):", median_ttft))
+            print("{:<40} {:<10.4f}".format("Std TTFT (ms):", std_ttft))
+            print("{:<40} {:<10.4f}".format("P99 TTFT (ms):", p99_ttft))

             print(
                 "{s:{c}^{n}}".format(
                     s="Time per Output Token (excl. 1st token)", n=50, c="-"
                 )
             )
-            print("{:<40} {:<10.4f}".format("Mean TPOT (s):", mean_tpot))
-            print("{:<40} {:<10.4f}".format("Median TPOT (s):", median_tpot))
-            print("{:<40} {:<10.4f}".format("Std TPOT (s):", std_tpot))
-            print("{:<40} {:<10.4f}".format("P99 TPOT (s):", p99_tpot))
+            print("{:<40} {:<10.4f}".format("Mean TPOT (ms):", mean_tpot))
+            print("{:<40} {:<10.4f}".format("Median TPOT (ms):", median_tpot))
+            print("{:<40} {:<10.4f}".format("Std TPOT (ms):", std_tpot))
+            print("{:<40} {:<10.4f}".format("P99 TPOT (ms):", p99_tpot))

             print("{s:{c}^{n}}".format(s="Inter-token Latency", n=50, c="-"))
-            print("{:<40} {:<10.4f}".format("Mean ITL (s):", mean_itl))
-            print("{:<40} {:<10.4f}".format("Median ITL (s):", median_itl))
-            print("{:<40} {:<10.4f}".format("Std ITL (s):", std_itl))
-            print("{:<40} {:<10.4f}".format("P99 ITL (s):", p99_itl))
+            print("{:<40} {:<10.4f}".format("Mean ITL (ms):", mean_itl))
+            print("{:<40} {:<10.4f}".format("Median ITL (ms):", median_itl))
+            print("{:<40} {:<10.4f}".format("Std ITL (ms):", std_itl))
+            print("{:<40} {:<10.4f}".format("P99 ITL (ms):", p99_itl))

             print("=" * 50)
         else:
print(f"Total time: {total_time:.2f} s")
print(f"Throughput: {len(self.outputs) / total_time:.2f} requests/s")
+            # Initialize variables for metrics
+            total_input = 0
+            completed = 0
+            actual_output_lens = []
+            latencies = []
+            per_token_latencies = []
+            per_output_token_latencies = []
+
+            for output in self.outputs:
+                if output.success:
+                    actual_output_lens.append(output.completion_tokens)
+                    total_input += output.prompt_len
+                    latencies.append(output.latency)
+                    per_token_latencies.append(
+                        output.latency / (output.prompt_len + output.completion_tokens)
+                    )
+                    if output.completion_tokens > 0:
+                        per_output_token_latencies.append(
+                            output.latency / output.completion_tokens
+                        )
+                    completed += 1
+                else:
+                    actual_output_lens.append(0)
+
+            if completed == 0:
+                warnings.warn(
+                    "All requests failed. This is likely due to a misconfiguration "
+                    "on the benchmark arguments.",
+                    stacklevel=2,
+                )
+
+            # Calculate statistics
+            total_output = sum(actual_output_lens)
+            request_throughput = len(self.outputs) / total_time if total_time > 0 else 0
+            input_throughput = total_input / total_time if total_time > 0 else 0
+            output_throughput = total_output / total_time if total_time > 0 else 0
+
-            # Compute the latency statistics.
-            avg_latency = np.mean([output.latency for output in self.outputs])
-            print(f"Average latency: {avg_latency:.2f} s")
-            avg_per_token_latency = np.mean(
-                [
-                    output.latency / (output.prompt_len + output.completion_tokens)
-                    for output in self.outputs
-                ]
+            mean_latency = np.mean(latencies) if latencies else 0
+            mean_per_token_latency = (
+                np.mean(per_token_latencies) if per_token_latencies else 0
             )
-            print(f"Average latency per token: {avg_per_token_latency:.2f} s")
-            avg_per_output_token_latency = np.mean(
-                [output.latency / output.completion_tokens for output in self.outputs]
+            mean_per_output_token_latency = (
+                np.mean(per_output_token_latencies) if per_output_token_latencies else 0
             )
-            print("Average latency per output token: " f"{avg_per_output_token_latency:.2f} s")
-            throughput = (
-                sum([output.completion_tokens for output in self.outputs])
-                / total_time
+
+            # Print benchmark results
+            print("{s:{c}^{n}}".format(s=" Benchmark Result ", n=50, c="="))
+            print("{:<40} {:<10}".format("Successful requests:", completed))
+            print("{:<40} {:<10.2f}".format("Benchmark duration (s):", total_time))
+            print("{:<40} {:<10}".format("Total input tokens:", total_input))
+            print("{:<40} {:<10}".format("Total generated tokens:", total_output))
+            print(
+                "{:<40} {:<10.2f}".format(
+                    "Request throughput (req/s):", request_throughput
+                )
             )
-            print(f"Throughput: {throughput} tokens/s")
+            print(
+                "{:<40} {:<10.2f}".format(
+                    "Input token throughput (tok/s):", input_throughput
+                )
+            )
+            print(
+                "{:<40} {:<10.2f}".format(
+                    "Output token throughput (tok/s):", output_throughput
+                )
+            )
+
+            print("{s:{c}^{n}}".format(s="Latency Statistics", n=50, c="-"))
+            print("{:<40} {:<10.4f}".format("Mean latency (s):", mean_latency))
+            print(
+                "{:<40} {:<10.4f}".format(
+                    "Mean latency per token (s):", mean_per_token_latency
+                )
+            )
+            print(
+                "{:<40} {:<10.4f}".format(
+                    "Mean latency per output token (s):", mean_per_output_token_latency
+                )
+            )
+
+            print("=" * 50)
+
+        print(f"Total time: {total_time:.2f} s")
+        print(f"Throughput: {len(self.outputs) / total_time:.2f} requests/s")


 class ConcurrentBenchmarkRunner(BenchmarkRunner):
     def __init__(
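
Both branches of print_stats now build their tables from the same two format() patterns: "{:<40} {:<10}" left-aligns the label in a 40-character column with the value after it, and "{s:{c}^{n}}" centers a title in an n-character banner padded with the fill character c. A short sketch of the resulting layout, using invented values:

print("{s:{c}^{n}}".format(s=" Benchmark Result ", n=50, c="="))  # centered banner, 50 chars wide, padded with '='
print("{:<40} {:<10}".format("Successful requests:", 128))  # label left-aligned to 40 chars, then the value
print("{:<40} {:<10.2f}".format("Request throughput (req/s):", 12.34))
print("=" * 50)
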
