-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_benchmark_read_chunks.py
executable file
·121 lines (106 loc) · 4.07 KB
/
run_benchmark_read_chunks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python3
import subprocess
import re
import pandas as pd
import math
from _run_benchmark import clear_cache, time_args
# Maps each implementation name to the command prefix used to launch its
# benchmark. The Rust binaries take a dashed flag; the Python wrapper
# scripts take the underscored spelling of the same flag.
implementation_to_args = {}
for name, command in [
    ("zarrs_rust", "zarrs_benchmark_read_sync"),
    ("zarrs_rust_async_as_sync", "zarrs_benchmark_read_async_as_sync"),
    ("zarrs_rust_async", "zarrs_benchmark_read_async"),
]:
    implementation_to_args[name] = [command, "--concurrent-chunks"]
for name in [
    "tensorstore_python",
    "zarr_python",
    "zarr_dask_python",
    "zarrs_python",
    "zarrs_dask_python",
]:
    # Every Python implementation follows the ./scripts/<name>_benchmark_read.py pattern.
    implementation_to_args[name] = [
        f"./scripts/{name}_benchmark_read.py",
        "--concurrent_chunks",
    ]
# Implementations exercised in this run.
# Also available but disabled: "zarrs_rust_async_as_sync", "zarrs_rust_async".
implementations = (
    "zarrs_rust tensorstore_python zarr_python "
    "zarrs_python zarr_dask_python zarrs_dask_python"
).split()
# Zarr images to benchmark: plain, compressed, and compressed + sharded.
images = [
    "data/benchmark.zarr",
    "data/benchmark_compress.zarr",
    "data/benchmark_compress_shard.zarr",
]
# Concurrency levels (number of chunks read in parallel) to sweep.
concurrent_chunks_list = [1, 2, 4, 8, 16, 32]
# Row index for the results table: one (image, concurrency) pair per row,
# in the same nested order as the benchmark loop below.
index = [
    (image, concurrent_chunks)
    for image in images
    for concurrent_chunks in concurrent_chunks_list
]
# Run every implementation against every (image, concurrency) combination,
# parsing wall-clock time and peak memory from GNU time's report on stderr.
# A failed run (non-zero exit or unparsable output) is recorded as NaN.
#
# Fixes vs. the previous version:
# - stderr is decoded instead of regex-matching the repr of a bytes object
#   (the old patterns matched a literal backslash-n inside str(b"...")).
# - The wall-clock pattern now accepts the "h:mm:ss" form; previously any
#   run of an hour or more failed to parse and was silently recorded NaN.
rows = []
for image in images:
    for concurrent_chunks in concurrent_chunks_list:
        wall_times = []
        memory_usages = []
        for implementation in implementations:
            print(implementation, image, concurrent_chunks)
            args = (
                time_args()
                + implementation_to_args[implementation]
                + [str(concurrent_chunks), image]
            )
            clear_cache()  # drop the filesystem cache so reads are cold
            result = subprocess.run(args, capture_output=True)
            std_err = result.stderr.decode(errors="replace")
            # GNU time prints "h:mm:ss" or "m:ss"; the hours field is optional.
            wall_time = re.search(
                r"Elapsed \(wall clock\) time \(h:mm:ss or m:ss\): (?:(\d+):)?(\d+):([\d.]+)",
                std_err,
            )
            memory_usage = re.search(
                r"Maximum resident set size \(kbytes\): (\d+)", std_err
            )
            if wall_time and memory_usage and result.returncode == 0:
                h = int(wall_time.group(1)) if wall_time.group(1) else 0
                m = int(wall_time.group(2))
                s = float(wall_time.group(3))
                wall_time_s = h * 3600 + m * 60 + s
                memory_usage_kb = int(memory_usage.group(1))
                memory_usage_gb = float(memory_usage_kb) / 1.0e6
                print(wall_time_s, memory_usage_gb)
                wall_times.append(wall_time_s)
                memory_usages.append(memory_usage_gb)
            else:
                # Benchmark failed or its output was unexpected: mark missing.
                wall_times.append(math.nan)
                memory_usages.append(math.nan)
        # One table row per (image, concurrency): all times, then all memory.
        rows.append(wall_times + memory_usages)
# Build the two parallel sets of column labels:
# - columns_pandas: (metric, implementation) tuples for a pandas MultiIndex.
# - columns_markdown: "<br>"-separated header strings for the markdown table,
#   where only the first column of each metric carries the metric name.
columns_pandas = []
columns_markdown = []
for metric in ["Time (s)", "Memory (GB)"]:
    previous_implementation = ""
    for position, implementation in enumerate(implementations):
        parts = []
        # The metric label appears only on the metric's first column.
        if position == 0:
            parts.append(metric)
            parts.append("<br>")
        # Repeat the implementation name only when it changes.
        if implementation != previous_implementation:
            previous_implementation = implementation
            parts.append(implementation.replace("_", "<br>"))
        columns_markdown.append("".join(parts))
        columns_pandas.append((metric, implementation))
# Assemble the results table: rows indexed by (image, concurrency),
# columns by (metric, implementation).
row_index = pd.MultiIndex.from_tuples(index, names=["Image", "Concurrency"])
column_index = pd.MultiIndex.from_tuples(
    columns_pandas, names=["Metric", "Implementation"]
)
df = pd.DataFrame(rows, index=row_index, columns=column_index)
# Print and save as CSV
print(df)
print()
df.to_csv("measurements/benchmark_read_chunks.csv")
# Print and save markdown
df_markdown = df.copy()
df_markdown.columns = columns_markdown
df_markdown.reset_index(inplace=True)
print(df_markdown.to_markdown(index=False, floatfmt=".02f"))
df_markdown.to_markdown("measurements/benchmark_read_chunks.md", floatfmt=".02f")