New Benchmark Tests for two alt. implementations

The chunked_parallel_special_metric function has been given two alternative implementations: one that uses an if clause to adjust the loop bounds for symmetrical matrices (skipping redundant computations), and another that performs the full iteration regardless. Benchmarks have been added using the pytest-benchmark plugin (which will be added to CI in the next commit).
lmcinnes · Feb 17, 2021 · 7c67081 · 7c67081
1 parent b4b47de
commit 7c67081
Showing 1 changed file with 208 additions and 0 deletions.
diff --git a/umap/tests/test_chunked_parallel_spatial_metric.py b/umap/tests/test_chunked_parallel_spatial_metric.py
@@ -4,6 +4,10 @@
 from numpy.testing import assert_array_equal
 from umap import distances as dist
 
+# --------
+# Fixtures
+# --------
+
 
 @pytest.fixture(scope="function")
 def stashed_chunked_implementation():
@@ -55,6 +59,114 @@ def stashed_chunked_parallel_special_metric(
     return stashed_chunked_parallel_special_metric
 
 
+@pytest.fixture(scope="function")
+def chunked_parallel_if_clause():
+    """Provide the chunked distance kernel that skips symmetrical work.
+
+    The returned numba-compiled function fills a ``(row_size, col_size)``
+    float32 matrix with ``metric(X[i], XX[j])``, where ``XX`` is ``Y`` when
+    given and ``X`` otherwise.  When ``Y`` is None the matrix is treated as
+    symmetrical: only pairs with ``j > i`` are evaluated, each value is
+    mirrored into ``result[j, i]``, and the diagonal keeps its initial zero.
+    """
+
+    @numba.njit(parallel=True, nogil=True)
+    def chunked_parallel_special_metric(
+        X, Y=None, metric=dist.named_distances["hellinger"], chunk_size=16
+    ):
+        if Y is None:
+            XX = X
+            row_size = col_size = X.shape[0]
+            symmetrical = True
+        else:
+            XX = Y
+            row_size = X.shape[0]
+            col_size = Y.shape[0]
+            symmetrical = False
+
+        result = np.zeros((row_size, col_size), dtype=np.float32)
+        # One extra chunk covers the remainder rows; it is empty when
+        # row_size is an exact multiple of chunk_size.
+        n_row_chunks = (row_size // chunk_size) + 1
+        for chunk_idx in numba.prange(n_row_chunks):
+            n = chunk_idx * chunk_size
+            chunk_end_n = min(n + chunk_size, row_size)
+            # Symmetrical case: column chunks left of the diagonal chunk are
+            # filled through mirroring, so start at the diagonal chunk.
+            m_start = 0 if not symmetrical else n
+            for m in range(m_start, col_size, chunk_size):
+                chunk_end_m = min(m + chunk_size, col_size)
+                for i in range(n, chunk_end_n):
+                    # Symmetrical case: only columns right of the diagonal are
+                    # needed.  Clamp to the current column chunk so columns
+                    # already handled by a previous chunk are not evaluated
+                    # a second time.
+                    j_start = max(m, i + 1) if symmetrical else m
+                    for j in range(j_start, chunk_end_m):
+                        d = metric(X[i], XX[j])
+                        result[i, j] = d
+                        if symmetrical:
+                            result[j, i] = d
+        return result
+
+    return chunked_parallel_special_metric
+
+
+@pytest.fixture(scope="function")
+def chunked_parallel_full_iterations():
+    """Provide the chunked distance kernel that evaluates every (i, j) pair.
+
+    The returned numba-compiled function fills a ``(row_size, col_size)``
+    float32 matrix with ``metric(X[i], XX[j])``, where ``XX`` is ``Y`` when
+    given and ``X`` otherwise.  No symmetry shortcut is taken: all column
+    chunks are visited for every row chunk, whether or not ``Y`` is None.
+    """
+
+    @numba.njit(parallel=True, nogil=True)
+    def chunked_parallel_special_metric(
+        X, Y=None, metric=dist.named_distances["hellinger"], chunk_size=16
+    ):
+        if Y is None:
+            XX = X
+            row_size = col_size = X.shape[0]
+        else:
+            XX = Y
+            row_size, col_size = X.shape[0], Y.shape[0]
+
+        result = np.zeros((row_size, col_size), dtype=np.float32)
+        # One extra chunk covers the remainder rows; it is empty when
+        # row_size is an exact multiple of chunk_size.
+        n_row_chunks = (row_size // chunk_size) + 1
+        for chunk_idx in numba.prange(n_row_chunks):
+            n = chunk_idx * chunk_size
+            chunk_end_n = min(n + chunk_size, row_size)
+            # Always start at column 0.  Nothing is mirrored into
+            # result[j, i] here, so starting at the diagonal chunk would
+            # leave the blocks below the diagonal at zero whenever the input
+            # has more rows than chunk_size.
+            for m in range(0, col_size, chunk_size):
+                chunk_end_m = min(m + chunk_size, col_size)
+                for i in range(n, chunk_end_n):
+                    for j in range(m, chunk_end_m):
+                        d = metric(X[i], XX[j])
+                        result[i, j] = d
+        return result
+
+    return chunked_parallel_special_metric
+
+
+@pytest.fixture(scope="function")
+def benchmark_data(request):
+    """Build a random, non-negative float32 array for the benchmarks.
+
+    The array shape is read from ``request.param``, so the fixture has to be
+    parametrized indirectly by the test that uses it.
+    """
+    gaussian_sample = np.random.randn(*request.param)
+    return np.abs(gaussian_sample.astype(np.float32))
+
+
+# ---------------------------------------------------------------
+
+# Uncomment this to skip the tests
+# @pytest.mark.skip(reason="Focus on benchmark for now. This passes!")
+def test_chunked_parallel_alternative_implementations(
+    spatial_data, chunked_parallel_if_clause, chunked_parallel_full_iterations
+):
+    """Check that both alternative kernels return the same distances.
+
+    Each kernel is called once with ``X`` only and once with an explicit
+    ``Y`` holding the same values as ``X``.
+    """
+    # Absolute values of the spatial data with its last two rows dropped.
+    data = np.abs(spatial_data[:-2])
+    mismatch_msg = "Distances don't match for metric hellinger"
+
+    # Base tests that must pass!
+    if_clause_x = chunked_parallel_if_clause(data)
+    if_clause_xy = chunked_parallel_if_clause(data, data)
+
+    full_x = chunked_parallel_full_iterations(data)
+    full_xy = chunked_parallel_full_iterations(data, data)
+
+    assert_array_equal(full_x, if_clause_x, err_msg=mismatch_msg)
+    assert_array_equal(full_xy, if_clause_xy, err_msg=mismatch_msg)
+
+
+# Uncomment this to skip the tests
+# @pytest.mark.skip(reason="Focus on benchmark for now. This passes!")
 def test_chunked_parallel_special_metric_implementation_hellinger(
     spatial_data, stashed_chunked_implementation
 ):
@@ -118,3 +230,99 @@ def test_chunked_parallel_special_metric_implementation_hellinger(
         dist_chunked_diff_pair,
         err_msg="Distances don't match between stashed and current chunked_parallel implementations",
     )
+
+
+@pytest.mark.benchmark(group="benchmark_single_param")
+@pytest.mark.parametrize(
+    "benchmark_data",
+    [(side, side) for side in range(10, 1010, 100)],
+    indirect=["benchmark_data"],
+)
+def test_benchmark_full_iteration_no_symmetrical_skips_x_only(
+    benchmark,
+    benchmark_data,
+    chunked_parallel_full_iterations,
+):
+    """Benchmark the full-iteration kernel when only ``X`` is supplied."""
+    # Y is passed explicitly as None, i.e. the single-argument use case.
+    call_kwargs = {"X": benchmark_data, "Y": None}
+    benchmark.pedantic(
+        chunked_parallel_full_iterations,
+        kwargs=call_kwargs,
+        rounds=10,
+        iterations=10,
+        warmup_rounds=5,
+    )
+
+
+@pytest.mark.benchmark(group="benchmark_single_param")
+@pytest.mark.parametrize(
+    "benchmark_data",
+    [(side, side) for side in range(10, 1010, 100)],
+    indirect=["benchmark_data"],
+)
+def test_benchmark_check_symmetrical_and_skips_x_only(
+    benchmark,
+    benchmark_data,
+    chunked_parallel_if_clause,
+):
+    """Benchmark the if-clause kernel when only ``X`` is supplied."""
+    # Y is passed explicitly as None, i.e. the single-argument use case.
+    call_kwargs = {"X": benchmark_data, "Y": None}
+    benchmark.pedantic(
+        chunked_parallel_if_clause,
+        kwargs=call_kwargs,
+        rounds=10,
+        iterations=10,
+        warmup_rounds=5,
+    )
+
+
+@pytest.mark.benchmark(group="benchmark_X_Y_params")
+@pytest.mark.parametrize(
+    "benchmark_data",
+    [(side, side) for side in range(10, 1010, 100)],
+    indirect=["benchmark_data"],
+)
+def test_benchmark_full_iteration_no_symmetrical_skips_x_y(
+    benchmark,
+    benchmark_data,
+    chunked_parallel_full_iterations,
+):
+    """Benchmark the full-iteration kernel when both ``X`` and ``Y`` are given."""
+    # Two arguments: the same array is supplied as X and as Y.
+    call_kwargs = {"X": benchmark_data, "Y": benchmark_data}
+    benchmark.pedantic(
+        chunked_parallel_full_iterations,
+        kwargs=call_kwargs,
+        rounds=10,
+        iterations=10,
+        warmup_rounds=5,
+    )
+
+
+@pytest.mark.benchmark(group="benchmark_X_Y_params")
+@pytest.mark.parametrize(
+    "benchmark_data",
+    [(side, side) for side in range(10, 1010, 100)],
+    indirect=["benchmark_data"],
+)
+def test_benchmark_check_symmetrical_and_skips_x_y(
+    benchmark,
+    benchmark_data,
+    chunked_parallel_if_clause,
+):
+    """Benchmark the if-clause kernel when both ``X`` and ``Y`` are given."""
+    # Two arguments: the same array is supplied as X and as Y.
+    call_kwargs = {"X": benchmark_data, "Y": benchmark_data}
+    benchmark.pedantic(
+        chunked_parallel_if_clause,
+        kwargs=call_kwargs,
+        rounds=10,
+        iterations=10,
+        warmup_rounds=5,
+    )