rapidsai · ahendriksen · May 5, 2023 · May 5, 2023 · May 5, 2023 · May 5, 2023
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Tuning benchmarks.
+//
+// Goals:
+//
+// 1. Fast compile times to maintain iteration speed.
+// 2. Create benchmarks that can inform the design of the kernels.
+//
+// Non-goals:
+//
+// 1. Measure every distance operation. Instead, this benchmark measures just
+//    one distance operation at a time.
+// 2. Be useful for finding performance regressions. This is handled by the
+//    normal benchmarks.
+//
+// So far, both goals are partly achieved.
+//
+// RE (1), COMPILE TIMES: kernel.cu is fast to compile. This file is not.
+// When the internals of a pairwise distance kernel are changed, this file is
+// not recompiled.
+//
+// RE (2), BENCHMARKS WITH INTENT: this file contains a benchmark to check the
+// maximal throughput of a kernel. Measuring other things, like performance on
+// skinny or wide matrices, is not yet implemented.
+
+#include "kernel_cutlass.cuh"                               // launch_kernel
+#include <algorithm>                                        // std::min
+#include <common/benchmark.hpp>                             // RAFT_BENCH_REGISTER
+#include <raft/distance/detail/pairwise_matrix/params.cuh>  // pairwise_matrix_params
+#include <rmm/device_uvector.hpp>                           // rmm::device_uvector
+#include <vector>                                           // std::vector
+
+namespace raft::bench::distance::tune_cutlass {
+
+// Max throughput benchmark.
+//
+// Goal: Measure the maximum distances/sec that can be computed.
+//
+// To achieve this, we make sure that:
+//
+// - Input data size is a multiple of the block tile size.
+//
+// - Perfect distribution of work between SMs, i.e. the number of block tiles is
+//   a large multiple (num_waves) of the number of blocks (#SMs * occupancy).
+//
+// - Multiple iterations over Kblk are executed (num_k_iters).
+// One benchmark configuration: the problem size plus the precision switch that
+// is forwarded to launch_kernel().
+struct throughput_param {
+  int m;               // first problem dimension
+  int n;               // second problem dimension
+  int k;               // third problem dimension
+  bool use_1x_tfloat;  // precision switch, passed through to launch_kernel()
+};
+
+// Problem sizes to benchmark, all written as powers of two. The same eight
+// (m, n, k) shapes are listed twice: first with use_1x_tfloat enabled, then
+// with it disabled.
+const std::vector<throughput_param> throughput_params{
+  // use_1x_tfloat = true
+  {1 << 10, 1 << 10, 1 << 10, true},
+  {1 << 10, 1 << 10, 1 << 11, true},
+  {1 << 10, 1 << 10, 1 << 12, true},
+  {1 << 10, 1 << 10, 1 << 13, true},
+  {1 << 10, 1 << 14, 1 << 10, true},
+  {1 << 10, 1 << 14, 1 << 11, true},
+  {1 << 10, 1 << 14, 1 << 12, true},
+  {1 << 10, 1 << 14, 1 << 13, true},
+
+  // use_1x_tfloat = false
+  {1 << 10, 1 << 10, 1 << 10, false},
+  {1 << 10, 1 << 10, 1 << 11, false},
+  {1 << 10, 1 << 10, 1 << 12, false},
+  {1 << 10, 1 << 10, 1 << 13, false},
+  {1 << 10, 1 << 14, 1 << 10, false},
+  {1 << 10, 1 << 14, 1 << 11, false},
+  {1 << 10, 1 << 14, 1 << 12, false},
+  {1 << 10, 1 << 14, 1 << 13, false},
+};
+
+// Benchmark fixture that times launch_kernel() for one throughput_param.
+//
+// For every configuration it allocates the inputs (x: m*k, y: n*k, and one norm
+// per row of each), an m*n output buffer, packs them into
+// pairwise_matrix_params, and launches the kernel repeatedly through
+// loop_on_state(). Afterwards it publishes the problem size and two rate
+// counters ("core_ops/s" and "BW").
+struct throughput_cutlass : public fixture {
+  const throughput_param p;
+
+  throughput_cutlass(const throughput_param& p_) : p(p_) {}
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    // Widen to size_t so that the buffer sizes and the counters below are
+    // computed in size_t rather than int.
+    size_t m = p.m;
+    size_t n = p.n;
+    size_t k = p.k;
+
+    // DataT, OutT, IdxT, etc, are defined in kernel_cutlass.cuh.
+    // NOTE(review): the buffers are allocated but never written in this
+    // function, so the kernel is timed on whatever the allocations happen to
+    // contain -- confirm that this is acceptable for the measurement.
+    // `stream` is not declared in this struct; presumably it is provided by the
+    // fixture base class.
+    rmm::device_uvector<DataT> x_vec(m * k, stream);
+    rmm::device_uvector<DataT> y_vec(n * k, stream);
+    rmm::device_uvector<DataT> x_norm_vec(m, stream);
+    rmm::device_uvector<DataT> y_norm_vec(n, stream);
+    rmm::device_uvector<OutT> out_vec(m * n, stream);
+
+    auto x      = x_vec.data();
+    auto y      = y_vec.data();
+    auto x_norm = x_norm_vec.data();
+    auto y_norm = y_norm_vec.data();
+    auto out    = out_vec.data();
+    FinOpT fin_op{};
+
+    // Create kernel parameter struct. Flip x and y if column major.
+    // The narrowing from size_t to IdxT is intentional and made explicit.
+    IdxT ldx    = static_cast<IdxT>(row_major ? k : m);
+    IdxT ldy    = static_cast<IdxT>(row_major ? k : n);
+    IdxT ld_out = static_cast<IdxT>(row_major ? n : m);
+
+    // Template parameters of pairwise_matrix_params are defined in
+    // kernel_cutlass.cuh.
+    pairwise_matrix_params kparams{
+      IdxT(m), IdxT(n), IdxT(k), ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, row_major};
+
+    // Run benchmark
+    loop_on_state(state, [&]() { launch_kernel(kparams, p.use_1x_tfloat, stream); });
+
+    // Report metrics. We don't report flop/s because we do not know for each
+    // distance operation how many flops it costs. For L2_unexp and l1, we can
+    // double this number to get the flop/s. For l2 expanded, 2*core_ops/s should
+    // equal flop/s (ignoring the sqrt and subtracting from the norm).
+    size_t num_core_ops = m * n * k;
+    // Elements moved per launch: both inputs are counted as read once and the
+    // output as written once. The norm vectors are not counted.
+    size_t read_elts    = n * k + m * k;
+    size_t write_elts   = m * n;
+
+    // Echo the configuration so that every result row is self-describing.
+    state.counters["m"]        = benchmark::Counter(m);
+    state.counters["n"]        = benchmark::Counter(n);
+    state.counters["k"]        = benchmark::Counter(k);
+    state.counters["1xtfloat"] = benchmark::Counter(p.use_1x_tfloat);
+
+    // Both rate counters hold a per-iteration amount; the
+    // kIsIterationInvariantRate flag turns it into a per-second rate.
+    state.counters["core_ops/s"] = benchmark::Counter(num_core_ops,
+                                                      benchmark::Counter::kIsIterationInvariantRate,
+                                                      benchmark::Counter::OneK::kIs1000);
+
+    state.counters["BW"] = benchmark::Counter(write_elts * sizeof(OutT) + read_elts * sizeof(DataT),
+                                              benchmark::Counter::kIsIterationInvariantRate,
+                                              benchmark::Counter::OneK::kIs1000);
+  }
+};
+
+// Register the fixture once for every entry of throughput_params.
+RAFT_BENCH_REGISTER(throughput_cutlass, "", throughput_params);
+
+}  // namespace raft::bench::distance::tune_cutlass
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_cutlass.cuh"
+#include <raft/distance/detail/distance_ops/all_ops.cuh>  // distance_op
+#include <raft/distance/detail/pairwise_matrix/dispatch_sm80.cuh>
+#include <raft/distance/detail/pairwise_matrix/params.cuh>
+#include <raft/distance/distance_types.hpp>  // Compute_options
+#include <raft/util/arch.cuh>                // raft::util::arch::SM_compute_arch
+
+namespace raft::bench::distance::tune_cutlass {
+
+// Architecture range handed to the dispatcher: from SM_80 up to SM_future.
+namespace arch                 = raft::util::arch;
+constexpr auto sm_compat_range = arch::SM_range(arch::SM_80(), arch::SM_future());
+
+// Distance operation handed to the dispatcher: an l2_exp_distance_op
+// constructed with perform_sqrt = false.
+constexpr bool perform_sqrt = false;
+using OpT                   = raft::distance::detail::ops::l2_exp_distance_op<DataT, AccT, IdxT>;
+OpT distance_op{perform_sqrt};
+
+// Dispatches the benchmarked pairwise distance kernel on `stream`.
+//
+// params:        matrix pointers, sizes and strides for the kernel.
+// use_1x_tfloat: selects Compute_options::Fast_Reduced_Precision when true and
+//                Compute_options::Fast_Similar_Precision when false.
+// stream:        CUDA stream passed on to the dispatcher.
+void launch_kernel(pairwise_matrix_params params, bool use_1x_tfloat, cudaStream_t stream)
+{
+  // Translate the boolean switch into the dispatcher's precision option.
+  const auto compute_option = use_1x_tfloat
+                                ? raft::distance::Compute_options::Fast_Reduced_Precision
+                                : raft::distance::Compute_options::Fast_Similar_Precision;
+
+  raft::distance::detail::pairwise_matrix_sm80_dispatch(
+    distance_op, compute_option, params, sm_compat_range, stream);
+
+  // Report any error recorded by the CUDA runtime up to this point.
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+}  // namespace raft::bench::distance::tune_cutlass
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/operators.hpp>                          // raft::identity_op
+#include <raft/distance/detail/pairwise_matrix/params.cuh>  // pairwise_matrix_params
+
+namespace raft::bench::distance::tune_cutlass {
+
+// Compile-time configuration of the one kernel instantiation that is launched.
+
+// Element, accumulator, output and index types.
+using DataT = float;
+using AccT  = float;
+using OutT  = DataT;
+using IdxT  = int;
+
+// Matrix layout flag.
+constexpr bool row_major = true;
+
+// Type of the final operation stored in the kernel parameters.
+using FinOpT = raft::identity_op;
+
+// Kernel parameter struct specialised for the types above.
+using pairwise_matrix_params =
+  raft::distance::detail::pairwise_matrix_params<IdxT, DataT, OutT, FinOpT>;
+
+// Launches the kernel on `stream`; the definition lives in a separate
+// translation unit.
+void launch_kernel(pairwise_matrix_params params, bool use_1x_tfloat, cudaStream_t stream);
+
+}  // namespace raft::bench::distance::tune_cutlass