From 007c3d2c9efddec6dc46549a5aa2e9f48d4d1612 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 5 Dec 2024 15:48:41 -0500 Subject: [PATCH] Moving spectral embedding and kernel gramm APIs to cuVS (#463) Partially addresses #455 Authors: - Corey J. Nolet (https://github.com/cjnolet) - Ben Frederickson (https://github.com/benfred) Approvers: - Ben Frederickson (https://github.com/benfred) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cuvs/pull/463 --- build.sh | 12 +- cpp/CMakeLists.txt | 4 + cpp/include/cuvs/cluster/agglomerative.hpp | 1 + cpp/include/cuvs/distance/grammian.hpp | 665 +++++++++++++++ cpp/include/cuvs/embed/spectral.hpp | 40 + .../distance/detail/kernels/gram_matrix.cu | 481 +++++++++++ .../distance/detail/kernels/gram_matrix.cuh | 488 ----------- .../distance/detail/kernels/kernel_factory.cu | 61 ++ .../detail/kernels/kernel_factory.cuh | 65 -- .../detail/kernels/kernel_matrices.cu | 726 ++++++++++++++++ .../detail/kernels/kernel_matrices.cuh | 777 ------------------ .../distance/detail/kernels/rbf_fin_op.cuh | 4 +- .../detail/pairwise_matrix/dispatch-ext.cuh | 4 +- .../detail/pairwise_matrix/dispatch_rbf.cu | 6 +- cpp/src/distance/distance-ext.cuh | 4 +- cpp/src/distance/distance.cu | 4 +- cpp/src/embed/spectral.cu | 53 ++ cpp/src/sparse/cluster/cluster_solvers.cuh | 100 +++ cpp/src/sparse/cluster/detail/spectral.cuh | 111 +++ .../spectral/modularity_maximization.hpp | 176 ++++ .../cluster/detail/spectral/partition.hpp | 188 +++++ .../cluster/detail/spectral/spectral_util.cuh | 181 ++++ cpp/src/sparse/cluster/eigen_solvers.cuh | 107 +++ .../cluster/modularity_maximization.cuh | 86 ++ cpp/src/sparse/cluster/partition.cuh | 95 +++ cpp/test/CMakeLists.txt | 6 + cpp/test/distance/gram.cu | 174 ++++ cpp/test/distance/gram_base.cuh | 91 ++ cpp/test/sparse/cluster/cluster_solvers.cu | 105 +++ cpp/test/sparse/cluster/eigen_solvers.cu | 119 +++ cpp/test/sparse/cluster/spectral.cu | 109 +++ cpp/test/sparse/cluster/spectral_matrix.cu | 84 ++ cpp/test/sparse/gram.cu | 330 ++++++++ 33 files changed, 4105 insertions(+), 1352 deletions(-) create mode 100644 cpp/include/cuvs/distance/grammian.hpp create mode 100644 cpp/include/cuvs/embed/spectral.hpp create mode 100644 cpp/src/distance/detail/kernels/gram_matrix.cu delete mode 100644 cpp/src/distance/detail/kernels/gram_matrix.cuh create mode 100644 cpp/src/distance/detail/kernels/kernel_factory.cu delete mode 100644 cpp/src/distance/detail/kernels/kernel_factory.cuh create mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.cu delete mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.cuh create mode 100644 cpp/src/embed/spectral.cu create mode 100644 cpp/src/sparse/cluster/cluster_solvers.cuh create mode 100644 cpp/src/sparse/cluster/detail/spectral.cuh create mode 100644 cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp create mode 100644 cpp/src/sparse/cluster/detail/spectral/partition.hpp create mode 100644 cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh create mode 100644 cpp/src/sparse/cluster/eigen_solvers.cuh create mode 100644 cpp/src/sparse/cluster/modularity_maximization.cuh create mode 100644 cpp/src/sparse/cluster/partition.cuh create mode 100644 cpp/test/distance/gram.cu create mode 100644 cpp/test/distance/gram_base.cuh create mode 100644 cpp/test/sparse/cluster/cluster_solvers.cu create mode 100644 cpp/test/sparse/cluster/eigen_solvers.cu create mode 100644 cpp/test/sparse/cluster/spectral.cu 
 create mode 100644 cpp/test/sparse/cluster/spectral_matrix.cu
 create mode 100644 cpp/test/sparse/gram.cu

diff --git a/build.sh b/build.sh
index c08c2900e..bd5fa649b 100755
--- a/build.sh
+++ b/build.sh
@@ -76,8 +76,8 @@ BUILD_REPORT_METRICS=""
 BUILD_REPORT_INCL_CACHE_STATS=OFF
 BUILD_SHARED_LIBS=ON
 
-TEST_TARGETS="NEIGHBORS_ANN_CAGRA_TEST"
-ANN_BENCH_TARGETS="CUVS_ANN_BENCH_ALL"
+TEST_TARGETS=""
+ANN_BENCH_TARGETS=""
 
 CACHE_ARGS=""
 NVTX=ON
@@ -273,14 +273,6 @@ fi
 if hasArg tests || (( ${NUMARGS} == 0 )); then
 BUILD_TESTS=ON
 CMAKE_TARGET="${CMAKE_TARGET};${TEST_TARGETS}"
-
- # Force compile library when needed test targets are specified
- if [[ $CMAKE_TARGET == *"CAGRA_C_TEST"* || \
- $CMAKE_TARGET == *"INTEROP_TEST"* || \
- $CMAKE_TARGET == *"NEIGHBORS_ANN_CAGRA_TEST"* ]]; then
- echo "-- Enabling compiled lib for gtests"
- COMPILE_LIBRARY=ON
- fi
 fi
 
 if hasArg bench-ann || (( ${NUMARGS} == 0 )); then
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 199bb232d..95fb7e63b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -324,6 +324,9 @@ if(BUILD_SHARED_LIBS)
 src/cluster/kmeans_transform_float.cu
 src/cluster/single_linkage_float.cu
 src/core/bitset.cu
+ src/distance/detail/kernels/gram_matrix.cu
+ src/distance/detail/kernels/kernel_factory.cu
+ src/distance/detail/kernels/kernel_matrices.cu
 src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
 src/distance/detail/pairwise_matrix/dispatch_canberra_half_float_float_int.cu
 src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
@@ -370,6 +373,7 @@ if(BUILD_SHARED_LIBS)
 src/distance/distance.cu
 src/distance/pairwise_distance.cu
 src/distance/sparse_distance.cu
+ src/embed/spectral.cu
 src/neighbors/brute_force.cu
 src/neighbors/brute_force_serialize.cu
 src/neighbors/cagra_build_float.cu
diff --git a/cpp/include/cuvs/cluster/agglomerative.hpp b/cpp/include/cuvs/cluster/agglomerative.hpp
index e1da04085..8f7e8675a 100644
--- a/cpp/include/cuvs/cluster/agglomerative.hpp
+++ b/cpp/include/cuvs/cluster/agglomerative.hpp
@@ -18,6 +18,7 @@
 
 #include
 #include
+
 #include
 #include
diff --git a/cpp/include/cuvs/distance/grammian.hpp b/cpp/include/cuvs/distance/grammian.hpp
new file mode 100644
index 000000000..0c904d493
--- /dev/null
+++ b/cpp/include/cuvs/distance/grammian.hpp
@@ -0,0 +1,665 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cuvs::distance::kernels {
+
+template <typename math_t>
+using dense_input_matrix_view_t = raft::device_matrix_view<const math_t, int, raft::layout_stride>;
+template <typename math_t>
+using dense_output_matrix_view_t = raft::device_matrix_view<math_t, int, raft::layout_stride>;
+template <typename math_t>
+using csr_input_matrix_view_t = raft::device_csr_matrix_view<const math_t, int, int, int>;
+
+/**
+ * Base class for general Gram matrices.
+ * A Gram matrix is the Hermitian matrix of inner products G_ik = <x_i, x_k>.
+ * Here, the inner product is evaluated for all elements from vector sets X1
+ * and X2.
+ *
+ * To be more precise, on exit the output buffer will store:
+ * - if is_row_major == true: out[j+k*n1] = <x1_j, x2_k>,
+ * - if is_row_major == false: out[j*n2 + k] = <x1_j, x2_k>,
+ * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector
+ * from the x2 set.
+ */
+template <typename math_t>
+class GramMatrixBase {
+ protected:
+ cublasHandle_t cublas_handle;
+ bool legacy_interface;
+
+ public:
+ GramMatrixBase() : legacy_interface(false){};
+ [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle)
+ : cublas_handle(cublas_handle), legacy_interface(true){};
+
+ virtual ~GramMatrixBase(){};
+
+ /** Convenience function to evaluate the Gram matrix for two vector sets.
+ * Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+ void operator()(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1 = nullptr,
+ math_t* norm_x2 = nullptr);
+
+ /** Convenience function to evaluate the Gram matrix for two vector sets.
+ * Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+ void operator()(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1 = nullptr,
+ math_t* norm_x2 = nullptr);
+
+ /** Convenience function to evaluate the Gram matrix for two vector sets.
+ * Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+ void operator()(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ csr_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1 = nullptr,
+ math_t* norm_x2 = nullptr);
+
+ // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual
+
+ /** Evaluate the Gram matrix for two vector sets using simple dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ virtual void evaluate(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate the Gram matrix for two vector sets using simple dot product.
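+ * The base-class implementation forwards to the protected linear() helper,
+ * while kernel subclasses override evaluate() to apply their transformation.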
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ virtual void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate the Gram matrix for two vector sets using simple dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ virtual void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ csr_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate the Gram matrix for two vector sets using simple dot product.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ * major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+ [[deprecated]] virtual void evaluate(const math_t* x1,
+ int n1,
+ int n_cols,
+ const math_t* x2,
+ int n2,
+ math_t* out,
+ bool is_row_major,
+ cudaStream_t stream,
+ int ld1,
+ int ld2,
+ int ld_out);
+
+ /** Convenience function to evaluate the Gram matrix for two vector sets.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ * major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1
+ * @param ld2 leading dimension of x2
+ * @param ld_out leading dimension of out
+ */
+ [[deprecated]] void operator()(const math_t* x1,
+ int n1,
+ int n_cols,
+ const math_t* x2,
+ int n2,
+ math_t* out,
+ bool is_row_major,
+ cudaStream_t stream,
+ int ld1 = 0,
+ int ld2 = 0,
+ int ld_out = 0);
+
+ protected:
+ /** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2
+ *
+ * Can be used as a building block for more complex kernel functions.
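+ * In matrix form this computes out = x1 * x2^T, i.e. out[i,j] = <x1_i, x2_j>.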
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ * major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1
+ * @param ld2 leading dimension of x2
+ * @param ld_out leading dimension of out
+ */
+ [[deprecated]] void linear(const math_t* x1,
+ int n1,
+ int n_cols,
+ const math_t* x2,
+ int n2,
+ math_t* out,
+ bool is_row_major,
+ cudaStream_t stream,
+ int ld1,
+ int ld2,
+ int ld_out);
+
+ protected:
+ bool get_is_row_major(dense_output_matrix_view_t<math_t> matrix);
+ bool get_is_row_major(dense_input_matrix_view_t<math_t> matrix);
+ bool get_is_col_major(dense_output_matrix_view_t<math_t> matrix);
+ bool get_is_col_major(dense_input_matrix_view_t<math_t> matrix);
+
+ /** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ */
+ void linear(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out);
+
+ /** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ */
+ void linear(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out);
+
+ /** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ */
+ void linear(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ csr_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out);
+};
+
+template <typename math_t>
+class KernelFactory {
+ public:
+ static GramMatrixBase<math_t>* create(KernelParams params);
+ [[deprecated]] static GramMatrixBase<math_t>* create(KernelParams params, cublasHandle_t handle);
+};
+
+/**
+ * Create a kernel matrix using polynomial kernel function.
+ */
+template <typename math_t, typename exp_t>
+class PolynomialKernel : public GramMatrixBase<math_t> {
+ exp_t exponent;
+ math_t gain;
+ math_t offset;
+
+ void applyKernel(
+ math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream);
+
+ public:
+ /**
+ * Constructs a polynomial kernel object.
+ * It evaluates the kernel matrix using the following formula:
+ * K_ij = (gain*<x1_i, x2_k> + offset)^exponent
+ *
+ * @tparam math_t floating point type
+ * @tparam exp_t type of exponent
+ * @param exponent
+ * @param gain
+ * @param offset
+ */
+ PolynomialKernel(exp_t exponent, math_t gain, math_t offset)
+ : GramMatrixBase<math_t>(), exponent(exponent), gain(gain), offset(offset){};
+
+ [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle)
+ : GramMatrixBase<math_t>(handle), exponent(exponent), gain(gain), offset(offset){};
+
+ /** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ void evaluate(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ csr_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate the Gram matrix using the legacy interface.
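+ * @note This path relies on the cuBLAS handle supplied to the deprecated
+ * constructor, so it is only usable on objects created that way.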
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ * major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+ [[deprecated]] void evaluate(const math_t* x1,
+ int n1,
+ int n_cols,
+ const math_t* x2,
+ int n2,
+ math_t* out,
+ bool is_row_major,
+ cudaStream_t stream,
+ int ld1,
+ int ld2,
+ int ld_out);
+};
+
+/**
+ * Create a kernel matrix using tanh kernel function.
+ */
+template <typename math_t>
+class TanhKernel : public GramMatrixBase<math_t> {
+ math_t gain, offset;
+
+ void applyKernel(
+ math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream);
+
+ public:
+ /**
+ * Constructs a tanh kernel object.
+ * It evaluates the kernel matrix using the following formula:
+ * K_ij = tanh(gain*<x1_i, x2_k> + offset)
+ *
+ * @tparam math_t floating point type
+ * @param gain
+ * @param offset
+ */
+ TanhKernel(math_t gain, math_t offset) : GramMatrixBase<math_t>(), gain(gain), offset(offset) {}
+
+ [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle)
+ : GramMatrixBase<math_t>(handle), gain(gain), offset(offset){};
+
+ /** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ void evaluate(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ csr_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate the Gram matrix using the legacy interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ * major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+ [[deprecated]] void evaluate(const math_t* x1,
+ int n1,
+ int n_cols,
+ const math_t* x2,
+ int n2,
+ math_t* out,
+ bool is_row_major,
+ cudaStream_t stream,
+ int ld1,
+ int ld2,
+ int ld_out);
+};
+
+/**
+ * Create a kernel matrix using RBF kernel function.
+ */
+template <typename math_t>
+class RBFKernel : public GramMatrixBase<math_t> {
+ math_t gain;
+
+ void applyKernel(math_t* inout,
+ int ld,
+ int rows,
+ int cols,
+ math_t* norm_x1,
+ math_t* norm_x2,
+ bool is_row_major,
+ cudaStream_t stream);
+
+ public:
+ /**
+ * Constructs a RBF kernel object.
+ * It evaluates the kernel matrix using the following formula:
+ * K_ij = exp(-gain*|x1_i - x2_k|^2)
+ *
+ * @tparam math_t floating point type
+ * @param gain
+ */
+ RBFKernel(math_t gain) : GramMatrixBase<math_t>(), gain(gain){};
+
+ [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle)
+ : GramMatrixBase<math_t>(handle), gain(gain){};
+
+ void matrixRowNormL2(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> matrix,
+ math_t* target);
+
+ void matrixRowNormL2(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> matrix,
+ math_t* target);
+
+ /** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and | | denotes Euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+ void evaluate(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and | | denotes Euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
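+ *
+ * A minimal usage sketch (illustrative only; `x1_view`, `x2_view` and
+ * `gram_view` are assumed to be pre-allocated device matrix views):
+ * @code{.cpp}
+ * cuvs::distance::kernels::RBFKernel<float> kernel{gain};
+ * // norm pointers may be left nullptr, in which case row norms are
+ * // computed internally
+ * kernel(handle, x1_view, x2_view, gram_view);
+ * @endcode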
+ */
+ void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and | | denotes Euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+ void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ csr_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate the Gram matrix using the legacy interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ * major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+ [[deprecated]] void evaluate(const math_t* x1,
+ int n1,
+ int n_cols,
+ const math_t* x2,
+ int n2,
+ math_t* out,
+ bool is_row_major,
+ cudaStream_t stream,
+ int ld1,
+ int ld2,
+ int ld_out);
+};
+}; // end namespace cuvs::distance::kernels
diff --git a/cpp/include/cuvs/embed/spectral.hpp b/cpp/include/cuvs/embed/spectral.hpp
new file mode 100644
index 000000000..1a8fed96a
--- /dev/null
+++ b/cpp/include/cuvs/embed/spectral.hpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+namespace cuvs::embed::spectral {
+
+/**
+ * Given a COO formatted (symmetric) knn graph, this function computes the spectral embeddings
+ * (lowest n_components eigenvectors) using the Lanczos min-cut algorithm. Please note that this
+ * algorithm does not compute a full Laplacian eigenmap, as a Laplacian eigenmap would embed each
+ * connected component separately. Laplacian eigenmaps can be built from this algorithm by running
+ * it on the vectors for each connected component.
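+ *
+ * A minimal usage sketch (illustrative only; assumes `graph` is a symmetric
+ * COO knn graph of type raft::device_coo_matrix<float, int, int, float>
+ * already built on the device):
+ * @code{.cpp}
+ * auto embedding = raft::make_device_matrix<float, int>(handle, n_rows, n_components);
+ * cuvs::embed::spectral::fit(handle, graph.view(), n_components, embedding.view());
+ * @endcode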
+ *
+ * @param[in] handle raft resources handle
+ * @param[in] knn_graph KNN Graph
+ * @param[in] n_components the number of components to project into
+ * @param[out] out output array for embedding (size n*n_components)
+ * @param[in] seed random seed
+ */
+void fit(const raft::resources& handle,
+ raft::device_coo_matrix_view<float, int, int, float> knn_graph,
+ int n_components,
+ raft::device_matrix_view<float, int, raft::row_major> out,
+ unsigned long long seed = 0L);
+}; // namespace cuvs::embed::spectral
diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cu b/cpp/src/distance/detail/kernels/gram_matrix.cu
new file mode 100644
index 000000000..0e4f3e639
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/gram_matrix.cu
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../distance.cuh"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cuvs::distance::kernels {
+
+/**
+ * Base class for general Gram matrices.
+ * A Gram matrix is the Hermitian matrix of inner products G_ik = <x_i, x_k>.
+ * Here, the inner product is evaluated for all elements from vector sets X1
+ * and X2.
+ *
+ * To be more precise, on exit the output buffer will store:
+ * - if is_row_major == true: out[j+k*n1] = <x1_j, x2_k>,
+ * - if is_row_major == false: out[j*n2 + k] = <x1_j, x2_k>,
+ * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector
+ * from the x2 set.
+ */
+
+/** Convenience function to evaluate the Gram matrix for two vector sets.
+ * Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::operator()(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2)
+{
+ evaluate(handle, x1, x2, out, norm_x1, norm_x2);
+}
+
+/** Convenience function to evaluate the Gram matrix for two vector sets.
+ * Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +// unfortunately, 'evaluate' cannot be templatized as it needs to be virtual + +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} + +/** Evaluate the Gram matrix for two vector sets using simple dot product. 
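+ * This deprecated pointer-based overload simply forwards to the legacy
+ * linear() helper below.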
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void GramMatrixBase::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ +template +[[deprecated]] void GramMatrixBase::operator()(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); + if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } + if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } + if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } + evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. 
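+ * Both layout branches below issue the same logical GEMM (out = x1 * x2^T);
+ * only the cuBLAS transpose flags and operand order differ.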
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ +template +[[deprecated]] void GramMatrixBase::linear(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + n2, + n1, + n_cols, + &alpha, + x2, + ld2, + x1, + ld1, + &beta, + out, + ld_out, + stream)); + } else { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + n1, + n2, + n_cols, + &alpha, + x1, + ld1, + x2, + ld2, + &beta, + out, + ld_out, + stream)); + } +} + +template +bool GramMatrixBase::get_is_row_major(dense_output_matrix_view_t matrix) +{ + return (matrix.stride(1) == 1); +} +template +bool GramMatrixBase::get_is_row_major(dense_input_matrix_view_t matrix) +{ + return (matrix.stride(1) == 1); +} + +template +bool GramMatrixBase::get_is_col_major(dense_output_matrix_view_t matrix) +{ + return (matrix.stride(0) == 1); +} + +template +bool GramMatrixBase::get_is_col_major(dense_input_matrix_view_t matrix) +{ + return (matrix.stride(0) == 1); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check is_row_major consistency + bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, + "GramMatrix leading dimensions for x1, x2 and out do not match"); + + // check dimensions + int n1 = out.extent(0); + int n2 = out.extent(1); + int n_cols = x1.extent(1); + ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + // extract major stride + int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); + int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); + int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); + + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + true, + false, + n2, + n1, + n_cols, + &alpha, + x2.data_handle(), + ld2, + x1.data_handle(), + ld1, + &beta, + out.data_handle(), + ld_out, + raft::resource::get_cuda_stream(handle)); + } else { + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + false, + true, + n1, + n2, + n_cols, + &alpha, + x1.data_handle(), + ld1, + x2.data_handle(), + ld2, + &beta, + out.data_handle(), + ld_out, + raft::resource::get_cuda_stream(handle)); + } +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check is_row_major consistency + bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, "GramMatrix leading dimensions for x2 and out do not match"); + + // check dimensions + auto x1_structure = x1.structure_view(); + ASSERT(x1_structure.get_n_rows() == out.extent(0), + "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == out.extent(1), + "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == x1_structure.get_n_cols(), + "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + math_t alpha = 1.0; + math_t beta = 0.0; + + raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check layout consistency (w.r.t. 
strides a matrix might be both row & col major) + bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1); + bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0); + + ASSERT(is_row_major_nopad || is_col_major_nopad, + "Sparse linear Kernel distance does not support ld_out parameter"); + + // switch a,b based on is_row_major + if (is_col_major_nopad) { + auto out_row_major = raft::make_device_matrix_view( + out.data_handle(), out.extent(1), out.extent(0)); + + cuvs::distance::pairwise_distance( + handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); + } else { + auto out_row_major = raft::make_device_matrix_view( + out.data_handle(), out.extent(0), out.extent(1)); + cuvs::distance::pairwise_distance( + handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); + } +} + +template class GramMatrixBase; +template class GramMatrixBase; + +}; // namespace cuvs::distance::kernels diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cuh b/cpp/src/distance/detail/kernels/gram_matrix.cuh deleted file mode 100644 index d435fb4d1..000000000 --- a/cpp/src/distance/detail/kernels/gram_matrix.cuh +++ /dev/null @@ -1,488 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "../../distance.cuh" -#include -#include -#include -#include -// #include -#include -#include -#include -#include - -namespace cuvs::distance::kernels::detail { - -template -using dense_input_matrix_view_t = raft::device_matrix_view; -template -using dense_output_matrix_view_t = raft::device_matrix_view; -template -using csr_input_matrix_view_t = raft::device_csr_matrix_view; - -/** - * Base class for general Gram matrices - * A Gram matrix is the Hermitian matrix of inner probucts G_ik = - * Here, the inner product is evaluated for all elements from vectors sets X1, - * and X2. - * - * To be more precise, on exit the output buffer will store: - * - if is_row_major == true: out[j+k*n1] = , - * - if is_row_major == false: out[j*n2 + k] = , - * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector - * from the x2 set. - */ -template -class GramMatrixBase { - protected: - cublasHandle_t cublas_handle; - bool legacy_interface; - - public: - GramMatrixBase() : legacy_interface(false){}; - [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle) - : cublas_handle(cublas_handle), legacy_interface(true){}; - - virtual ~GramMatrixBase(){}; - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. 
- * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual - - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. 
- * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] virtual void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - */ - [[deprecated]] void operator()(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1 = 0, - int ld2 = 0, - int ld_out = 0) - { - ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); - if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } - if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } - if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } - evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } - - protected: - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - */ - [[deprecated]] void linear(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - math_t alpha = 1.0; - math_t beta = 0.0; - if (is_row_major) { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - n2, - n1, - n_cols, - &alpha, - x2, - ld2, - x1, - ld1, - &beta, - out, - ld_out, - stream)); - } else { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_T, - n1, - n2, - n_cols, - &alpha, - x1, - ld1, - x2, - ld2, - &beta, - out, - ld_out, - stream)); - } - } - - protected: - bool get_is_row_major(dense_output_matrix_view_t matrix) - { - return (matrix.stride(1) == 1); - } - - bool get_is_row_major(dense_input_matrix_view_t matrix) - { - return (matrix.stride(1) == 1); - } - - bool get_is_col_major(dense_output_matrix_view_t matrix) - { - return (matrix.stride(0) == 1); - } - - bool get_is_col_major(dense_input_matrix_view_t matrix) - { - return (matrix.stride(0) == 1); - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check is_row_major consistency - bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); - bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); - ASSERT(is_row_major || is_col_major, - "GramMatrix leading dimensions for x1, x2 and out do not match"); - - // check dimensions - int n1 = out.extent(0); - int n2 = out.extent(1); - int n_cols = x1.extent(1); - ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); - - // extract major stride - int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); - int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); - int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); - - math_t alpha = 1.0; - math_t beta = 0.0; - if (is_row_major) { - // #TODO: Use mdspan-based API when stride-capable - // https://github.com/rapidsai/raft/issues/875 - raft::linalg::gemm(handle, - true, - false, - n2, - n1, - n_cols, - &alpha, - x2.data_handle(), - ld2, - x1.data_handle(), - ld1, - &beta, - out.data_handle(), - ld_out, - resource::get_cuda_stream(handle)); - } else { - // #TODO: Use mdspan-based API when stride-capable - // https://github.com/rapidsai/raft/issues/875 - raft::linalg::gemm(handle, - false, - true, - n1, - n2, - n_cols, - &alpha, - x1.data_handle(), - ld1, - x2.data_handle(), - ld2, - &beta, - out.data_handle(), - ld_out, - resource::get_cuda_stream(handle)); - } - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check is_row_major consistency - bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); - bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); - ASSERT(is_row_major || is_col_major, - "GramMatrix leading dimensions for x2 and out do not match"); - - // check dimensions - auto x1_structure = x1.structure_view(); - ASSERT(x1_structure.get_n_rows() == out.extent(0), - "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.extent(0) == out.extent(1), - "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x2.extent(1) == x1_structure.get_n_cols(), - "GramMatrix input matrix dimensions for x1 and x2 do not match"); - - math_t alpha = 1.0; - math_t beta = 0.0; - - raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check layout consistency (w.r.t. 
strides a matrix might be both row & col major)
-    bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1);
-    bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0);
-
-    ASSERT(is_row_major_nopad || is_col_major_nopad,
-           "Sparse linear Kernel distance does not support ld_out parameter");
-
-    // switch a,b based on is_row_major
-    if (is_col_major_nopad) {
-      auto out_row_major = raft::make_device_matrix_view<math_t, int, raft::row_major>(
-        out.data_handle(), out.extent(1), out.extent(0));
-      raft::sparse::distance::pairwise_distance(
-        handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0);
-    } else {
-      auto out_row_major = raft::make_device_matrix_view<math_t, int, raft::row_major>(
-        out.data_handle(), out.extent(0), out.extent(1));
-      raft::sparse::distance::pairwise_distance(
-        handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0);
-    }
-  }
-};
-
-};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cu b/cpp/src/distance/detail/kernels/kernel_factory.cu
new file mode 100644
index 000000000..25f9e9b84
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/kernel_factory.cu
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuvs/distance/grammian.hpp>
+
+namespace cuvs::distance::kernels {
+
+template <typename math_t>
+GramMatrixBase<math_t>* KernelFactory<math_t>::create(KernelParams params)
+{
+  GramMatrixBase<math_t>* res;
+  // KernelParams is not templated, we convert the parameters to math_t here:
+  math_t coef0 = params.coef0;
+  math_t gamma = params.gamma;
+  switch (params.kernel) {
+    case LINEAR: res = new GramMatrixBase<math_t>(); break;
+    case POLYNOMIAL: res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0); break;
+    case TANH: res = new TanhKernel<math_t>(gamma, coef0); break;
+    case RBF: res = new RBFKernel<math_t>(gamma); break;
+    default: throw raft::exception("Kernel not implemented");
+  }
+  return res;
+}
+
+template <typename math_t>
+[[deprecated]] GramMatrixBase<math_t>* KernelFactory<math_t>::create(KernelParams params,
+                                                                     cublasHandle_t handle)
+{
+  GramMatrixBase<math_t>* res;
+  // KernelParams is not templated, we convert the parameters to math_t here:
+  math_t coef0 = params.coef0;
+  math_t gamma = params.gamma;
+  switch (params.kernel) {
+    case LINEAR: res = new GramMatrixBase<math_t>(handle); break;
+    case POLYNOMIAL:
+      res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0, handle);
+      break;
+    case TANH: res = new TanhKernel<math_t>(gamma, coef0, handle); break;
+    case RBF: res = new RBFKernel<math_t>(gamma, handle); break;
+    default: throw raft::exception("Kernel not implemented");
+  }
+  return res;
+}
+
+template class KernelFactory<float>;
+template class KernelFactory<double>;
+
+};  // end namespace cuvs::distance::kernels
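For orientation, the relocated factory is driven through KernelParams; a minimal usage sketch (illustrative only, not code from this PR — assumes a raft::resources handle and device matrix views x1, x2, out already exist):

    #include <cuvs/distance/grammian.hpp>

    using namespace cuvs::distance::kernels;
    KernelParams params;
    params.kernel = RBF;
    params.gamma  = 0.5;
    auto* gram = KernelFactory<float>::create(params);
    gram->evaluate(handle, x1, x2, out, nullptr, nullptr);  // row norms computed lazily
    delete gram;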
diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cuh b/cpp/src/distance/detail/kernels/kernel_factory.cuh
deleted file mode 100644
index 5c50a95a3..000000000
--- a/cpp/src/distance/detail/kernels/kernel_factory.cuh
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "gram_matrix.cuh"
-#include "kernel_matrices.cuh"
-
-#include
-#include
-
-namespace cuvs::distance::kernels::detail {
-
-template <typename math_t>
-class KernelFactory {
- public:
-  static GramMatrixBase<math_t>* create(KernelParams params)
-  {
-    GramMatrixBase<math_t>* res;
-    // KernelParams is not templated, we convert the parameters to math_t here:
-    math_t coef0 = params.coef0;
-    math_t gamma = params.gamma;
-    switch (params.kernel) {
-      case LINEAR: res = new GramMatrixBase<math_t>(); break;
-      case POLYNOMIAL: res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0); break;
-      case TANH: res = new TanhKernel<math_t>(gamma, coef0); break;
-      case RBF: res = new RBFKernel<math_t>(gamma); break;
-      default: throw raft::exception("Kernel not implemented");
-    }
-    return res;
-  }
-
-  [[deprecated]] static GramMatrixBase<math_t>* create(KernelParams params, cublasHandle_t handle)
-  {
-    GramMatrixBase<math_t>* res;
-    // KernelParams is not templated, we convert the parameters to math_t here:
-    math_t coef0 = params.coef0;
-    math_t gamma = params.gamma;
-    switch (params.kernel) {
-      case LINEAR: res = new GramMatrixBase<math_t>(handle); break;
-      case POLYNOMIAL:
-        res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0, handle);
-        break;
-      case TANH: res = new TanhKernel<math_t>(gamma, coef0, handle); break;
-      case RBF: res = new RBFKernel<math_t>(gamma, handle); break;
-      default: throw raft::exception("Kernel not implemented");
-    }
-    return res;
-  }
-};
-
-};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cu b/cpp/src/distance/detail/kernels/kernel_matrices.cu
new file mode 100644
index 000000000..526ca106f
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/kernel_matrices.cu
@@ -0,0 +1,726 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../../distance/distance.cuh"
+#include
+
+#include "rbf_fin_op.cuh"
+#include
+#include
+#include
+#include
+#include
+
+namespace cuvs::distance::kernels {
+
+/** Epilogue function for polynomial kernel without padding.
+ * Calculates output = (gain*in + offset)^exponent
+ * @param inout device vector in column major format, size [len]
+ * @param len array length
+ * @param exponent
+ * @param gain
+ * @param offset
+ */
+template <typename math_t, typename exp_t>
+RAFT_KERNEL polynomial_kernel_nopad(
+  math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset)
+{
+  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
+       tid += blockDim.x * gridDim.x) {
+    inout[tid] = pow(gain * inout[tid] + offset, exponent);
+  }
+}
+
+/** Epilogue function for polynomial kernel with padding.
+ * Calculates output = (gain*input + offset)^exponent
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param exponent
+ * @param gain
+ * @param offset
+ */
+template <typename math_t, typename exp_t>
+RAFT_KERNEL polynomial_kernel(
+  math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y)
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent);
+    }
+}
+
+/** Epilogue function for tanh kernel without padding.
+ * Calculates output = tanh(gain*input + offset)
+ * @param inout device vector, size [len]
+ * @param len length of the input vector
+ * @param gain
+ * @param offset
+ */
+template <typename math_t>
+RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
+{
+  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
+       tid += blockDim.x * gridDim.x) {
+    inout[tid] = tanh(gain * inout[tid] + offset);
+  }
+}
+
+/** Epilogue function for tanh kernel with padding.
+ * Calculates output = tanh(gain*input + offset)
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param gain
+ * @param offset
+ */
+template <typename math_t>
+RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y)
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset);
+    }
+}
+
+/** Epilogue function for rbf kernel using expansion.
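+ * (The expansion here is the identity |x_i - y_j|^2 = |x_i|^2 + |y_j|^2 - 2<x_i, y_j>,
+ * which lets the RBF kernel be applied as an epilogue on top of a plain dot-product
+ * Gram matrix.)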
+ *
+ * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij));
+ *
+ * Intended usage
+ *   - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk
+ *   - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X
+ *   - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y
+ *
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param norm_x l2-norm of X's rows
+ * @param norm_y l2-norm of Y's rows
+ * @param gain
+ */
+template <typename math_t>
+RAFT_KERNEL rbf_kernel_expanded(
+  math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y) {
+    math_t norm_y_val = norm_y[tidy];
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] =
+        exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2));
+    }
+  }
+}
+
+std::tuple<dim3, dim3> generateLaunchConfig2dElementwiseOp(int n1, int n2)
+{
+  dim3 block_shape       = dim3(32, 4);
+  const int num_blocks_x = raft::ceildiv(n1, 32);
+  const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1);
+  dim3 grid_shape        = dim3(num_blocks_x, num_blocks_y);
+  return std::make_tuple(grid_shape, block_shape);
+}
+
+/**
+ * Create a kernel matrix using polynomial kernel function.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::applyKernel(
+  math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
+{
+  const int n_minor = is_row_major ? cols : rows;
+  if (ld == n_minor) {
+    polynomial_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
+      inout, rows * cols, exponent, gain, offset);
+  } else {
+    int n1 = is_row_major ? cols : rows;
+    int n2 = is_row_major ? rows : cols;
+    auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
+    polynomial_kernel<<<grid_shape, block_shape, 0, stream>>>(
+      inout, ld, n1, n2, exponent, gain, offset);
+  }
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes the dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::evaluate(raft::resources const& handle,
+                                               dense_input_matrix_view_t<math_t> x1,
+                                               dense_input_matrix_view_t<math_t> x2,
+                                               dense_output_matrix_view_t<math_t> out,
+                                               math_t* norm_x1,
+                                               math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes the dot product.
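+ *
+ * For illustration only (names and setup below are assumed, not part of this patch),
+ * a call with prepared device matrix views might look like:
+ * @code{.cpp}
+ *   // exponent = 3, gain = 1, offset = 1; x1 may be a dense or CSR view
+ *   PolynomialKernel<float, int> poly(3, 1.0f, 1.0f);
+ *   poly.evaluate(handle, x1, x2, out, nullptr, nullptr);  // norm args are unused
+ * @endcode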
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::evaluate(raft::resources const& handle,
+                                               csr_input_matrix_view_t<math_t> x1,
+                                               dense_input_matrix_view_t<math_t> x2,
+                                               dense_output_matrix_view_t<math_t> out,
+                                               math_t* norm_x1,
+                                               math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes the dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::evaluate(raft::resources const& handle,
+                                               csr_input_matrix_view_t<math_t> x1,
+                                               csr_input_matrix_view_t<math_t> x2,
+                                               dense_output_matrix_view_t<math_t> out,
+                                               math_t* norm_x1,
+                                               math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate the Gram matrix using the legacy interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+template <typename math_t, typename exp_t>
+[[deprecated]] void PolynomialKernel<math_t, exp_t>::evaluate(const math_t* x1,
+                                                              int n1,
+                                                              int n_cols,
+                                                              const math_t* x2,
+                                                              int n2,
+                                                              math_t* out,
+                                                              bool is_row_major,
+                                                              cudaStream_t stream,
+                                                              int ld1,
+                                                              int ld2,
+                                                              int ld_out)
+{
+  ASSERT(GramMatrixBase<math_t>::legacy_interface,
+         "Legacy interface can only be used with legacy ctor.");
+  GramMatrixBase<math_t>::linear(
+    x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+  applyKernel(out, ld_out, n1, n2, is_row_major, stream);
+}
+
+/**
+ * Create a kernel matrix using tanh kernel function.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::applyKernel(
+  math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
+{
+  const int n_minor = is_row_major ? cols : rows;
+  if (ld == n_minor) {
+    tanh_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
+      inout, rows * cols, gain, offset);
+  } else {
+    int n1 = is_row_major ? cols : rows;
+    int n2 = is_row_major ? rows : cols;
+    auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
+    tanh_kernel<<<grid_shape, block_shape, 0, stream>>>(inout, ld, n1, n2, gain, offset);
+  }
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes the dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::evaluate(raft::resources const& handle,
+                                  dense_input_matrix_view_t<math_t> x1,
+                                  dense_input_matrix_view_t<math_t> x2,
+                                  dense_output_matrix_view_t<math_t> out,
+                                  math_t* norm_x1,
+                                  math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes the dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::evaluate(raft::resources const& handle,
+                                  csr_input_matrix_view_t<math_t> x1,
+                                  dense_input_matrix_view_t<math_t> x2,
+                                  dense_output_matrix_view_t<math_t> out,
+                                  math_t* norm_x1,
+                                  math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes the dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::evaluate(raft::resources const& handle,
+                                  csr_input_matrix_view_t<math_t> x1,
+                                  csr_input_matrix_view_t<math_t> x2,
+                                  dense_output_matrix_view_t<math_t> out,
+                                  math_t* norm_x1,
+                                  math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate the Gram matrix using the legacy interface.
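+ *
+ * (This raw-pointer overload is marked [[deprecated]] and kept for older callers;
+ * new code should prefer the mdspan-based evaluate() overloads above.)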
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+template <typename math_t>
+[[deprecated]] void TanhKernel<math_t>::evaluate(const math_t* x1,
+                                                 int n1,
+                                                 int n_cols,
+                                                 const math_t* x2,
+                                                 int n2,
+                                                 math_t* out,
+                                                 bool is_row_major,
+                                                 cudaStream_t stream,
+                                                 int ld1,
+                                                 int ld2,
+                                                 int ld_out)
+{
+  ASSERT(GramMatrixBase<math_t>::legacy_interface,
+         "Legacy interface can only be used with legacy ctor.");
+  GramMatrixBase<math_t>::linear(
+    x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+  applyKernel(out, ld_out, n1, n2, is_row_major, stream);
+}
+
+/**
+ * Create a kernel matrix using RBF kernel function.
+ */
+template <typename math_t>
+void RBFKernel<math_t>::applyKernel(math_t* inout,
+                                    int ld,
+                                    int rows,
+                                    int cols,
+                                    math_t* norm_x1,
+                                    math_t* norm_x2,
+                                    bool is_row_major,
+                                    cudaStream_t stream)
+{
+  int n1          = is_row_major ? cols : rows;
+  int n2          = is_row_major ? rows : cols;
+  math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1;
+  math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2;
+  auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
+  rbf_kernel_expanded<<<grid_shape, block_shape, 0, stream>>>(
+    inout, ld, n1, n2, norm_n1, norm_n2, gain);
+}
+
+template <typename math_t>
+void RBFKernel<math_t>::matrixRowNormL2(raft::resources const& handle,
+                                        dense_input_matrix_view_t<math_t> matrix,
+                                        math_t* target)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(matrix);
+  int minor         = is_row_major ? matrix.extent(1) : matrix.extent(0);
+  int ld            = is_row_major ? matrix.stride(0) : matrix.stride(1);
+  ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter");
+  raft::linalg::rowNorm(target,
+                        matrix.data_handle(),
+                        matrix.extent(1),
+                        matrix.extent(0),
+                        raft::linalg::NormType::L2Norm,
+                        is_row_major,
+                        raft::resource::get_cuda_stream(handle));
+}
+
+template <typename math_t>
+void RBFKernel<math_t>::matrixRowNormL2(raft::resources const& handle,
+                                        csr_input_matrix_view_t<math_t> matrix,
+                                        math_t* target)
+{
+  auto matrix_structure = matrix.structure_view();
+  raft::sparse::linalg::rowNormCsr(handle,
+                                   matrix_structure.get_indptr().data(),
+                                   matrix.get_elements().data(),
+                                   matrix_structure.get_nnz(),
+                                   matrix_structure.get_n_rows(),
+                                   target,
+                                   raft::linalg::NormType::L2Norm);
+}
+
+/** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and | | denotes the Euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
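+ *
+ * When the same inputs are evaluated repeatedly, the row norms can be computed once
+ * up front and passed in; a sketch (buffer names are illustrative, not part of this
+ * patch):
+ * @code{.cpp}
+ *   rmm::device_uvector<float> norm_x1(n1, stream);
+ *   rmm::device_uvector<float> norm_x2(n2, stream);
+ *   raft::linalg::rowNorm(norm_x1.data(), x1.data_handle(), n_cols, n1,
+ *                         raft::linalg::NormType::L2Norm, true, stream);
+ *   raft::linalg::rowNorm(norm_x2.data(), x2.data_handle(), n_cols, n2,
+ *                         raft::linalg::NormType::L2Norm, true, stream);
+ *   rbf.evaluate(handle, x1, x2, out, norm_x1.data(), norm_x2.data());
+ * @endcode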
+ */
+template <typename math_t>
+void RBFKernel<math_t>::evaluate(raft::resources const& handle,
+                                 dense_input_matrix_view_t<math_t> x1,
+                                 dense_input_matrix_view_t<math_t> x2,
+                                 dense_output_matrix_view_t<math_t> out,
+                                 math_t* norm_x1,
+                                 math_t* norm_x2)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  // lazy compute norms if not given
+  rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
+  rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
+  if (norm_x1 == nullptr) {
+    tmp_norm_x1.reserve(x1.extent(0), stream);
+    norm_x1 = tmp_norm_x1.data();
+    matrixRowNormL2(handle, x1, norm_x1);
+  }
+  if (norm_x2 == nullptr) {
+    tmp_norm_x2.reserve(x2.extent(0), stream);
+    norm_x2 = tmp_norm_x2.data();
+    matrixRowNormL2(handle, x2, norm_x2);
+  }
+
+  // compute L2expanded
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              norm_x1,
+              norm_x2,
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and | | denotes the Euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void RBFKernel<math_t>::evaluate(raft::resources const& handle,
+                                 csr_input_matrix_view_t<math_t> x1,
+                                 dense_input_matrix_view_t<math_t> x2,
+                                 dense_output_matrix_view_t<math_t> out,
+                                 math_t* norm_x1,
+                                 math_t* norm_x2)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  // lazy compute norms if not given
+  rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
+  rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
+  if (norm_x1 == nullptr) {
+    tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream);
+    norm_x1 = tmp_norm_x1.data();
+    matrixRowNormL2(handle, x1, norm_x1);
+  }
+  if (norm_x2 == nullptr) {
+    tmp_norm_x2.reserve(x2.extent(0), stream);
+    norm_x2 = tmp_norm_x2.data();
+    matrixRowNormL2(handle, x2, norm_x2);
+  }
+
+  // compute L2expanded
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              norm_x1,
+              norm_x2,
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and | | denotes the Euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void RBFKernel<math_t>::evaluate(raft::resources const& handle,
+                                 csr_input_matrix_view_t<math_t> x1,
+                                 csr_input_matrix_view_t<math_t> x2,
+                                 dense_output_matrix_view_t<math_t> out,
+                                 math_t* norm_x1,
+                                 math_t* norm_x2)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  // lazy compute norms if not given
+  rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
+  rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
+  if (norm_x1 == nullptr) {
+    tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream);
+    norm_x1 = tmp_norm_x1.data();
+    matrixRowNormL2(handle, x1, norm_x1);
+  }
+  if (norm_x2 == nullptr) {
+    tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream);
+    norm_x2 = tmp_norm_x2.data();
+    matrixRowNormL2(handle, x2, norm_x2);
+  }
+
+  // compute L2expanded
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              norm_x1,
+              norm_x2,
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate the Gram matrix using the legacy interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+template <typename math_t>
+[[deprecated]] void RBFKernel<math_t>::evaluate(const math_t* x1,
+                                                int n1,
+                                                int n_cols,
+                                                const math_t* x2,
+                                                int n2,
+                                                math_t* out,
+                                                bool is_row_major,
+                                                cudaStream_t stream,
+                                                int ld1,
+                                                int ld2,
+                                                int ld_out)
+{
+  ASSERT(GramMatrixBase<math_t>::legacy_interface,
+         "Legacy interface can only be used with legacy ctor.");
+  int minor1    = is_row_major ? n_cols : n1;
+  int minor2    = is_row_major ? n_cols : n2;
+  int minor_out = is_row_major ? n2 : n1;
+  ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter");
+  ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter");
+  ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter");
+
+  math_t gain   = this->gain;
+  using index_t = int64_t;
+
+  rbf_fin_op fin_op{gain};
+
+  raft::resources handle;
+  raft::resource::set_cuda_stream(handle, stream);
+
+  cuvs::distance::distance<cuvs::distance::DistanceType::L2Unexpanded,
+                           math_t,
+                           math_t,
+                           math_t,
+                           decltype(fin_op),
+                           index_t>(handle,
+                                    const_cast<math_t*>(x1),
+                                    const_cast<math_t*>(x2),
+                                    out,
+                                    n1,
+                                    n2,
+                                    n_cols,
+                                    NULL,
+                                    0,
+                                    fin_op,
+                                    is_row_major);
+}
+
+template class PolynomialKernel<float, int>;
+template class PolynomialKernel<double, int>;
+template class TanhKernel<float>;
+template class TanhKernel<double>;
+template class RBFKernel<float>;
+template class RBFKernel<double>;
+
+};  // end namespace cuvs::distance::kernels
diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cuh b/cpp/src/distance/detail/kernels/kernel_matrices.cuh
deleted file mode 100644
index bff5bda92..000000000
--- a/cpp/src/distance/detail/kernels/kernel_matrices.cuh
+++ /dev/null
@@ -1,777 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "gram_matrix.cuh" - -#include "../detail/kernels/rbf_fin_op.cuh" -#include -#include -#include -#include -#include - -namespace cuvs::distance::kernels::detail { - -/** Epiloge function for polynomial kernel without padding. - * Calculates output = (gain*in + offset)^exponent - * @param inout device vector in column major format, size [len] - * @param len array length - * @param exponent - * @param gain - * @param offset - */ -template -RAFT_KERNEL polynomial_kernel_nopad( - math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = pow(gain * inout[tid] + offset, exponent); - } -} - -/** Epiloge function for polynomial kernel with padding. - * Calculates output = (gain*input + offset)^exponent - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param exponent - * @param gain - * @param offset - */ -template -RAFT_KERNEL polynomial_kernel( - math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent); - } -} - -/** Epiloge function for tanh kernel without padding. - * Calculates output = tanh(gain*input + offset) - * @param inout device vector, size [len] - * @param len length of the input vector - * @param gain - * @param offset - */ -template -RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = tanh(gain * inout[tid] + offset); - } -} - -/** Epiloge function for tanh kernel without padding. - * Calculates output = tanh(gain*input + offset) - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param gain - * @param offset - */ -template -RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset); - } -} - -/** Epiloge function for rbf kernel using expansion. 
- * - * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij)); - * - * Intended usage - * - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk - * - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X - * - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y - * - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param norm_x l2-norm of X's rows - * @param norm_y l2-norm of Y's rows - * @param gain - */ -template -RAFT_KERNEL rbf_kernel_expanded( - math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) { - math_t norm_y_val = norm_y[tidy]; - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = - exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2)); - } - } -} - -namespace { -std::tuple generateLaunchConfig2dElementwiseOp(int n1, int n2) -{ - dim3 block_shape = dim3(32, 4); - const int num_blocks_x = raft::ceildiv(n1, 32); - const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1); - dim3 grid_shape = dim3(num_blocks_x, num_blocks_y); - return std::make_tuple(grid_shape, block_shape); -} -} // namespace - -/** - * Create a kernel matrix using polynomial kernel function. - */ -template -class PolynomialKernel : public GramMatrixBase { - exp_t exponent; - math_t gain; - math_t offset; - - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, exponent, gain, offset); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - polynomial_kernel<<>>( - inout, ld, n1, n2, exponent, gain, offset); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - public: - /** - * Constructs a polynomial kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = (gain* + offset)^exponent - * - * @tparam math_t floating point type - * @tparam exp_t type of exponent - * @param exponent - * @param gain - * @param offset - */ - PolynomialKernel(exp_t exponent, math_t gain, math_t offset) - : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset) - { - } - - [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset) - { - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. 
- */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } -}; - -/** - * Create a kernel matrix using tanh kernel function. - */ -template -class TanhKernel : public GramMatrixBase { - math_t gain, offset; - - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, gain, offset); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - public: - /** - * Constructs a tanh kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = tanh(gain* + offset) - * - * @tparam math_t floating point type - * @param gain - * @param offset - */ - TanhKernel(math_t gain, math_t offset) : GramMatrixBase(), gain(gain), offset(offset) {} - - [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain), offset(offset) - { - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. 
- * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } -}; - -/** - * Create a kernel matrix using RBF kernel function. - */ -template -class RBFKernel : public GramMatrixBase { - math_t gain; - - void applyKernel(math_t* inout, - int ld, - int rows, - int cols, - math_t* norm_x1, - math_t* norm_x2, - bool is_row_major, - cudaStream_t stream) - { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; - math_t* norm_n2 = is_row_major ? 
norm_x1 : norm_x2; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - rbf_kernel_expanded<<>>( - inout, ld, n1, n2, norm_n1, norm_n2, gain); - } - - public: - /** - * Constructs a RBF kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = exp(-gain*|x1_i- x2_k|^2) - * - * @tparam math_t floating point type - * @param gain - */ - RBFKernel(math_t gain) : GramMatrixBase(), gain(gain) {} - - [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain) - { - } - - void matrixRowNormL2(raft::resources const& handle, - dense_input_matrix_view_t matrix, - math_t* target) - { - bool is_row_major = GramMatrixBase::get_is_row_major(matrix); - int minor = is_row_major ? matrix.extent(1) : matrix.extent(0); - int ld = is_row_major ? matrix.stride(0) : matrix.stride(1); - ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter"); - raft::linalg::rowNorm(target, - matrix.data_handle(), - matrix.extent(1), - matrix.extent(0), - raft::linalg::NormType::L2Norm, - is_row_major, - resource::get_cuda_stream(handle)); - } - - void matrixRowNormL2(raft::resources const& handle, - csr_input_matrix_view_t matrix, - math_t* target) - { - auto matrix_structure = matrix.structure_view(); - raft::sparse::linalg::rowNormCsr(handle, - matrix_structure.get_indptr().data(), - matrix.get_elements().data(), - matrix_structure.get_nnz(), - matrix_structure.get_n_rows(), - target, - raft::linalg::NormType::L2Norm); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.extent(0), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.extent(0), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. 
- * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.extent(0), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - int minor1 = is_row_major ? n_cols : n1; - int minor2 = is_row_major ? n_cols : n2; - int minor_out = is_row_major ? n2 : n1; - ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); - ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); - ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); - - math_t gain = this->gain; - using index_t = int64_t; - - rbf_fin_op fin_op{gain}; - - raft::resources handle; - resource::set_cuda_stream(handle, stream); - - cuvs::distance::distance(handle, - const_cast(x1), - const_cast(x2), - out, - n1, - n2, - n_cols, - NULL, - 0, - fin_op, - is_row_major); - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/rbf_fin_op.cuh b/cpp/src/distance/detail/kernels/rbf_fin_op.cuh index 73588baea..53022368d 100644 --- a/cpp/src/distance/detail/kernels/rbf_fin_op.cuh +++ b/cpp/src/distance/detail/kernels/rbf_fin_op.cuh @@ -28,7 +28,7 @@ #include // raft::exp #include // HD -namespace cuvs::distance::kernels::detail { +namespace cuvs::distance::kernels { /** @brief: Final op for Gram matrix with RBF kernel. 
* @@ -48,4 +48,4 @@ struct rbf_fin_op { } }; // struct rbf_fin_op -} // namespace cuvs::distance::kernels::detail +} // namespace cuvs::distance::kernels diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh index edfd7cf5f..49497ab3a 100644 --- a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh @@ -118,9 +118,7 @@ instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default( instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default( cuvs::distance::detail::ops::russel_rao_distance_op, int); instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo( - cuvs::distance::detail::ops::l2_unexp_distance_op, - int64_t, - cuvs::distance::kernels::detail::rbf_fin_op); + cuvs::distance::detail::ops::l2_unexp_distance_op, int64_t, cuvs::distance::kernels::rbf_fin_op); instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default( cuvs::distance::detail::ops::l2_exp_distance_op, int64_t); diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu index 3c8f25109..a2e12b6df 100644 --- a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu @@ -50,7 +50,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( float, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_raft_distance_detail_pairwise_matrix_dispatch( @@ -58,7 +58,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( double, double, double, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_raft_distance_detail_pairwise_matrix_dispatch( @@ -66,7 +66,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( half, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); #undef instantiate_raft_distance_detail_pairwise_matrix_dispatch diff --git a/cpp/src/distance/distance-ext.cuh b/cpp/src/distance/distance-ext.cuh index e623f76ba..a692a62a3 100644 --- a/cpp/src/distance/distance-ext.cuh +++ b/cpp/src/distance/distance-ext.cuh @@ -273,13 +273,13 @@ instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpan float, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpanded, double, double, double, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); #undef instantiate_cuvs_distance_distance_extra diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu index c1d39f360..47e72460f 100644 --- a/cpp/src/distance/distance.cu +++ b/cpp/src/distance/distance.cu @@ -139,13 +139,13 @@ instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpan float, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpanded, double, double, double, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); #undef instantiate_cuvs_distance_distance_extra diff --git a/cpp/src/embed/spectral.cu b/cpp/src/embed/spectral.cu new file mode 100644 index 
000000000..c3d4e3fc7
--- /dev/null
+++ b/cpp/src/embed/spectral.cu
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../sparse/cluster/detail/spectral.cuh"
+#include
+#include
+#include
+
+namespace cuvs::embed::spectral {
+
+/**
+ * Given a COO formatted (symmetric) knn graph, this function computes the spectral embeddings
+ * (lowest n_components eigenvectors), using the Lanczos min-cut algorithm.
+ * @param handle raft handle for managing expensive resources
+ * @param knn_graph COO formatted (symmetric) knn graph connecting the n samples
+ * @param n_components the number of components to project the samples into
+ * @param out output array for the embedding (size n*n_components)
+ * @param seed random seed for the eigensolver
+ */
+void fit(const raft::resources& handle,
+         raft::device_coo_matrix_view knn_graph,
+         int n_components,
+         raft::device_matrix_view out,
+         unsigned long long seed)
+{
+  cuvs::sparse::cluster::spectral::detail::fit_embedding(
+    handle,
+    knn_graph.structure_view().get_rows().data(),
+    knn_graph.structure_view().get_cols().data(),
+    knn_graph.get_elements().data(),
+    knn_graph.structure_view().get_nnz(),
+    knn_graph.structure_view().get_n_rows(),
+    n_components,
+    out.data_handle(),
+    seed);
+}
+};  // namespace cuvs::embed::spectral
diff --git a/cpp/src/sparse/cluster/cluster_solvers.cuh b/cpp/src/sparse/cluster/cluster_solvers.cuh
new file mode 100644
index 000000000..7b4cf6ab3
--- /dev/null
+++ b/cpp/src/sparse/cluster/cluster_solvers.cuh
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
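// A minimal usage sketch for the new public embedding API (illustrative only,
// not part of this diff). The names `res`, `graph`, and `n` are hypothetical,
// and the view template parameters are assumed to match the float/int
// instantiation declared in cuvs/embed/spectral.hpp.
//
// #include <cuvs/embed/spectral.hpp>
// #include <raft/core/device_coo_matrix.hpp>
// #include <raft/core/device_mdarray.hpp>
//
// void embed_knn_graph(raft::resources const& res,
//                      raft::device_coo_matrix_view<float, int, int, int> graph,
//                      int n,
//                      int n_components)
// {
//   // One row per sample, one column per spectral component.
//   auto emb = raft::make_device_matrix<float, int>(res, n, n_components);
//   cuvs::embed::spectral::fit(res, graph, n_components, emb.view(), 1234567ULL);
// }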
+ */
+
+#ifndef __CLUSTER_SOLVERS_H
+#define __CLUSTER_SOLVERS_H
+
+#pragma once
+
+#include
+#include
+#include
+
+#include  // for std::pair
+
+namespace cuvs {
+namespace spectral {
+
+using namespace raft::spectral::matrix;
+
+// aggregate of control params for Cluster Solver:
+//
+template
+struct cluster_solver_config_t {
+  size_type_t n_clusters;
+  size_type_t maxIter;
+
+  value_type_t tol;
+
+  unsigned long long seed{123456};
+};
+
+template
+struct kmeans_solver_t {
+  explicit kmeans_solver_t(
+    cluster_solver_config_t const& config)
+    : config_(config)
+  {
+  }
+
+  std::pair solve(raft::resources const& handle,
+                  size_type_t n_obs_vecs,
+                  size_type_t dim,
+                  value_type_t const* __restrict__ obs,
+                  index_type_t* __restrict__ codes) const
+  {
+    RAFT_EXPECTS(obs != nullptr, "Null obs buffer.");
+    RAFT_EXPECTS(codes != nullptr, "Null codes buffer.");
+    value_type_t residual{};
+    index_type_t iters{};
+    cuvs::cluster::kmeans::params km_params;
+    km_params.n_clusters     = config_.n_clusters;
+    km_params.tol            = config_.tol;
+    km_params.max_iter       = config_.maxIter;
+    km_params.rng_state.seed = config_.seed;
+
+    auto X      = raft::make_device_matrix_view(obs, n_obs_vecs, dim);
+    auto labels = raft::make_device_vector_view(codes, n_obs_vecs);
+    auto centroids =
+      raft::make_device_matrix(handle, config_.n_clusters, dim);
+    auto weight = raft::make_device_vector(handle, n_obs_vecs);
+    thrust::fill(raft::resource::get_thrust_policy(handle),
+                 weight.data_handle(),
+                 weight.data_handle() + n_obs_vecs,
+                 1);
+
+    auto sw = std::make_optional((raft::device_vector_view)weight.view());
+    cuvs::cluster::kmeans::fit_predict(handle,
+                                       km_params,
+                                       X,
+                                       sw,
+                                       centroids.view(),
+                                       labels,
+                                       raft::make_host_scalar_view(&residual),
+                                       raft::make_host_scalar_view(&iters));
+    return std::make_pair(residual, iters);
+  }
+
+  auto const& get_config(void) const { return config_; }
+
+ private:
+  cluster_solver_config_t config_;
+};
+
+}  // namespace spectral
+}  // namespace cuvs
+
+#endif
\ No newline at end of file
diff --git a/cpp/src/sparse/cluster/detail/spectral.cuh b/cpp/src/sparse/cluster/detail/spectral.cuh
new file mode 100644
index 000000000..571d92bf5
--- /dev/null
+++ b/cpp/src/sparse/cluster/detail/spectral.cuh
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
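// A configuration sketch for the k-means cluster solver above, mirroring the
// unit tests in this PR (values are arbitrary; `handle`, `d_obs`, `d_codes`,
// `n_obs`, and `dim` are hypothetical names):
//
//   cuvs::spectral::cluster_solver_config_t<int, double> cfg{5, 100, 1.0e-10, 42ULL};
//   cuvs::spectral::kmeans_solver_t<int, double> solver{cfg};
//   auto [residual, iters] = solver.solve(handle, n_obs, dim, d_obs, d_codes);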
+ */ + +#include "../cluster_solvers.cuh" +#include "../eigen_solvers.cuh" +#include "../partition.cuh" +#include +#include +#include +#include +#include + +#include + +namespace cuvs::sparse::cluster::spectral::detail { + +template +void fit_embedding(raft::resources const& handle, + int* rows, + int* cols, + T* vals, + int nnz, + int n, + int n_components, + T* out, + unsigned long long seed = 1234567) +{ + auto stream = raft::resource::get_cuda_stream(handle); + rmm::device_uvector src_offsets(n + 1, stream); + rmm::device_uvector dst_cols(nnz, stream); + rmm::device_uvector dst_vals(nnz, stream); + raft::sparse::convert::coo_to_csr( + handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data()); + + rmm::device_uvector eigVals(n_components + 1, stream); + rmm::device_uvector eigVecs(n * (n_components + 1), stream); + rmm::device_uvector labels(n, stream); + + raft::resource::sync_stream(handle, stream); + + /** + * Raft spectral clustering + */ + using index_type = int; + using value_type = T; + + index_type* ro = src_offsets.data(); + index_type* ci = dst_cols.data(); + value_type* vs = dst_vals.data(); + + raft::spectral::matrix::sparse_matrix_t const r_csr_m{ + handle, ro, ci, vs, n, nnz}; + + index_type neigvs = n_components + 1; + index_type maxiter = 4000; // default reset value (when set to 0); + value_type tol = 0.01; + index_type restart_iter = 15 + neigvs; // what cugraph is using + + cuvs::spectral::eigen_solver_config_t cfg{ + neigvs, maxiter, restart_iter, tol}; + + cfg.seed = seed; + + cuvs::spectral::lanczos_solver_t eig_solver{cfg}; + + // cluster computation here is irrelevant, + // hence define a no-op such solver to + // feed partition(): + // + struct no_op_cluster_solver_t { + using index_type_t = index_type; + using size_type_t = index_type; + using value_type_t = value_type; + + std::pair solve(raft::resources const& handle, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { + return std::make_pair(0, 0); + } + }; + + cuvs::spectral::partition(handle, + r_csr_m, + eig_solver, + no_op_cluster_solver_t{}, + labels.data(), + eigVals.data(), + eigVecs.data()); + + raft::copy(out, eigVecs.data() + n, n * n_components, stream); + + RAFT_CUDA_TRY(cudaGetLastError()); +} + +}; // namespace cuvs::sparse::cluster::spectral::detail \ No newline at end of file diff --git a/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp b/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp new file mode 100644 index 000000000..a42ad2dc1 --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
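// Parameter mapping for the detail entry point above (a sketch; `d_rows`,
// `d_cols`, `d_vals` are hypothetical device COO buffers and `d_out` is a
// device buffer of size n * n_components):
//
//   cuvs::sparse::cluster::spectral::detail::fit_embedding<float>(
//     handle, d_rows, d_cols, d_vals, nnz, n, n_components, d_out, /*seed=*/1234567ULL);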
+ */ + +#pragma once + +#include +#include + +// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS +#include "../../cluster_solvers.cuh" +#include "../../eigen_solvers.cuh" +#include "spectral_util.cuh" +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs { +namespace spectral { +namespace detail { + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param clusters (Output, device memory, n entries) Cluster + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return error flag. + */ +template +std::tuple modularity_maximization( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + + std::tuple + stats; // # iters eigen solver, cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // Compute eigenvectors of Modularity Matrix + + // Initialize Modularity Matrix + raft::spectral::matrix::modularity_matrix_t B{handle, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute eigenvectors corresponding to largest eigenvalues + std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); + + // notice that at this point the matrix has already been transposed, so we are scaling + // columns + auto dataset_view = raft::make_device_matrix_view(eigVecs, nEigVecs, n); + raft::linalg::row_normalize( + handle, raft::make_const_mdspan(dataset_view), dataset_view, raft::linalg::L2Norm); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} +//=================================================== +// Analysis of graph partition +// ========================================================= + +/// Compute modularity +/** This function determines the modularity based on a graph and cluster assignments + * @param G Weighted graph in 
CSR format + * @param nClusters Number of clusters. + * @param clusters (Input, device memory, n entries) Cluster assignments. + * @param modularity On exit, modularity + */ +template +void analyzeModularity(raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + vertex_t nClusters, + vertex_t const* __restrict__ clusters, + weight_t& modularity) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + weight_t partModularity, clustersize; + + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto stream = raft::resource::get_cuda_stream(handle); + + // Device memory + raft::spectral::matrix::vector_t part_i(handle, n); + raft::spectral::matrix::vector_t Bx(handle, n); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Modularity + raft::spectral::matrix::modularity_matrix_t B{handle, csr_m}; + + // Initialize output + modularity = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { + WARNING("empty partition"); + continue; + } + + // Record results + modularity += partModularity; + } + + modularity = modularity / B.diagonal_.nrm1(); +} + +} // namespace detail +} // namespace spectral +} // namespace cuvs diff --git a/cpp/src/sparse/cluster/detail/spectral/partition.hpp b/cpp/src/sparse/cluster/detail/spectral/partition.hpp new file mode 100644 index 000000000..77e83c17d --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral/partition.hpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS +#include + +#include "../../cluster_solvers.cuh" +#include "../../eigen_solvers.cuh" +#include "spectral_util.cuh" +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs { +namespace spectral { +namespace detail { + +// ========================================================= +// Spectral partitioner +// ========================================================= + +/// Compute spectral graph partition +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. 
+ * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param clusters (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return statistics: number of eigensolver iterations, . + */ +template +std::tuple partition( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + + std::tuple + stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, + // cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // ------------------------------------------------------- + // Spectral partitioner + // ------------------------------------------------------- + + // Compute eigenvectors of Laplacian + + // Initialize Laplacian + /// sparse_matrix_t A{handle, graph}; + raft::spectral::matrix::laplacian_matrix_t L{handle, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute smallest eigenvalues and eigenvectors + std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} + +// ========================================================= +// Analysis of graph partition +// ========================================================= + +/// Compute cost function for partition +/** This function determines the edges cut by a partition and a cost + * function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param clusters (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. + * @return error flag. 
+ */ +template +void analyzePartition(raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + vertex_t nClusters, + const vertex_t* __restrict__ clusters, + weight_t& edgeCut, + weight_t& cost) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + + weight_t partEdgesCut, clustersize; + + // Device memory + raft::spectral::matrix::vector_t part_i(handle, n); + raft::spectral::matrix::vector_t Lx(handle, n); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Laplacian + /// sparse_matrix_t A{handle, graph}; + raft::spectral::matrix::laplacian_matrix_t L{handle, csr_m}; + + // Initialize output + cost = 0; + edgeCut = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + // Construct indicator vector for ith partition + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { + WARNING("empty partition"); + continue; + } + + // Record results + cost += partEdgesCut / clustersize; + edgeCut += partEdgesCut / 2; + } +} + +} // namespace detail +} // namespace spectral +} // namespace cuvs diff --git a/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh b/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh new file mode 100644 index 000000000..1d2e58e2a --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
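// Reading the analysis outputs above (a sketch; `csr_m`, `n_clusters`, and the
// device buffer `d_clusters` are hypothetical, with float edge weights):
//
//   float edge_cut{0}, cost{0};
//   cuvs::spectral::detail::analyzePartition(handle, csr_m, n_clusters, d_clusters,
//                                            edge_cut, cost);
//
// edge_cut is the total weight of edges crossing partitions (each undirected
// edge counted once, hence the partEdgesCut / 2 above); cost additionally
// normalizes each partition's cut by its size.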
+ */ + +#pragma once + +#include +#include +#include +#include + +// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuvs { +namespace spectral { + +template +void transform_eigen_matrix(raft::resources const& handle, + edge_t n, + vertex_t nEigVecs, + weight_t* eigVecs) +{ + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto thrust_exec_policy = raft::resource::get_thrust_policy(handle); + + const weight_t zero{0.0}; + const weight_t one{1.0}; + + // Whiten eigenvector matrix + for (auto i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + + mean = thrust::reduce(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + RAFT_CHECK_CUDA(stream); + mean /= n; + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(mean), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::minus()); + RAFT_CHECK_CUDA(stream); + + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + + std /= std::sqrt(static_cast(n)); + + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::divides()); + RAFT_CHECK_CUDA(stream); + } + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + raft::spectral::matrix::vector_t work(handle, nEigVecs * n); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t*)NULL, + nEigVecs, + work.raw(), + nEigVecs, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync( + eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); + } +} + +namespace { +/// Functor to generate indicator vectors +/** For use in Thrust transform + */ +template +struct equal_to_i_op { + const index_type_t i; + + public: + equal_to_i_op(index_type_t _i) : i(_i) {} + template + __host__ __device__ void operator()(Tuple_ t) + { + thrust::get<1>(t) = (thrust::get<0>(t) == i) ? 
(value_type_t)1.0 : (value_type_t)0.0; + } +}; +} // namespace + +// Construct indicator vector for ith partition +// +template +bool construct_indicator(raft::resources const& handle, + edge_t index, + edge_t n, + weight_t& clustersize, + weight_t& partStats, + vertex_t const* __restrict__ clusters, + raft::spectral::matrix::vector_t& part_i, + raft::spectral::matrix::vector_t& Bx, + raft::spectral::matrix::laplacian_matrix_t const& B) +{ + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto thrust_exec_policy = raft::resource::get_thrust_policy(handle); + + thrust::for_each( + thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(index)); + RAFT_CHECK_CUDA(stream); + + // Compute size of ith partition + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot( + cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); + + clustersize = round(clustersize); + if (clustersize < 0.5) { return false; } + + // Compute part stats + B.mv(1, part_i.raw(), 0, Bx.raw()); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + + return true; +} + +} // namespace spectral +} // namespace cuvs diff --git a/cpp/src/sparse/cluster/eigen_solvers.cuh b/cpp/src/sparse/cluster/eigen_solvers.cuh new file mode 100644 index 000000000..1b2501d68 --- /dev/null +++ b/cpp/src/sparse/cluster/eigen_solvers.cuh @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
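// A tiny illustration of the indicator construction above (a sketch, usable
// only within this translation unit since equal_to_i_op sits in an anonymous
// namespace; requires thrust/device_vector.h): for clusters = {0, 1, 1, 0} and
// i = 1, part_i becomes {0, 1, 1, 0}, so dot(part_i, part_i) recovers the
// partition size (here 2).
//
//   thrust::device_vector<int> clusters_ex{0, 1, 1, 0};
//   thrust::device_vector<double> part_ex(4);
//   thrust::for_each(
//     thrust::make_zip_iterator(thrust::make_tuple(clusters_ex.begin(), part_ex.begin())),
//     thrust::make_zip_iterator(thrust::make_tuple(clusters_ex.end(), part_ex.end())),
//     equal_to_i_op<int, double>(1));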
+ */ +#ifndef __EIGEN_SOLVERS_H +#define __EIGEN_SOLVERS_H + +#pragma once + +#include +#include + +namespace cuvs { +namespace spectral { + +// aggregate of control params for Eigen Solver: +// +template +struct eigen_solver_config_t { + size_type_t n_eigVecs; + size_type_t maxIter; + + size_type_t restartIter; + value_type_t tol; + + bool reorthogonalize{false}; + unsigned long long seed{ + 1234567}; // CAVEAT: this default value is now common to all instances of using seed in + // Lanczos; was not the case before: there were places where a default seed = 123456 + // was used; this may trigger slightly different # solver iterations +}; + +template +struct lanczos_solver_t { + explicit lanczos_solver_t( + eigen_solver_config_t const& config) + : config_(config) + { + } + + index_type_t solve_smallest_eigenvectors( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + index_type_t iters{}; + raft::sparse::solver::computeSmallestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); + return iters; + } + + index_type_t solve_largest_eigenvectors( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + index_type_t iters{}; + raft::sparse::solver::computeLargestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); + return iters; + } + + auto const& get_config(void) const { return config_; } + + private: + eigen_solver_config_t config_; +}; + +} // namespace spectral +} // namespace cuvs + +#endif diff --git a/cpp/src/sparse/cluster/modularity_maximization.cuh b/cpp/src/sparse/cluster/modularity_maximization.cuh new file mode 100644 index 000000000..71cba6927 --- /dev/null +++ b/cpp/src/sparse/cluster/modularity_maximization.cuh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __MODULARITY_MAXIMIZATION_H +#define __MODULARITY_MAXIMIZATION_H + +#pragma once + +#include "detail/spectral/modularity_maximization.hpp" + +#include + +namespace cuvs { +namespace spectral { + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. 
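// A configuration sketch for the Lanczos eigensolver above, mirroring the unit
// tests in this PR (values are illustrative; `A`, `d_eigvals`, `d_eigvecs` are
// hypothetical device-side names):
//
//   cuvs::spectral::eigen_solver_config_t<int, double> eig_cfg{
//     /*n_eigVecs=*/10, /*maxIter=*/100, /*restartIter=*/25, /*tol=*/1.0e-10};
//   cuvs::spectral::lanczos_solver_t<int, double> eig_solver{eig_cfg};
//   int iters = eig_solver.solve_smallest_eigenvectors(handle, A, d_eigvals, d_eigvecs);
//   // solve_largest_eigenvectors has the same signature; both return the
//   // number of Lanczos iterations performed.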
This
+ * partition attempts to maximize the modularity of the graph.
+ *
+ * @param handle raft handle for managing expensive resources
+ * @param csr_m Weighted graph in CSR format
+ * @param eigen_solver Eigensolver implementation
+ * @param cluster_solver Cluster solver implementation
+ * @param clusters (Output, device memory, n entries) Partition
+ * assignments.
+ * @param eigVals Output eigenvalue array pointer on device
+ * @param eigVecs Output eigenvector array pointer on device
+ * @return statistics: number of eigensolver iterations, cluster solver
+ * residual, and number of cluster solver iterations.
+ */
+template
+std::tuple modularity_maximization(
+  raft::resources const& handle,
+  raft::spectral::matrix::sparse_matrix_t const& csr_m,
+  EigenSolver const& eigen_solver,
+  ClusterSolver const& cluster_solver,
+  vertex_t* __restrict__ clusters,
+  weight_t* eigVals,
+  weight_t* eigVecs)
+{
+  return cuvs::spectral::detail::
+    modularity_maximization(
+      handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs);
+}
+// =========================================================
+// Analysis of graph partition
+// =========================================================
+
+/// Compute modularity
+/** This function determines the modularity based on a graph and cluster assignments
+ * @param handle raft handle for managing expensive resources
+ * @param csr_m Weighted graph in CSR format
+ * @param nClusters Number of clusters.
+ * @param clusters (Input, device memory, n entries) Cluster assignments.
+ * @param modularity On exit, modularity
+ */
+template
+void analyzeModularity(raft::resources const& handle,
+                       raft::spectral::matrix::sparse_matrix_t const& csr_m,
+                       vertex_t nClusters,
+                       vertex_t const* __restrict__ clusters,
+                       weight_t& modularity)
+{
+  cuvs::spectral::detail::analyzeModularity(
+    handle, csr_m, nClusters, clusters, modularity);
+}
+
+}  // namespace spectral
+}  // namespace cuvs
+
+#endif
\ No newline at end of file
diff --git a/cpp/src/sparse/cluster/partition.cuh b/cpp/src/sparse/cluster/partition.cuh
new file mode 100644
index 000000000..df78a8a2d
--- /dev/null
+++ b/cpp/src/sparse/cluster/partition.cuh
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PARTITION_H
+#define __PARTITION_H
+
+#pragma once
+
+#include "detail/spectral/partition.hpp"
+
+#include
+
+namespace cuvs {
+namespace spectral {
+
+// =========================================================
+// Spectral partitioner
+// =========================================================
+
+/// Compute spectral graph partition
+/** Compute partition for a weighted undirected graph.
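// Putting the wrapper above together with the solver sketches shown earlier
// (illustrative only; `matrix`, `n_clusters`, `d_clusters`, `d_eigvals`, and
// `d_eigvecs` are hypothetical, and the solvers' value type must match the
// matrix's weight type):
//
//   auto [eig_iters, km_residual, km_iters] = cuvs::spectral::modularity_maximization(
//     handle, matrix, eig_solver, cluster_solver, d_clusters, d_eigvals, d_eigvecs);
//   float modularity{0};
//   cuvs::spectral::analyzeModularity(handle, matrix, n_clusters, d_clusters, modularity);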
This
+ * partition attempts to minimize the cost function:
+ * Cost = \f$\sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition)
+ *
+ * @param handle raft handle for managing expensive resources
+ * @param csr_m Weighted graph in CSR format
+ * @param eigen_solver Eigensolver implementation
+ * @param cluster_solver Cluster solver implementation
+ * @param clusters (Output, device memory, n entries) Partition
+ * assignments.
+ * @param eigVals Output eigenvalue array pointer on device
+ * @param eigVecs Output eigenvector array pointer on device
+ * @return statistics: number of eigensolver iterations, cluster solver
+ * residual, and number of cluster solver iterations.
+ */
+template
+std::tuple partition(
+  raft::resources const& handle,
+  raft::spectral::matrix::sparse_matrix_t const& csr_m,
+  EigenSolver const& eigen_solver,
+  ClusterSolver const& cluster_solver,
+  vertex_t* __restrict__ clusters,
+  weight_t* eigVals,
+  weight_t* eigVecs)
+{
+  return cuvs::spectral::detail::partition(
+    handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs);
+}
+
+// =========================================================
+// Analysis of graph partition
+// =========================================================
+
+/// Compute cost function for partition
+/** This function determines the edges cut by a partition and a cost
+ * function:
+ * Cost = \f$\sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition)
+ * Graph is assumed to be weighted and undirected.
+ *
+ * @param handle raft handle for managing expensive resources
+ * @param csr_m Weighted graph in CSR format
+ * @param nClusters Number of partitions.
+ * @param clusters (Input, device memory, n entries) Partition
+ * assignments.
+ * @param edgeCut On exit, weight of edges cut by partition.
+ * @param cost On exit, partition cost function.
+ */
+template
+void analyzePartition(raft::resources const& handle,
+                      raft::spectral::matrix::sparse_matrix_t const& csr_m,
+                      vertex_t nClusters,
+                      const vertex_t* __restrict__ clusters,
+                      weight_t& edgeCut,
+                      weight_t& cost)
+{
+  cuvs::spectral::detail::analyzePartition(
+    handle, csr_m, nClusters, clusters, edgeCut, cost);
+}
+
+}  // namespace spectral
+}  // namespace cuvs
+
+#endif
\ No newline at end of file
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 0ecac6ec2..9224e88d8 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -218,6 +218,7 @@ if(BUILD_TESTS)
     distance/dist_l_inf.cu
     distance/dist_lp_unexp.cu
     distance/dist_russell_rao.cu
+    distance/gram.cu
     distance/masked_nn.cu
     distance/sparse_distance.cu
     sparse/neighbors/cross_component_nn.cu
@@ -227,6 +228,11 @@ if(BUILD_TESTS)
     100
   )
 
+  ConfigureTest(
+    NAME SPARSE_TEST PATH sparse/cluster/cluster_solvers.cu sparse/cluster/eigen_solvers.cu
+    sparse/cluster/spectral.cu GPUS 1 PERCENT 100
+  )
+
   ConfigureTest(
     NAME PREPROCESSING_TEST PATH preprocessing/scalar_quantization.cu GPUS 1 PERCENT 100
   )
diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu
new file mode 100644
index 000000000..89b1525ea
--- /dev/null
+++ b/cpp/test/distance/gram.cu
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
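// The equivalent sketch for the min-cut path (same hypothetical names as in
// the modularity sketch above; analyzePartition mirrors the detail call shown
// earlier):
//
//   auto [eig_iters, km_residual, km_iters] = cuvs::spectral::partition(
//     handle, matrix, eig_solver, cluster_solver, d_clusters, d_eigvals, d_eigvecs);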
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" +#include "gram_base.cuh" + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuvs::distance::kernels { + +struct GramMatrixInputs { + int n1; // feature vectors in matrix 1 + int n2; // featuer vectors in matrix 2 + int n_cols; // number of elements in a feature vector + bool is_row_major; + KernelParams kernel; + int ld1; + int ld2; + int ld_out; + // We will generate random input using the dimensions given here. + // The reference output is calculated by a custom kernel. +}; + +std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p) +{ + std::vector kernel_names{"linear", "poly", "rbf", "tanh"}; + os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/" + << (p.is_row_major ? "RowMajor/" : "ColMajor/") << kernel_names[p.kernel.kernel] << "/ld_" + << p.ld1 << "x" << p.ld2 << "x" << p.ld_out; + return os; +} + +const std::vector inputs = { + {42, 137, 2, false, {KernelType::LINEAR}}, + {42, 137, 2, true, {KernelType::LINEAR}}, + {42, 137, 2, false, {KernelType::LINEAR}, 64, 179, 181}, + {42, 137, 2, true, {KernelType::LINEAR}, 64, 179, 181}, + {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, + {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, + {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 49}, + {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 143}, + {3, 4, 2, false, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, false, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, true, {KernelType::RBF, 0, 0.5}}, + // Distance kernel does not support LD parameter yet. + //{42, 137, 2, false, {KernelType::RBF, 0, 0.5}, 64, 155, 49}, + // {42, 137, 2, true, {KernelType::RBF, 0, 0.5}, 64, 155, 143}, +}; + +template +class GramMatrixTest : public ::testing::TestWithParam { + protected: + GramMatrixTest() + : params(GetParam()), + handle(), + x1(0, raft::resource::get_cuda_stream(handle)), + x2(0, raft::resource::get_cuda_stream(handle)), + gram(0, raft::resource::get_cuda_stream(handle)), + gram_host(0) + { + auto stream = raft::resource::get_cuda_stream(handle); + + if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; } + if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; } + if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; } + // Derive the size of the output from the offset of the last element. 
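// e.g. for column-major data with n1 = 3, n_cols = 2, and a padded ld1 = 4,
// the last element [2, 1] sits at offset 2 + 1 * 4 = 6, so 7 elements are
// allocated rather than n1 * n_cols = 6.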
+ size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1; + x1.resize(size, stream); + size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1; + x2.resize(size, stream); + size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1; + + gram.resize(size, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream)); + gram_host.resize(gram.size()); + std::fill(gram_host.begin(), gram_host.end(), 0); + + raft::random::RngState rng(42137ULL); + raft::random::uniform(handle, rng, x1.data(), x1.size(), math_t(0), math_t(1)); + raft::random::uniform(handle, rng, x2.data(), x2.size(), math_t(0), math_t(1)); + } + + ~GramMatrixTest() override {} + + void runTest() + { + std::unique_ptr> kernel = + std::unique_ptr>(KernelFactory::create(params.kernel)); + + auto x1_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1) + : raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1); + auto x2_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2) + : raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2); + auto out_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out) + : raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out); + + (*kernel)(handle, x1_span, x2_span, out_span); + + auto stream = raft::resource::get_cuda_stream(handle); + naiveGramMatrixKernel(params.n1, + params.n2, + params.n_cols, + x1, + x2, + gram_host.data(), + params.ld1, + params.ld2, + params.ld_out, + params.is_row_major, + params.kernel, + stream, + handle); + + ASSERT_TRUE(cuvs::devArrMatchHost( + gram_host.data(), gram.data(), gram.size(), cuvs::CompareApprox(1e-6f), stream)); + } + + GramMatrixInputs params; + raft::resources handle; + + rmm::device_uvector x1; + rmm::device_uvector x2; + rmm::device_uvector gram; + + std::vector gram_host; +}; + +typedef GramMatrixTest GramMatrixTestFloat; +typedef GramMatrixTest GramMatrixTestDouble; + +TEST_P(GramMatrixTestFloat, Gram) { runTest(); } + +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloat, ::testing::ValuesIn(inputs)); +}; // namespace cuvs::distance::kernels \ No newline at end of file diff --git a/cpp/test/distance/gram_base.cuh b/cpp/test/distance/gram_base.cuh new file mode 100644 index 000000000..326cdb4f8 --- /dev/null +++ b/cpp/test/distance/gram_base.cuh @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace kernels { + +// Get the offset of element [i,k]. 
+HDI int get_offset(int i, int k, int ld, bool is_row_major) +{ + return is_row_major ? i * ld + k : i + k * ld; +} + +// Calculate the Gram matrix on the host. +template +void naiveGramMatrixKernel(int n1, + int n2, + int n_cols, + const rmm::device_uvector& x1, + const rmm::device_uvector& x2, + math_t* gram_host, + int ld1, + int ld2, + int ld_out, + bool is_row_major, + KernelParams kernel, + cudaStream_t stream, + const raft::resources& handle) +{ + std::vector x1_host(x1.size()); + raft::update_host(x1_host.data(), x1.data(), x1.size(), stream); + std::vector x2_host(x2.size()); + raft::update_host(x2_host.data(), x2.data(), x2.size(), stream); + raft::resource::sync_stream(handle, stream); + + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + float d = 0; + for (int k = 0; k < n_cols; k++) { + if (kernel.kernel == KernelType::RBF) { + math_t diff = x1_host[get_offset(i, k, ld1, is_row_major)] - + x2_host[get_offset(j, k, ld2, is_row_major)]; + d += diff * diff; + } else { + d += x1_host[get_offset(i, k, ld1, is_row_major)] * + x2_host[get_offset(j, k, ld2, is_row_major)]; + } + } + int idx = get_offset(i, j, ld_out, is_row_major); + math_t v = 0; + switch (kernel.kernel) { + case (KernelType::LINEAR): gram_host[idx] = d; break; + case (KernelType::POLYNOMIAL): + v = kernel.gamma * d + kernel.coef0; + gram_host[idx] = std::pow(v, kernel.degree); + break; + case (KernelType::TANH): gram_host[idx] = std::tanh(kernel.gamma * d + kernel.coef0); break; + case (KernelType::RBF): gram_host[idx] = exp(-kernel.gamma * d); break; + } + } + } +} + +} // namespace kernels +} // namespace distance +} // namespace cuvs \ No newline at end of file diff --git a/cpp/test/sparse/cluster/cluster_solvers.cu b/cpp/test/sparse/cluster/cluster_solvers.cu new file mode 100644 index 000000000..c0b6c1a78 --- /dev/null +++ b/cpp/test/sparse/cluster/cluster_solvers.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
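// Spot-check of the RBF branch above (host-side, a sketch): for rows
// x1 = {1, 2} and x2 = {2, 4} with gamma = 0.5,
//
//   double d     = (1.0 - 2.0) * (1.0 - 2.0) + (2.0 - 4.0) * (2.0 - 4.0);  // 5.0
//   double k_rbf = std::exp(-0.5 * d);                                     // exp(-2.5) ≈ 0.0821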
+ */ + +#include "../../../src/sparse/cluster/cluster_solvers.cuh" +#include "../../../src/sparse/cluster/eigen_solvers.cuh" +#include "../../../src/sparse/cluster/modularity_maximization.cuh" +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace spectral { + +TEST(Raft, ClusterSolvers) +{ + using namespace raft::spectral::matrix; + using index_type = int; + using value_type = double; + + raft::resources h; + + index_type maxiter{100}; + value_type tol{1.0e-10}; + unsigned long long seed{100110021003}; + + auto stream = raft::resource::get_cuda_stream(h); + + index_type n{100}; + index_type d{10}; + index_type k{5}; + + // nullptr expected to trigger exceptions: + // + value_type* eigvecs{nullptr}; + index_type* codes{nullptr}; + + cluster_solver_config_t cfg{k, maxiter, tol, seed}; + + kmeans_solver_t cluster_solver{cfg}; + + EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes)); +} + +TEST(Raft, ModularitySolvers) +{ + using namespace raft::spectral::matrix; + using index_type = int; + using value_type = double; + + raft::resources h; + ASSERT_EQ(0, raft::resource::get_device_id(h)); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + // nullptr expected to trigger exceptions: + // + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + auto stream = raft::resource::get_cuda_stream(h); + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; + + EXPECT_ANY_THROW(cuvs::spectral::modularity_maximization( + h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + + value_type modularity{0}; + EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity)); +} + +} // namespace spectral +} // namespace cuvs diff --git a/cpp/test/sparse/cluster/eigen_solvers.cu b/cpp/test/sparse/cluster/eigen_solvers.cu new file mode 100644 index 000000000..8de0b49e7 --- /dev/null +++ b/cpp/test/sparse/cluster/eigen_solvers.cu @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "../../../src/sparse/cluster/eigen_solvers.cuh"
+#include "../../../src/sparse/cluster/partition.cuh"
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace cuvs {
+namespace spectral {
+
+TEST(Raft, EigenSolvers)
+{
+  raft::common::nvtx::range fun_scope("test::EigenSolvers");
+  using namespace raft::spectral::matrix;
+  using index_type = int;
+  using value_type = double;
+
+  raft::resources h;
+  ASSERT_EQ(0, raft::resource::get_device_id(h));
+
+  index_type* ro{nullptr};
+  index_type* ci{nullptr};
+  value_type* vs{nullptr};
+  index_type nnz   = 0;
+  index_type nrows = 0;
+
+  sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz};
+  ASSERT_EQ(nullptr, sm1.row_offsets_);
+
+  index_type neigvs{10};
+  index_type maxiter{100};
+  index_type restart_iter{10};
+  value_type tol{1.0e-10};
+  bool reorthog{true};
+
+  // nullptr expected to trigger exceptions:
+  //
+  value_type* eigvals{nullptr};
+  value_type* eigvecs{nullptr};
+  std::uint64_t seed{100110021003};
+
+  eigen_solver_config_t cfg{
+    neigvs, maxiter, restart_iter, tol, reorthog, seed};
+
+  lanczos_solver_t eig_solver{cfg};
+
+  EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs));
+
+  EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs));
+}
+
+TEST(Raft, SpectralSolvers)
+{
+  raft::common::nvtx::range fun_scope("test::SpectralSolvers");
+  using namespace raft::spectral::matrix;
+  using index_type = int;
+  using value_type = double;
+
+  raft::resources h;
+  ASSERT_EQ(0, raft::resource::get_device_id(h));
+
+  index_type neigvs{10};
+  index_type maxiter{100};
+  index_type restart_iter{10};
+  value_type tol{1.0e-10};
+  bool reorthog{true};
+
+  // nullptr expected to trigger exceptions:
+  //
+  index_type* clusters{nullptr};
+  value_type* eigvals{nullptr};
+  value_type* eigvecs{nullptr};
+
+  unsigned long long seed{100110021003};
+
+  eigen_solver_config_t eig_cfg{
+    neigvs, maxiter, restart_iter, tol, reorthog, seed};
+  lanczos_solver_t eig_solver{eig_cfg};
+
+  index_type k{5};
+
+  cluster_solver_config_t clust_cfg{k, maxiter, tol, seed};
+  kmeans_solver_t cluster_solver{clust_cfg};
+
+  sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0};
+  EXPECT_ANY_THROW(
+    spectral::partition(h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
+
+  value_type edgeCut{0};
+  value_type cost{0};
+  EXPECT_ANY_THROW(spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost));
+}
+
+}  // namespace spectral
+}  // namespace cuvs
diff --git a/cpp/test/sparse/cluster/spectral.cu b/cpp/test/sparse/cluster/spectral.cu
new file mode 100644
index 000000000..7d0cdef9d
--- /dev/null
+++ b/cpp/test/sparse/cluster/spectral.cu
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "../../test_utils.cuh" + +#include "../../../src/sparse/cluster/modularity_maximization.cuh" +#include "../../../src/sparse/cluster/partition.cuh" +#include + +#include + +#include +#include + +namespace cuvs { +namespace cluster { + +/** + * Warning: There appears to be a CUDA 12.2 bug in cusparse that causes an + * alignment issue. We've fixed the bug in our code through a workaround + * (see raft/sparse/linalg/spmm.hpp for fix). This test is meant to fail + * in the case where the fix is accidentally reverted, so that it doesn't + * break any downstream libraries that depend on RAFT + */ +TEST(Raft, Spectral) +{ + raft::handle_t handle; + + std::vector h_offsets({0, 2, 4, 7, 10, 12, 14}); + std::vector h_indices({1, 2, 0, 2, 0, 1, 3, 2, 4, 5, 3, 5, 3, 4}); + std::vector h_values( + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + std::vector expected_clustering({1, 1, 1, 0, 0, 0}); + + int32_t n_clusters{2}; + int32_t n_eigenvectors{2}; + int32_t evs_max_it{100}; + int32_t kmean_max_it{100}; + int32_t restartIter_lanczos = 15 + n_eigenvectors; + float evs_tol{0.001}; + float kmean_tol{0.001}; + unsigned long long seed1{1234567}; + unsigned long long seed2{12345678}; + bool reorthog{false}; + + rmm::device_uvector offsets(h_offsets.size(), handle.get_stream()); + rmm::device_uvector indices(h_indices.size(), handle.get_stream()); + rmm::device_uvector values(h_indices.size(), handle.get_stream()); + rmm::device_uvector clustering(expected_clustering.size(), handle.get_stream()); + rmm::device_uvector eigenvalues(n_eigenvectors, handle.get_stream()); + rmm::device_uvector eigenvectors(n_eigenvectors * expected_clustering.size(), + handle.get_stream()); + + rmm::device_uvector exp_dev(expected_clustering.size(), handle.get_stream()); + + raft::update_device( + exp_dev.data(), expected_clustering.data(), expected_clustering.size(), handle.get_stream()); + + raft::update_device(offsets.data(), h_offsets.data(), h_offsets.size(), handle.get_stream()); + raft::update_device(indices.data(), h_indices.data(), h_indices.size(), handle.get_stream()); + raft::update_device(values.data(), h_values.data(), h_values.size(), handle.get_stream()); + + raft::spectral::matrix::sparse_matrix_t const matrix{ + handle, + offsets.data(), + indices.data(), + values.data(), + static_cast(offsets.size() - 1), + static_cast(indices.size())}; + + cuvs::spectral::eigen_solver_config_t eig_cfg{ + n_eigenvectors, evs_max_it, restartIter_lanczos, evs_tol, reorthog, seed1}; + cuvs::spectral::lanczos_solver_t eig_solver{eig_cfg}; + + cuvs::spectral::cluster_solver_config_t clust_cfg{ + n_clusters, kmean_max_it, kmean_tol, seed2}; + cuvs::spectral::kmeans_solver_t cluster_solver{clust_cfg}; + + cuvs::spectral::partition(handle, + matrix, + eig_solver, + cluster_solver, + clustering.data(), + eigenvalues.data(), + eigenvectors.data()); + + ASSERT_TRUE(devArrMatch(expected_clustering.data(), + exp_dev.data(), + exp_dev.size(), + 1, + cuvs::Compare(), + handle.get_stream())); +} + +} // namespace cluster +} // namespace cuvs \ No newline at end of file diff --git a/cpp/test/sparse/cluster/spectral_matrix.cu b/cpp/test/sparse/cluster/spectral_matrix.cu new file mode 100644 index 000000000..37a4202b8 --- /dev/null +++ b/cpp/test/sparse/cluster/spectral_matrix.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
diff --git a/cpp/test/sparse/cluster/spectral_matrix.cu b/cpp/test/sparse/cluster/spectral_matrix.cu
new file mode 100644
index 000000000..37a4202b8
--- /dev/null
+++ b/cpp/test/sparse/cluster/spectral_matrix.cu
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+namespace cuvs {
+namespace spectral {
+namespace matrix {
+namespace {
+template
+struct csr_view_t {
+  index_type* offsets;
+  index_type* indices;
+  value_type* edge_data;
+  index_type number_of_vertices;
+  index_type number_of_edges;
+};
+} // namespace
+TEST(Raft, SpectralMatrices)
+{
+  using index_type = int;
+  using value_type = double;
+  using namespace raft::spectral::matrix;
+
+  raft::resources h;
+  ASSERT_EQ(0, raft::resource::get_device_id(h));
+
+  csr_view_t csr_v{nullptr, nullptr, nullptr, 0, 0};
+
+  int const sz = 10;
+  vector_t d_v{h, sz};
+
+  index_type* ro{nullptr};
+  index_type* ci{nullptr};
+  value_type* vs{nullptr};
+  index_type nnz = 0;
+  index_type nrows = 0;
+  sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz};
+  sparse_matrix_t sm2{h, csr_v};
+  ASSERT_EQ(nullptr, sm1.row_offsets_);
+  ASSERT_EQ(nullptr, sm2.row_offsets_);
+
+  auto stream = raft::resource::get_cuda_stream(h);
+
+  auto cnstr_lm1 = [&h, ro, ci, vs, nrows, nnz](void) {
+    laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz};
+  };
+  EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args
+
+  auto cnstr_lm2 = [&h, &sm2](void) { laplacian_matrix_t lm2{h, sm2}; };
+  EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args
+
+  auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) {
+    modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz};
+  };
+  EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args
+
+  auto cnstr_mm2 = [&h, &sm2](void) { modularity_matrix_t mm2{h, sm2}; };
+  EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args
+}
+
+} // namespace matrix
+} // namespace spectral
+} // namespace cuvs
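
The constructors above are expected to throw because building the wrappers
touches the (here nullptr) CSR arrays, e.g. to accumulate the degree vector.
With a populated matrix the same constructors succeed; a minimal sketch,
assuming `sm` wraps a valid CSR adjacency matrix:

  raft::spectral::matrix::laplacian_matrix_t<int, double> lm{h, sm};   // L = D - A
  raft::spectral::matrix::modularity_matrix_t<int, double> mm{h, sm};  // B = A - d d^T / (2m)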
diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu
new file mode 100644
index 000000000..d7af30a1c
--- /dev/null
+++ b/cpp/test/sparse/gram.cu
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include "../distance/gram_base.cuh"
+#include "../test_utils.cuh"
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include
+#include
+
+namespace cuvs::distance::kernels::sparse {
+
+/**
+ * Structure describing the sparsity of the input matrices:
+ * - DENSE: dense, dense
+ * - MIX: CSR, dense
+ * - CSR: CSR, CSR
+ */
+enum SparseType { DENSE, MIX, CSR };
+
+struct GramMatrixInputs {
+  int n1;      // feature vectors in matrix 1
+  int n2;      // feature vectors in matrix 2
+  int n_cols;  // number of elements in a feature vector
+  bool is_row_major;
+  SparseType sparse_input;
+  KernelParams kernel;
+  int ld1;
+  int ld2;
+  int ld_out;
+  // We will generate random input using the dimensions given here.
+  // The reference output is calculated by a custom kernel.
+};
+
+std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p)
+{
+  std::vector kernel_names{"linear", "poly", "rbf", "tanh"};
+  os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/"
+     << (p.is_row_major ? "RowMajor/" : "ColMajor/")
+     << (p.sparse_input == SparseType::DENSE
+           ? "DenseDense/"
+           : (p.sparse_input == SparseType::MIX ? "CsrDense/" : "CsrCsr/"))
+     << kernel_names[p.kernel.kernel] << "/ld_" << p.ld1 << "x" << p.ld2 << "x" << p.ld_out;
+  return os;
+}
+
+/*struct KernelParams {
+  // Kernel function parameters
+  KernelType kernel;  //!< Type of the kernel function
+  int degree;         //!< Degree of polynomial kernel (ignored by others)
+  double gamma;       //!< multiplier in the
+  double coef0;       //!< additive constant in poly and tanh kernels
+};*/
+
+// const KernelParams linear_kernel_params{.kernel=KernelType::LINEAR};
+
+// {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, {KernelType::TANH, 0, 0.5, 2.4}, {KernelType::RBF, 0, 0.5}
+const std::vector inputs = raft::util::itertools::product(
+  {42},
+  {137},
+  {2},
+  {true, false},
+  {SparseType::DENSE, SparseType::MIX, SparseType::CSR},
+  {KernelParams{KernelType::LINEAR},
+   KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4},
+   KernelParams{KernelType::TANH, 0, 0.5, 2.4},
+   KernelParams{KernelType::RBF, 0, 0.5}});
+
+// (ld_1, ld_2, ld_out) not supported by RBF and CSR
+const std::vector inputs_ld = raft::util::itertools::product(
+  {137},
+  {42},
+  {2},
+  {true, false},
+  {SparseType::DENSE, SparseType::MIX},
+  {KernelParams{KernelType::LINEAR},
+   KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4},
+   KernelParams{KernelType::TANH, 0, 0.5, 2.4}},
+  {159},
+  {73},
+  {144});
+
+// (ld_1, ld_2) are supported by CSR
+const std::vector inputs_ld_csr =
+  raft::util::itertools::product(
+    {42},
+    {137},
+    {2},
+    {true, false},
+    {SparseType::CSR, SparseType::MIX},
+    {KernelParams{KernelType::LINEAR},
+     KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4},
+     KernelParams{KernelType::TANH, 0, 0.5, 2.4}},
+    {64},
+    {155},
+    {0});
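+
+// With itertools::product, each brace-enclosed list above contributes one axis
+// of the parameter grid: `inputs` expands to 2 * 3 * 4 = 24 cases, while
+// `inputs_ld` and `inputs_ld_csr` each expand to 2 * 2 * 3 = 12 cases with
+// explicit leading dimensions.
+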
+template
+class GramMatrixTest : public ::testing::TestWithParam {
+ protected:
+  GramMatrixTest()
+    : params(GetParam()),
+      stream(raft::resource::get_cuda_stream(handle)),
+      x1(0, stream),
+      x2(0, stream),
+      x1_csr_indptr(0, stream),
+      x1_csr_indices(0, stream),
+      x1_csr_data(0, stream),
+      x2_csr_indptr(0, stream),
+      x2_csr_indices(0, stream),
+      x2_csr_data(0, stream),
+      gram(0, stream),
+      gram_host(0)
+  {
+    if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; }
+    if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; }
+    if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; }
+    // Derive each buffer size from the offset of its last element.
+    size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1;
+    x1.resize(size, stream);
+    size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1;
+    x2.resize(size, stream);
+    size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1;
+
+    gram.resize(size, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream));
+    gram_host.resize(gram.size());
+    std::fill(gram_host.begin(), gram_host.end(), 0);
+
+    raft::random::RngState r(42137ULL);
+    raft::random::uniform(handle, r, x1.data(), x1.size(), math_t(0), math_t(1));
+    raft::random::uniform(handle, r, x2.data(), x2.size(), math_t(0), math_t(1));
+
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+  ~GramMatrixTest() override {}
+
+  int prepareCsr(math_t* dense, int n_rows, int ld, int* indptr, int* indices, math_t* data)
+  {
+    int nnz = 0;
+    double eps = 1e-6;
+    int n_cols = params.n_cols;
+    bool is_row_major = params.is_row_major;
+    size_t dense_size = get_offset(n_rows - 1, n_cols - 1, ld, is_row_major) + 1;
+
+    std::vector dense_host(dense_size);
+    raft::update_host(dense_host.data(), dense, dense_size, stream);
+    raft::resource::sync_stream(handle, stream);
+
+    std::vector indptr_host(n_rows + 1);
+    std::vector indices_host(n_rows * n_cols);
+    std::vector data_host(n_rows * n_cols);
+
+    // create csr matrix from dense (with threshold)
+    for (int i = 0; i < n_rows; ++i) {
+      indptr_host[i] = nnz;
+      for (int j = 0; j < n_cols; ++j) {
+        math_t value = dense_host[get_offset(i, j, ld, is_row_major)];
+        if (value > eps) {
+          indices_host[nnz] = j;
+          data_host[nnz] = value;
+          nnz++;
+        }
+      }
+    }
+    indptr_host[n_rows] = nnz;
+
+    // fill back dense matrix from CSR
+    std::fill(dense_host.data(), dense_host.data() + dense_size, 0);
+    for (int i = 0; i < n_rows; ++i) {
+      for (int idx = indptr_host[i]; idx < indptr_host[i + 1]; ++idx) {
+        dense_host[get_offset(i, indices_host[idx], ld, is_row_major)] = data_host[idx];
+      }
+    }
+
+    raft::update_device(dense, dense_host.data(), dense_size, stream);
+    raft::update_device(indptr, indptr_host.data(), n_rows + 1, stream);
+    raft::update_device(indices, indices_host.data(), nnz, stream);
+    raft::update_device(data, data_host.data(), nnz, stream);
+    raft::resource::sync_stream(handle, stream);
+    return nnz;
+  }
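+
+  // Note: prepareCsr() drops values <= eps from the CSR arrays *and* zeroes
+  // them in the dense copy it writes back, so the dense and CSR code paths in
+  // runTest() below operate on exactly the same matrix. get_offset() (from
+  // gram_base.cuh) maps (row, col, ld) to the strided linear index.
+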
+  void runTest()
+  {
+    std::unique_ptr> kernel =
+      std::unique_ptr>(KernelFactory::create(params.kernel));
+
+    auto x1_span =
+      params.is_row_major
+        ? raft::make_device_strided_matrix_view(
+            x1.data(), params.n1, params.n_cols, params.ld1)
+        : raft::make_device_strided_matrix_view(
+            x1.data(), params.n1, params.n_cols, params.ld1);
+    auto x2_span =
+      params.is_row_major
+        ? raft::make_device_strided_matrix_view(
+            x2.data(), params.n2, params.n_cols, params.ld2)
+        : raft::make_device_strided_matrix_view(
+            x2.data(), params.n2, params.n_cols, params.ld2);
+    auto out_span =
+      params.is_row_major
+        ? raft::make_device_strided_matrix_view(
+            gram.data(), params.n1, params.n2, params.ld_out)
+        : raft::make_device_strided_matrix_view(
+            gram.data(), params.n1, params.n2, params.ld_out);
+
+    if (params.sparse_input == SparseType::DENSE) {
+      (*kernel)(handle, x1_span, x2_span, out_span);
+    } else {
+      x1_csr_indptr.reserve(params.n1 + 1, stream);
+      x1_csr_indices.reserve(params.n1 * params.n_cols, stream);
+      x1_csr_data.reserve(params.n1 * params.n_cols, stream);
+      int x1_nnz = prepareCsr(x1.data(),
+                              params.n1,
+                              params.ld1,
+                              x1_csr_indptr.data(),
+                              x1_csr_indices.data(),
+                              x1_csr_data.data());
+
+      auto x1_csr_structure = raft::make_device_compressed_structure_view(
+        x1_csr_indptr.data(), x1_csr_indices.data(), params.n1, params.n_cols, x1_nnz);
+      auto x1_csr = raft::device_csr_matrix_view(
+        raft::device_span(x1_csr_data.data(), x1_csr_structure.get_nnz()),
+        x1_csr_structure);
+
+      if (params.sparse_input == SparseType::MIX) {
+        (*kernel)(handle, x1_csr, x2_span, out_span);
+      } else {
+        x2_csr_indptr.reserve(params.n2 + 1, stream);
+        x2_csr_indices.reserve(params.n2 * params.n_cols, stream);
+        x2_csr_data.reserve(params.n2 * params.n_cols, stream);
+        int x2_nnz = prepareCsr(x2.data(),
+                                params.n2,
+                                params.ld2,
+                                x2_csr_indptr.data(),
+                                x2_csr_indices.data(),
+                                x2_csr_data.data());
+
+        auto x2_csr_structure = raft::make_device_compressed_structure_view(
+          x2_csr_indptr.data(), x2_csr_indices.data(), params.n2, params.n_cols, x2_nnz);
+        auto x2_csr = raft::device_csr_matrix_view(
+          raft::device_span(x2_csr_data.data(), x2_csr_structure.get_nnz()),
+          x2_csr_structure);
+
+        (*kernel)(handle, x1_csr, x2_csr, out_span);
+      }
+    }
+    // Some of the gram computation may run on a stream other than 'stream',
+    // so a full device sync is required.
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    naiveGramMatrixKernel(params.n1,
+                          params.n2,
+                          params.n_cols,
+                          x1,
+                          x2,
+                          gram_host.data(),
+                          params.ld1,
+                          params.ld2,
+                          params.ld_out,
+                          params.is_row_major,
+                          params.kernel,
+                          stream,
+                          handle);
+    raft::resource::sync_stream(handle, stream);
+
+    ASSERT_TRUE(cuvs::devArrMatchHost(
+      gram_host.data(), gram.data(), gram.size(), cuvs::CompareApprox(1e-6f), stream));
+  }
+
+  raft::resources handle;
+  cudaStream_t stream = 0;
+  GramMatrixInputs params;
+
+  rmm::device_uvector x1;
+  rmm::device_uvector x2;
+
+  rmm::device_uvector x1_csr_indptr;
+  rmm::device_uvector x1_csr_indices;
+  rmm::device_uvector x1_csr_data;
+  rmm::device_uvector x2_csr_indptr;
+  rmm::device_uvector x2_csr_indices;
+  rmm::device_uvector x2_csr_data;
+
+  rmm::device_uvector gram;
+  std::vector gram_host;
+};
+
+typedef GramMatrixTest GramMatrixTestFloatStandard;
+typedef GramMatrixTest GramMatrixTestFloatLd;
+typedef GramMatrixTest GramMatrixTestFloatLdCsr;
+
+TEST_P(GramMatrixTestFloatStandard, Gram) { runTest(); }
+TEST_P(GramMatrixTestFloatLd, Gram) { runTest(); }
+TEST_P(GramMatrixTestFloatLdCsr, Gram) { runTest(); }
+
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloatStandard, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloatLd, ::testing::ValuesIn(inputs_ld));
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests,
+                         GramMatrixTestFloatLdCsr,
+                         ::testing::ValuesIn(inputs_ld_csr));
+} // namespace cuvs::distance::kernels::sparse
\ No newline at end of file
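
Taken together, the tests drive the gram API the way downstream code would; a
minimal dense-only sketch, assuming the factory and base class names
(`KernelFactory`, `GramMatrixBase`) carry over from the RAFT originals, with
hypothetical device buffers `d_x1` (n1 x n_cols), `d_x2` (n2 x n_cols) and
`d_out` (n1 x n2), all row-major:

  raft::resources handle;
  auto kernel = std::unique_ptr<GramMatrixBase<float>>(
    KernelFactory<float>::create(KernelParams{KernelType::RBF, 0, 0.5}));
  auto x1 = raft::make_device_strided_matrix_view<const float, int, raft::layout_c_contiguous>(
    d_x1, n1, n_cols, n_cols);
  auto x2 = raft::make_device_strided_matrix_view<const float, int, raft::layout_c_contiguous>(
    d_x2, n2, n_cols, n_cols);
  auto out = raft::make_device_strided_matrix_view<float, int, raft::layout_c_contiguous>(
    d_out, n1, n2, n2);
  (*kernel)(handle, x1, x2, out);  // out(i, j) = k(x1_i, x2_j)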