From 007c3d2c9efddec6dc46549a5aa2e9f48d4d1612 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 5 Dec 2024 15:48:41 -0500 Subject: [PATCH] Moving spectral embedding and kernel gramm APIs to cuVS (#463) Partially addresses #455 Authors: - Corey J. Nolet (https://github.com/cjnolet) - Ben Frederickson (https://github.com/benfred) Approvers: - Ben Frederickson (https://github.com/benfred) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cuvs/pull/463 --- build.sh | 12 +- cpp/CMakeLists.txt | 4 + cpp/include/cuvs/cluster/agglomerative.hpp | 1 + cpp/include/cuvs/distance/grammian.hpp | 665 +++++++++++++++ cpp/include/cuvs/embed/spectral.hpp | 40 + .../distance/detail/kernels/gram_matrix.cu | 481 +++++++++++ .../distance/detail/kernels/gram_matrix.cuh | 488 ----------- .../distance/detail/kernels/kernel_factory.cu | 61 ++ .../detail/kernels/kernel_factory.cuh | 65 -- .../detail/kernels/kernel_matrices.cu | 726 ++++++++++++++++ .../detail/kernels/kernel_matrices.cuh | 777 ------------------ .../distance/detail/kernels/rbf_fin_op.cuh | 4 +- .../detail/pairwise_matrix/dispatch-ext.cuh | 4 +- .../detail/pairwise_matrix/dispatch_rbf.cu | 6 +- cpp/src/distance/distance-ext.cuh | 4 +- cpp/src/distance/distance.cu | 4 +- cpp/src/embed/spectral.cu | 53 ++ cpp/src/sparse/cluster/cluster_solvers.cuh | 100 +++ cpp/src/sparse/cluster/detail/spectral.cuh | 111 +++ .../spectral/modularity_maximization.hpp | 176 ++++ .../cluster/detail/spectral/partition.hpp | 188 +++++ .../cluster/detail/spectral/spectral_util.cuh | 181 ++++ cpp/src/sparse/cluster/eigen_solvers.cuh | 107 +++ .../cluster/modularity_maximization.cuh | 86 ++ cpp/src/sparse/cluster/partition.cuh | 95 +++ cpp/test/CMakeLists.txt | 6 + cpp/test/distance/gram.cu | 174 ++++ cpp/test/distance/gram_base.cuh | 91 ++ cpp/test/sparse/cluster/cluster_solvers.cu | 105 +++ cpp/test/sparse/cluster/eigen_solvers.cu | 119 +++ cpp/test/sparse/cluster/spectral.cu | 109 +++ cpp/test/sparse/cluster/spectral_matrix.cu | 84 ++ cpp/test/sparse/gram.cu | 330 ++++++++ 33 files changed, 4105 insertions(+), 1352 deletions(-) create mode 100644 cpp/include/cuvs/distance/grammian.hpp create mode 100644 cpp/include/cuvs/embed/spectral.hpp create mode 100644 cpp/src/distance/detail/kernels/gram_matrix.cu delete mode 100644 cpp/src/distance/detail/kernels/gram_matrix.cuh create mode 100644 cpp/src/distance/detail/kernels/kernel_factory.cu delete mode 100644 cpp/src/distance/detail/kernels/kernel_factory.cuh create mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.cu delete mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.cuh create mode 100644 cpp/src/embed/spectral.cu create mode 100644 cpp/src/sparse/cluster/cluster_solvers.cuh create mode 100644 cpp/src/sparse/cluster/detail/spectral.cuh create mode 100644 cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp create mode 100644 cpp/src/sparse/cluster/detail/spectral/partition.hpp create mode 100644 cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh create mode 100644 cpp/src/sparse/cluster/eigen_solvers.cuh create mode 100644 cpp/src/sparse/cluster/modularity_maximization.cuh create mode 100644 cpp/src/sparse/cluster/partition.cuh create mode 100644 cpp/test/distance/gram.cu create mode 100644 cpp/test/distance/gram_base.cuh create mode 100644 cpp/test/sparse/cluster/cluster_solvers.cu create mode 100644 cpp/test/sparse/cluster/eigen_solvers.cu create mode 100644 cpp/test/sparse/cluster/spectral.cu 
 create mode 100644 cpp/test/sparse/cluster/spectral_matrix.cu
 create mode 100644 cpp/test/sparse/gram.cu

diff --git a/build.sh b/build.sh
index c08c2900e..bd5fa649b 100755
--- a/build.sh
+++ b/build.sh
@@ -76,8 +76,8 @@ BUILD_REPORT_METRICS=""
 BUILD_REPORT_INCL_CACHE_STATS=OFF
 BUILD_SHARED_LIBS=ON
 
-TEST_TARGETS="NEIGHBORS_ANN_CAGRA_TEST"
-ANN_BENCH_TARGETS="CUVS_ANN_BENCH_ALL"
+TEST_TARGETS=""
+ANN_BENCH_TARGETS=""
 
 CACHE_ARGS=""
 NVTX=ON
@@ -273,14 +273,6 @@ fi
 if hasArg tests || (( ${NUMARGS} == 0 )); then
 BUILD_TESTS=ON
 CMAKE_TARGET="${CMAKE_TARGET};${TEST_TARGETS}"
-
- # Force compile library when needed test targets are specified
- if [[ $CMAKE_TARGET == *"CAGRA_C_TEST"* || \
- $CMAKE_TARGET == *"INTEROP_TEST"* || \
- $CMAKE_TARGET == *"NEIGHBORS_ANN_CAGRA_TEST"* ]]; then
- echo "-- Enabling compiled lib for gtests"
- COMPILE_LIBRARY=ON
- fi
 fi
 
 if hasArg bench-ann || (( ${NUMARGS} == 0 )); then
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 199bb232d..95fb7e63b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -324,6 +324,9 @@ if(BUILD_SHARED_LIBS)
 src/cluster/kmeans_transform_float.cu
 src/cluster/single_linkage_float.cu
 src/core/bitset.cu
+ src/distance/detail/kernels/gram_matrix.cu
+ src/distance/detail/kernels/kernel_factory.cu
+ src/distance/detail/kernels/kernel_matrices.cu
 src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
 src/distance/detail/pairwise_matrix/dispatch_canberra_half_float_float_int.cu
 src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
@@ -370,6 +373,7 @@ if(BUILD_SHARED_LIBS)
 src/distance/distance.cu
 src/distance/pairwise_distance.cu
 src/distance/sparse_distance.cu
+ src/embed/spectral.cu
 src/neighbors/brute_force.cu
 src/neighbors/brute_force_serialize.cu
 src/neighbors/cagra_build_float.cu
diff --git a/cpp/include/cuvs/cluster/agglomerative.hpp b/cpp/include/cuvs/cluster/agglomerative.hpp
index e1da04085..8f7e8675a 100644
--- a/cpp/include/cuvs/cluster/agglomerative.hpp
+++ b/cpp/include/cuvs/cluster/agglomerative.hpp
@@ -18,6 +18,7 @@
 
 #include
 #include
+
 #include
 #include
diff --git a/cpp/include/cuvs/distance/grammian.hpp b/cpp/include/cuvs/distance/grammian.hpp
new file mode 100644
index 000000000..0c904d493
--- /dev/null
+++ b/cpp/include/cuvs/distance/grammian.hpp
@@ -0,0 +1,665 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cuvs::distance::kernels {
+
+template <typename math_t>
+using dense_input_matrix_view_t = raft::device_matrix_view<const math_t, int, raft::layout_stride>;
+template <typename math_t>
+using dense_output_matrix_view_t = raft::device_matrix_view<math_t, int, raft::layout_stride>;
+template <typename math_t>
+using csr_input_matrix_view_t = raft::device_csr_matrix_view<const math_t, int, int, int>;
+
+/**
+ * Base class for general Gram matrices.
+ * A Gram matrix is the Hermitian matrix of inner products G_ik = <x_i, x_k>.
+ * Here, the inner product is evaluated for all elements from vector sets X1
+ * and X2.
+ *
+ * To be more precise, on exit the output buffer will store:
+ * - if is_row_major == true: out[j+k*n1] = <x1_j, x2_k>,
+ * - if is_row_major == false: out[j*n2 + k] = <x1_j, x2_k>,
+ * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector
+ * from the x2 set.
+ */
+template <typename math_t>
+class GramMatrixBase {
+ protected:
+ cublasHandle_t cublas_handle;
+ bool legacy_interface;
+
+ public:
+ GramMatrixBase() : legacy_interface(false){};
+ [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle)
+ : cublas_handle(cublas_handle), legacy_interface(true){};
+
+ virtual ~GramMatrixBase(){};
+
+ /** Convenience function to evaluate the Gram matrix for two vector sets.
+ * Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+ void operator()(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1 = nullptr,
+ math_t* norm_x2 = nullptr);
+
+ /** Convenience function to evaluate the Gram matrix for two vector sets.
+ * Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+ void operator()(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1 = nullptr,
+ math_t* norm_x2 = nullptr);
+
+ /** Convenience function to evaluate the Gram matrix for two vector sets.
+ * Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+ void operator()(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ csr_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1 = nullptr,
+ math_t* norm_x2 = nullptr);
+
+ // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual
+
+ /** Evaluate the Gram matrix for two vector sets using simple dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ virtual void evaluate(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate the Gram matrix for two vector sets using simple dot product.
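+ * The base-class implementation forwards to the protected linear() helper,
+ * while kernel subclasses override evaluate() to apply their transformation.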
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ virtual void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate the Gram matrix for two vector sets using simple dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ virtual void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ csr_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate the Gram matrix for two vector sets using simple dot product.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ * major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+ [[deprecated]] virtual void evaluate(const math_t* x1,
+ int n1,
+ int n_cols,
+ const math_t* x2,
+ int n2,
+ math_t* out,
+ bool is_row_major,
+ cudaStream_t stream,
+ int ld1,
+ int ld2,
+ int ld_out);
+
+ /** Convenience function to evaluate the Gram matrix for two vector sets.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ * major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1
+ * @param ld2 leading dimension of x2
+ * @param ld_out leading dimension of out
+ */
+ [[deprecated]] void operator()(const math_t* x1,
+ int n1,
+ int n_cols,
+ const math_t* x2,
+ int n2,
+ math_t* out,
+ bool is_row_major,
+ cudaStream_t stream,
+ int ld1 = 0,
+ int ld2 = 0,
+ int ld_out = 0);
+
+ protected:
+ /** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2
+ *
+ * Can be used as a building block for more complex kernel functions.
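+ * In matrix form this computes out = x1 * x2^T, i.e. out[i,j] = <x1_i, x2_j>.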
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ * major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1
+ * @param ld2 leading dimension of x2
+ * @param ld_out leading dimension of out
+ */
+ [[deprecated]] void linear(const math_t* x1,
+ int n1,
+ int n_cols,
+ const math_t* x2,
+ int n2,
+ math_t* out,
+ bool is_row_major,
+ cudaStream_t stream,
+ int ld1,
+ int ld2,
+ int ld_out);
+
+ protected:
+ bool get_is_row_major(dense_output_matrix_view_t<math_t> matrix);
+ bool get_is_row_major(dense_input_matrix_view_t<math_t> matrix);
+ bool get_is_col_major(dense_output_matrix_view_t<math_t> matrix);
+ bool get_is_col_major(dense_input_matrix_view_t<math_t> matrix);
+
+ /** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ */
+ void linear(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out);
+
+ /** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ */
+ void linear(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out);
+
+ /** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ */
+ void linear(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ csr_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out);
+};
+
+template <typename math_t>
+class KernelFactory {
+ public:
+ static GramMatrixBase<math_t>* create(KernelParams params);
+ [[deprecated]] static GramMatrixBase<math_t>* create(KernelParams params, cublasHandle_t handle);
+};
+
+/**
+ * Create a kernel matrix using polynomial kernel function.
+ */
+template <typename math_t, typename exp_t>
+class PolynomialKernel : public GramMatrixBase<math_t> {
+ exp_t exponent;
+ math_t gain;
+ math_t offset;
+
+ void applyKernel(
+ math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream);
+
+ public:
+ /**
+ * Constructs a polynomial kernel object.
+ * It evaluates the kernel matrix using the following formula:
+ * K_ij = (gain*<x1_i, x2_k> + offset)^exponent
+ *
+ * @tparam math_t floating point type
+ * @tparam exp_t type of exponent
+ * @param exponent
+ * @param gain
+ * @param offset
+ */
+ PolynomialKernel(exp_t exponent, math_t gain, math_t offset)
+ : GramMatrixBase<math_t>(), exponent(exponent), gain(gain), offset(offset){};
+
+ [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle)
+ : GramMatrixBase<math_t>(handle), exponent(exponent), gain(gain), offset(offset){};
+
+ /** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ void evaluate(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ csr_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate the Gram matrix using the legacy interface.
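+ * @note This path relies on the cuBLAS handle supplied to the deprecated
+ * constructor, so it is only usable on objects created that way.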
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ * major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+ [[deprecated]] void evaluate(const math_t* x1,
+ int n1,
+ int n_cols,
+ const math_t* x2,
+ int n2,
+ math_t* out,
+ bool is_row_major,
+ cudaStream_t stream,
+ int ld1,
+ int ld2,
+ int ld_out);
+};
+
+/**
+ * Create a kernel matrix using tanh kernel function.
+ */
+template <typename math_t>
+class TanhKernel : public GramMatrixBase<math_t> {
+ math_t gain, offset;
+
+ void applyKernel(
+ math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream);
+
+ public:
+ /**
+ * Constructs a tanh kernel object.
+ * It evaluates the kernel matrix using the following formula:
+ * K_ij = tanh(gain*<x1_i, x2_k> + offset)
+ *
+ * @tparam math_t floating point type
+ * @param gain
+ * @param offset
+ */
+ TanhKernel(math_t gain, math_t offset) : GramMatrixBase<math_t>(), gain(gain), offset(offset) {}
+
+ [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle)
+ : GramMatrixBase<math_t>(handle), gain(gain), offset(offset){};
+
+ /** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ void evaluate(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+ void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ csr_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate the Gram matrix using the legacy interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ * major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+ [[deprecated]] void evaluate(const math_t* x1,
+ int n1,
+ int n_cols,
+ const math_t* x2,
+ int n2,
+ math_t* out,
+ bool is_row_major,
+ cudaStream_t stream,
+ int ld1,
+ int ld2,
+ int ld_out);
+};
+
+/**
+ * Create a kernel matrix using RBF kernel function.
+ */
+template <typename math_t>
+class RBFKernel : public GramMatrixBase<math_t> {
+ math_t gain;
+
+ void applyKernel(math_t* inout,
+ int ld,
+ int rows,
+ int cols,
+ math_t* norm_x1,
+ math_t* norm_x2,
+ bool is_row_major,
+ cudaStream_t stream);
+
+ public:
+ /**
+ * Constructs a RBF kernel object.
+ * It evaluates the kernel matrix using the following formula:
+ * K_ij = exp(-gain*|x1_i - x2_k|^2)
+ *
+ * @tparam math_t floating point type
+ * @param gain
+ */
+ RBFKernel(math_t gain) : GramMatrixBase<math_t>(), gain(gain){};
+
+ [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle)
+ : GramMatrixBase<math_t>(handle), gain(gain){};
+
+ void matrixRowNormL2(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> matrix,
+ math_t* target);
+
+ void matrixRowNormL2(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> matrix,
+ math_t* target);
+
+ /** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and | | denotes Euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+ void evaluate(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and | | denotes Euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
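+ *
+ * A minimal usage sketch (illustrative only; `x1_view`, `x2_view` and
+ * `gram_view` are assumed to be pre-allocated device matrix views):
+ * @code{.cpp}
+ * cuvs::distance::kernels::RBFKernel<float> kernel{gain};
+ * // norm pointers may be left nullptr, in which case row norms are
+ * // computed internally
+ * kernel(handle, x1_view, x2_view, gram_view);
+ * @endcode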
+ */
+ void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and | | denotes Euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+ void evaluate(raft::resources const& handle,
+ csr_input_matrix_view_t<math_t> x1,
+ csr_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2);
+
+ /** Evaluate the Gram matrix using the legacy interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ * major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+ [[deprecated]] void evaluate(const math_t* x1,
+ int n1,
+ int n_cols,
+ const math_t* x2,
+ int n2,
+ math_t* out,
+ bool is_row_major,
+ cudaStream_t stream,
+ int ld1,
+ int ld2,
+ int ld_out);
+};
+}; // end namespace cuvs::distance::kernels
diff --git a/cpp/include/cuvs/embed/spectral.hpp b/cpp/include/cuvs/embed/spectral.hpp
new file mode 100644
index 000000000..1a8fed96a
--- /dev/null
+++ b/cpp/include/cuvs/embed/spectral.hpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+namespace cuvs::embed::spectral {
+
+/**
+ * Given a COO formatted (symmetric) knn graph, this function computes the spectral embeddings
+ * (lowest n_components eigenvectors) using the Lanczos min-cut algorithm. Please note that this
+ * algorithm does not compute a full Laplacian eigenmap, as a Laplacian eigenmap would embed each
+ * connected component separately. Laplacian eigenmaps can be built from this algorithm by running
+ * it on the vectors for each connected component.
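+ *
+ * A minimal usage sketch (illustrative only; assumes `graph` is a symmetric
+ * COO knn graph of type raft::device_coo_matrix<float, int, int, float>
+ * already built on the device):
+ * @code{.cpp}
+ * auto embedding = raft::make_device_matrix<float, int>(handle, n_rows, n_components);
+ * cuvs::embed::spectral::fit(handle, graph.view(), n_components, embedding.view());
+ * @endcode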
+ *
+ * @param[in] handle raft resources handle
+ * @param[in] knn_graph KNN Graph
+ * @param[in] n_components the number of components to project into
+ * @param[out] out output array for embedding (size n*n_components)
+ * @param[in] seed random seed
+ */
+void fit(const raft::resources& handle,
+ raft::device_coo_matrix_view<float, int, int, float> knn_graph,
+ int n_components,
+ raft::device_matrix_view<float, int, raft::row_major> out,
+ unsigned long long seed = 0L);
+}; // namespace cuvs::embed::spectral
diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cu b/cpp/src/distance/detail/kernels/gram_matrix.cu
new file mode 100644
index 000000000..0e4f3e639
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/gram_matrix.cu
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../distance.cuh"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cuvs::distance::kernels {
+
+/**
+ * Base class for general Gram matrices.
+ * A Gram matrix is the Hermitian matrix of inner products G_ik = <x_i, x_k>.
+ * Here, the inner product is evaluated for all elements from vector sets X1
+ * and X2.
+ *
+ * To be more precise, on exit the output buffer will store:
+ * - if is_row_major == true: out[j+k*n1] = <x1_j, x2_k>,
+ * - if is_row_major == false: out[j*n2 + k] = <x1_j, x2_k>,
+ * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector
+ * from the x2 set.
+ */
+
+/** Convenience function to evaluate the Gram matrix for two vector sets.
+ * Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::operator()(raft::resources const& handle,
+ dense_input_matrix_view_t<math_t> x1,
+ dense_input_matrix_view_t<math_t> x2,
+ dense_output_matrix_view_t<math_t> out,
+ math_t* norm_x1,
+ math_t* norm_x2)
+{
+ evaluate(handle, x1, x2, out, norm_x1, norm_x2);
+}
+
+/** Convenience function to evaluate the Gram matrix for two vector sets.
+ * Vector sets are provided in Matrix format
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +// unfortunately, 'evaluate' cannot be templatized as it needs to be virtual + +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} + +/** Evaluate the Gram matrix for two vector sets using simple dot product. 
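+ * This deprecated pointer-based overload simply forwards to the legacy
+ * linear() helper below.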
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void GramMatrixBase::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ +template +[[deprecated]] void GramMatrixBase::operator()(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); + if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } + if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } + if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } + evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. 
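+ * Both layout branches below issue the same logical GEMM (out = x1 * x2^T);
+ * only the cuBLAS transpose flags and operand order differ.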
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ +template +[[deprecated]] void GramMatrixBase::linear(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + n2, + n1, + n_cols, + &alpha, + x2, + ld2, + x1, + ld1, + &beta, + out, + ld_out, + stream)); + } else { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + n1, + n2, + n_cols, + &alpha, + x1, + ld1, + x2, + ld2, + &beta, + out, + ld_out, + stream)); + } +} + +template +bool GramMatrixBase::get_is_row_major(dense_output_matrix_view_t matrix) +{ + return (matrix.stride(1) == 1); +} +template +bool GramMatrixBase::get_is_row_major(dense_input_matrix_view_t matrix) +{ + return (matrix.stride(1) == 1); +} + +template +bool GramMatrixBase::get_is_col_major(dense_output_matrix_view_t matrix) +{ + return (matrix.stride(0) == 1); +} + +template +bool GramMatrixBase::get_is_col_major(dense_input_matrix_view_t matrix) +{ + return (matrix.stride(0) == 1); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check is_row_major consistency + bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, + "GramMatrix leading dimensions for x1, x2 and out do not match"); + + // check dimensions + int n1 = out.extent(0); + int n2 = out.extent(1); + int n_cols = x1.extent(1); + ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + // extract major stride + int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); + int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); + int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); + + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + true, + false, + n2, + n1, + n_cols, + &alpha, + x2.data_handle(), + ld2, + x1.data_handle(), + ld1, + &beta, + out.data_handle(), + ld_out, + raft::resource::get_cuda_stream(handle)); + } else { + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + false, + true, + n1, + n2, + n_cols, + &alpha, + x1.data_handle(), + ld1, + x2.data_handle(), + ld2, + &beta, + out.data_handle(), + ld_out, + raft::resource::get_cuda_stream(handle)); + } +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check is_row_major consistency + bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, "GramMatrix leading dimensions for x2 and out do not match"); + + // check dimensions + auto x1_structure = x1.structure_view(); + ASSERT(x1_structure.get_n_rows() == out.extent(0), + "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == out.extent(1), + "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == x1_structure.get_n_cols(), + "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + math_t alpha = 1.0; + math_t beta = 0.0; + + raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check layout consistency (w.r.t. 
strides a matrix might be both row & col major) + bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1); + bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0); + + ASSERT(is_row_major_nopad || is_col_major_nopad, + "Sparse linear Kernel distance does not support ld_out parameter"); + + // switch a,b based on is_row_major + if (is_col_major_nopad) { + auto out_row_major = raft::make_device_matrix_view( + out.data_handle(), out.extent(1), out.extent(0)); + + cuvs::distance::pairwise_distance( + handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); + } else { + auto out_row_major = raft::make_device_matrix_view( + out.data_handle(), out.extent(0), out.extent(1)); + cuvs::distance::pairwise_distance( + handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); + } +} + +template class GramMatrixBase; +template class GramMatrixBase; + +}; // namespace cuvs::distance::kernels diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cuh b/cpp/src/distance/detail/kernels/gram_matrix.cuh deleted file mode 100644 index d435fb4d1..000000000 --- a/cpp/src/distance/detail/kernels/gram_matrix.cuh +++ /dev/null @@ -1,488 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "../../distance.cuh" -#include -#include -#include -#include -// #include -#include -#include -#include -#include - -namespace cuvs::distance::kernels::detail { - -template -using dense_input_matrix_view_t = raft::device_matrix_view; -template -using dense_output_matrix_view_t = raft::device_matrix_view; -template -using csr_input_matrix_view_t = raft::device_csr_matrix_view; - -/** - * Base class for general Gram matrices - * A Gram matrix is the Hermitian matrix of inner probucts G_ik = - * Here, the inner product is evaluated for all elements from vectors sets X1, - * and X2. - * - * To be more precise, on exit the output buffer will store: - * - if is_row_major == true: out[j+k*n1] = , - * - if is_row_major == false: out[j*n2 + k] = , - * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector - * from the x2 set. - */ -template -class GramMatrixBase { - protected: - cublasHandle_t cublas_handle; - bool legacy_interface; - - public: - GramMatrixBase() : legacy_interface(false){}; - [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle) - : cublas_handle(cublas_handle), legacy_interface(true){}; - - virtual ~GramMatrixBase(){}; - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. 
- * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual - - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. 
- * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] virtual void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - */ - [[deprecated]] void operator()(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1 = 0, - int ld2 = 0, - int ld_out = 0) - { - ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); - if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } - if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } - if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } - evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } - - protected: - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - */ - [[deprecated]] void linear(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - math_t alpha = 1.0; - math_t beta = 0.0; - if (is_row_major) { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - n2, - n1, - n_cols, - &alpha, - x2, - ld2, - x1, - ld1, - &beta, - out, - ld_out, - stream)); - } else { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_T, - n1, - n2, - n_cols, - &alpha, - x1, - ld1, - x2, - ld2, - &beta, - out, - ld_out, - stream)); - } - } - - protected: - bool get_is_row_major(dense_output_matrix_view_t matrix) - { - return (matrix.stride(1) == 1); - } - - bool get_is_row_major(dense_input_matrix_view_t matrix) - { - return (matrix.stride(1) == 1); - } - - bool get_is_col_major(dense_output_matrix_view_t matrix) - { - return (matrix.stride(0) == 1); - } - - bool get_is_col_major(dense_input_matrix_view_t matrix) - { - return (matrix.stride(0) == 1); - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check is_row_major consistency - bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); - bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); - ASSERT(is_row_major || is_col_major, - "GramMatrix leading dimensions for x1, x2 and out do not match"); - - // check dimensions - int n1 = out.extent(0); - int n2 = out.extent(1); - int n_cols = x1.extent(1); - ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); - - // extract major stride - int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); - int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); - int ld_out = is_row_major ? 
out.stride(0) : out.stride(1); - - math_t alpha = 1.0; - math_t beta = 0.0; - if (is_row_major) { - // #TODO: Use mdspan-based API when stride-capable - // https://github.com/rapidsai/raft/issues/875 - raft::linalg::gemm(handle, - true, - false, - n2, - n1, - n_cols, - &alpha, - x2.data_handle(), - ld2, - x1.data_handle(), - ld1, - &beta, - out.data_handle(), - ld_out, - resource::get_cuda_stream(handle)); - } else { - // #TODO: Use mdspan-based API when stride-capable - // https://github.com/rapidsai/raft/issues/875 - raft::linalg::gemm(handle, - false, - true, - n1, - n2, - n_cols, - &alpha, - x1.data_handle(), - ld1, - x2.data_handle(), - ld2, - &beta, - out.data_handle(), - ld_out, - resource::get_cuda_stream(handle)); - } - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check is_row_major consistency - bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); - bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); - ASSERT(is_row_major || is_col_major, - "GramMatrix leading dimensions for x2 and out do not match"); - - // check dimensions - auto x1_structure = x1.structure_view(); - ASSERT(x1_structure.get_n_rows() == out.extent(0), - "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.extent(0) == out.extent(1), - "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x2.extent(1) == x1_structure.get_n_cols(), - "GramMatrix input matrix dimensions for x1 and x2 do not match"); - - math_t alpha = 1.0; - math_t beta = 0.0; - - raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check layout consistency (w.r.t. 
strides a matrix might be both row & col major)
-    bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1);
-    bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0);
-
-    ASSERT(is_row_major_nopad || is_col_major_nopad,
-           "Sparse linear Kernel distance does not support ld_out parameter");
-
-    // switch a,b based on is_row_major
-    if (is_col_major_nopad) {
-      auto out_row_major = raft::make_device_matrix_view<math_t, int, raft::row_major>(
-        out.data_handle(), out.extent(1), out.extent(0));
-      raft::sparse::distance::pairwise_distance(
-        handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0);
-    } else {
-      auto out_row_major = raft::make_device_matrix_view<math_t, int, raft::row_major>(
-        out.data_handle(), out.extent(0), out.extent(1));
-      raft::sparse::distance::pairwise_distance(
-        handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0);
-    }
-  }
-};
-
-};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cu b/cpp/src/distance/detail/kernels/kernel_factory.cu
new file mode 100644
index 000000000..25f9e9b84
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/kernel_factory.cu
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuvs/distance/grammian.hpp>
+
+namespace cuvs::distance::kernels {
+
+template <typename math_t>
+GramMatrixBase<math_t>* KernelFactory<math_t>::create(KernelParams params)
+{
+  GramMatrixBase<math_t>* res;
+  // KernelParams is not templated, we convert the parameters to math_t here:
+  math_t coef0 = params.coef0;
+  math_t gamma = params.gamma;
+  switch (params.kernel) {
+    case LINEAR: res = new GramMatrixBase<math_t>(); break;
+    case POLYNOMIAL: res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0); break;
+    case TANH: res = new TanhKernel<math_t>(gamma, coef0); break;
+    case RBF: res = new RBFKernel<math_t>(gamma); break;
+    default: throw raft::exception("Kernel not implemented");
+  }
+  return res;
+}
+
+template <typename math_t>
+[[deprecated]] GramMatrixBase<math_t>* KernelFactory<math_t>::create(KernelParams params,
+                                                                     cublasHandle_t handle)
+{
+  GramMatrixBase<math_t>* res;
+  // KernelParams is not templated, we convert the parameters to math_t here:
+  math_t coef0 = params.coef0;
+  math_t gamma = params.gamma;
+  switch (params.kernel) {
+    case LINEAR: res = new GramMatrixBase<math_t>(handle); break;
+    case POLYNOMIAL:
+      res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0, handle);
+      break;
+    case TANH: res = new TanhKernel<math_t>(gamma, coef0, handle); break;
+    case RBF: res = new RBFKernel<math_t>(gamma, handle); break;
+    default: throw raft::exception("Kernel not implemented");
+  }
+  return res;
+}
+
+template class KernelFactory<float>;
+template class KernelFactory<double>;
+
+};  // end namespace cuvs::distance::kernels
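For orientation, the relocated factory is driven through KernelParams; a minimal usage sketch (illustrative only, not code from this PR — assumes a raft::resources handle and device matrix views x1, x2, out already exist):

    #include <cuvs/distance/grammian.hpp>

    using namespace cuvs::distance::kernels;
    KernelParams params;
    params.kernel = RBF;
    params.gamma  = 0.5;
    auto* gram = KernelFactory<float>::create(params);
    gram->evaluate(handle, x1, x2, out, nullptr, nullptr);  // row norms computed lazily
    delete gram;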
diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cuh b/cpp/src/distance/detail/kernels/kernel_factory.cuh
deleted file mode 100644
index 5c50a95a3..000000000
--- a/cpp/src/distance/detail/kernels/kernel_factory.cuh
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "gram_matrix.cuh"
-#include "kernel_matrices.cuh"
-
-#include
-#include
-
-namespace cuvs::distance::kernels::detail {
-
-template <typename math_t>
-class KernelFactory {
- public:
-  static GramMatrixBase<math_t>* create(KernelParams params)
-  {
-    GramMatrixBase<math_t>* res;
-    // KernelParams is not templated, we convert the parameters to math_t here:
-    math_t coef0 = params.coef0;
-    math_t gamma = params.gamma;
-    switch (params.kernel) {
-      case LINEAR: res = new GramMatrixBase<math_t>(); break;
-      case POLYNOMIAL: res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0); break;
-      case TANH: res = new TanhKernel<math_t>(gamma, coef0); break;
-      case RBF: res = new RBFKernel<math_t>(gamma); break;
-      default: throw raft::exception("Kernel not implemented");
-    }
-    return res;
-  }
-
-  [[deprecated]] static GramMatrixBase<math_t>* create(KernelParams params, cublasHandle_t handle)
-  {
-    GramMatrixBase<math_t>* res;
-    // KernelParams is not templated, we convert the parameters to math_t here:
-    math_t coef0 = params.coef0;
-    math_t gamma = params.gamma;
-    switch (params.kernel) {
-      case LINEAR: res = new GramMatrixBase<math_t>(handle); break;
-      case POLYNOMIAL:
-        res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0, handle);
-        break;
-      case TANH: res = new TanhKernel<math_t>(gamma, coef0, handle); break;
-      case RBF: res = new RBFKernel<math_t>(gamma, handle); break;
-      default: throw raft::exception("Kernel not implemented");
-    }
-    return res;
-  }
-};
-
-};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cu b/cpp/src/distance/detail/kernels/kernel_matrices.cu
new file mode 100644
index 000000000..526ca106f
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/kernel_matrices.cu
@@ -0,0 +1,726 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../../distance/distance.cuh"
+#include
+
+#include "rbf_fin_op.cuh"
+#include
+#include
+#include
+#include
+#include
+
+namespace cuvs::distance::kernels {
+
+/** Epilogue function for polynomial kernel without padding.
+ * Calculates output = (gain*in + offset)^exponent
+ * @param inout device vector in column major format, size [len]
+ * @param len array length
+ * @param exponent
+ * @param gain
+ * @param offset
+ */
+template <typename math_t, typename exp_t>
+RAFT_KERNEL polynomial_kernel_nopad(
+  math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset)
+{
+  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
+       tid += blockDim.x * gridDim.x) {
+    inout[tid] = pow(gain * inout[tid] + offset, exponent);
+  }
+}
+
+/** Epilogue function for polynomial kernel with padding.
+ * Calculates output = (gain*input + offset)^exponent
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param exponent
+ * @param gain
+ * @param offset
+ */
+template <typename math_t, typename exp_t>
+RAFT_KERNEL polynomial_kernel(
+  math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y)
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent);
+    }
+}
+
+/** Epilogue function for tanh kernel without padding.
+ * Calculates output = tanh(gain*input + offset)
+ * @param inout device vector, size [len]
+ * @param len length of the input vector
+ * @param gain
+ * @param offset
+ */
+template <typename math_t>
+RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
+{
+  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
+       tid += blockDim.x * gridDim.x) {
+    inout[tid] = tanh(gain * inout[tid] + offset);
+  }
+}
+
+/** Epilogue function for tanh kernel with padding.
+ * Calculates output = tanh(gain*input + offset)
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param gain
+ * @param offset
+ */
+template <typename math_t>
+RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y)
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset);
+    }
+}
+
+/** Epilogue function for rbf kernel using expansion.
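+ * (The expansion here is the identity |x_i - y_j|^2 = |x_i|^2 + |y_j|^2 - 2<x_i, y_j>,
+ * which lets the RBF kernel be applied as an epilogue on top of a plain dot-product
+ * Gram matrix.)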
+ *
+ * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij));
+ *
+ * Intended usage
+ *   - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk
+ *   - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X
+ *   - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y
+ *
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param norm_x l2-norm of X's rows
+ * @param norm_y l2-norm of Y's rows
+ * @param gain
+ */
+template <typename math_t>
+RAFT_KERNEL rbf_kernel_expanded(
+  math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y) {
+    math_t norm_y_val = norm_y[tidy];
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] =
+        exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2));
+    }
+  }
+}
+
+std::tuple<dim3, dim3> generateLaunchConfig2dElementwiseOp(int n1, int n2)
+{
+  dim3 block_shape       = dim3(32, 4);
+  const int num_blocks_x = raft::ceildiv(n1, 32);
+  const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1);
+  dim3 grid_shape        = dim3(num_blocks_x, num_blocks_y);
+  return std::make_tuple(grid_shape, block_shape);
+}
+
+/**
+ * Create a kernel matrix using polynomial kernel function.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::applyKernel(
+  math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
+{
+  const int n_minor = is_row_major ? cols : rows;
+  if (ld == n_minor) {
+    polynomial_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
+      inout, rows * cols, exponent, gain, offset);
+  } else {
+    int n1 = is_row_major ? cols : rows;
+    int n2 = is_row_major ? rows : cols;
+    auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
+    polynomial_kernel<<<grid_shape, block_shape, 0, stream>>>(
+      inout, ld, n1, n2, exponent, gain, offset);
+  }
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes the dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::evaluate(raft::resources const& handle,
+                                               dense_input_matrix_view_t<math_t> x1,
+                                               dense_input_matrix_view_t<math_t> x2,
+                                               dense_output_matrix_view_t<math_t> out,
+                                               math_t* norm_x1,
+                                               math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes the dot product.
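+ *
+ * For illustration only (names and setup below are assumed, not part of this patch),
+ * a call with prepared device matrix views might look like:
+ * @code{.cpp}
+ *   // exponent = 3, gain = 1, offset = 1; x1 may be a dense or CSR view
+ *   PolynomialKernel<float, int> poly(3, 1.0f, 1.0f);
+ *   poly.evaluate(handle, x1, x2, out, nullptr, nullptr);  // norm args are unused
+ * @endcode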
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::evaluate(raft::resources const& handle,
+                                               csr_input_matrix_view_t<math_t> x1,
+                                               dense_input_matrix_view_t<math_t> x2,
+                                               dense_output_matrix_view_t<math_t> out,
+                                               math_t* norm_x1,
+                                               math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using polynomial kernel.
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes the dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::evaluate(raft::resources const& handle,
+                                               csr_input_matrix_view_t<math_t> x1,
+                                               csr_input_matrix_view_t<math_t> x2,
+                                               dense_output_matrix_view_t<math_t> out,
+                                               math_t* norm_x1,
+                                               math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate the Gram matrix using the legacy interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+template <typename math_t, typename exp_t>
+[[deprecated]] void PolynomialKernel<math_t, exp_t>::evaluate(const math_t* x1,
+                                                              int n1,
+                                                              int n_cols,
+                                                              const math_t* x2,
+                                                              int n2,
+                                                              math_t* out,
+                                                              bool is_row_major,
+                                                              cudaStream_t stream,
+                                                              int ld1,
+                                                              int ld2,
+                                                              int ld_out)
+{
+  ASSERT(GramMatrixBase<math_t>::legacy_interface,
+         "Legacy interface can only be used with legacy ctor.");
+  GramMatrixBase<math_t>::linear(
+    x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+  applyKernel(out, ld_out, n1, n2, is_row_major, stream);
+}
+
+/**
+ * Create a kernel matrix using tanh kernel function.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::applyKernel(
+  math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
+{
+  const int n_minor = is_row_major ? cols : rows;
+  if (ld == n_minor) {
+    tanh_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
+      inout, rows * cols, gain, offset);
+  } else {
+    int n1 = is_row_major ? cols : rows;
+    int n2 = is_row_major ? rows : cols;
+    auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
+    tanh_kernel<<<grid_shape, block_shape, 0, stream>>>(inout, ld, n1, n2, gain, offset);
+  }
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes the dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::evaluate(raft::resources const& handle,
+                                  dense_input_matrix_view_t<math_t> x1,
+                                  dense_input_matrix_view_t<math_t> x2,
+                                  dense_output_matrix_view_t<math_t> out,
+                                  math_t* norm_x1,
+                                  math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes the dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::evaluate(raft::resources const& handle,
+                                  csr_input_matrix_view_t<math_t> x1,
+                                  dense_input_matrix_view_t<math_t> x2,
+                                  dense_output_matrix_view_t<math_t> out,
+                                  math_t* norm_x1,
+                                  math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using tanh kernel.
+ *
+ * output_[i + k*n1] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and < , > denotes the dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::evaluate(raft::resources const& handle,
+                                  csr_input_matrix_view_t<math_t> x1,
+                                  csr_input_matrix_view_t<math_t> x2,
+                                  dense_output_matrix_view_t<math_t> out,
+                                  math_t* norm_x1,
+                                  math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate the Gram matrix using the legacy interface.
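+ *
+ * (This raw-pointer overload is marked [[deprecated]] and kept for older callers;
+ * new code should prefer the mdspan-based evaluate() overloads above.)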
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+template <typename math_t>
+[[deprecated]] void TanhKernel<math_t>::evaluate(const math_t* x1,
+                                                 int n1,
+                                                 int n_cols,
+                                                 const math_t* x2,
+                                                 int n2,
+                                                 math_t* out,
+                                                 bool is_row_major,
+                                                 cudaStream_t stream,
+                                                 int ld1,
+                                                 int ld2,
+                                                 int ld_out)
+{
+  ASSERT(GramMatrixBase<math_t>::legacy_interface,
+         "Legacy interface can only be used with legacy ctor.");
+  GramMatrixBase<math_t>::linear(
+    x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+  applyKernel(out, ld_out, n1, n2, is_row_major, stream);
+}
+
+/**
+ * Create a kernel matrix using RBF kernel function.
+ */
+template <typename math_t>
+void RBFKernel<math_t>::applyKernel(math_t* inout,
+                                    int ld,
+                                    int rows,
+                                    int cols,
+                                    math_t* norm_x1,
+                                    math_t* norm_x2,
+                                    bool is_row_major,
+                                    cudaStream_t stream)
+{
+  int n1          = is_row_major ? cols : rows;
+  int n2          = is_row_major ? rows : cols;
+  math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1;
+  math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2;
+  auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
+  rbf_kernel_expanded<<<grid_shape, block_shape, 0, stream>>>(
+    inout, ld, n1, n2, norm_n1, norm_n2, gain);
+}
+
+template <typename math_t>
+void RBFKernel<math_t>::matrixRowNormL2(raft::resources const& handle,
+                                        dense_input_matrix_view_t<math_t> matrix,
+                                        math_t* target)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(matrix);
+  int minor         = is_row_major ? matrix.extent(1) : matrix.extent(0);
+  int ld            = is_row_major ? matrix.stride(0) : matrix.stride(1);
+  ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter");
+  raft::linalg::rowNorm(target,
+                        matrix.data_handle(),
+                        matrix.extent(1),
+                        matrix.extent(0),
+                        raft::linalg::NormType::L2Norm,
+                        is_row_major,
+                        raft::resource::get_cuda_stream(handle));
+}
+
+template <typename math_t>
+void RBFKernel<math_t>::matrixRowNormL2(raft::resources const& handle,
+                                        csr_input_matrix_view_t<math_t> matrix,
+                                        math_t* target)
+{
+  auto matrix_structure = matrix.structure_view();
+  raft::sparse::linalg::rowNormCsr(handle,
+                                   matrix_structure.get_indptr().data(),
+                                   matrix.get_elements().data(),
+                                   matrix_structure.get_nnz(),
+                                   matrix_structure.get_n_rows(),
+                                   target,
+                                   raft::linalg::NormType::L2Norm);
+}
+
+/** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and | | denotes the Euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
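+ *
+ * When the same inputs are evaluated repeatedly, the row norms can be computed once
+ * up front and passed in; a sketch (buffer names are illustrative, not part of this
+ * patch):
+ * @code{.cpp}
+ *   rmm::device_uvector<float> norm_x1(n1, stream);
+ *   rmm::device_uvector<float> norm_x2(n2, stream);
+ *   raft::linalg::rowNorm(norm_x1.data(), x1.data_handle(), n_cols, n1,
+ *                         raft::linalg::NormType::L2Norm, true, stream);
+ *   raft::linalg::rowNorm(norm_x2.data(), x2.data_handle(), n_cols, n2,
+ *                         raft::linalg::NormType::L2Norm, true, stream);
+ *   rbf.evaluate(handle, x1, x2, out, norm_x1.data(), norm_x2.data());
+ * @endcode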
+ */
+template <typename math_t>
+void RBFKernel<math_t>::evaluate(raft::resources const& handle,
+                                 dense_input_matrix_view_t<math_t> x1,
+                                 dense_input_matrix_view_t<math_t> x2,
+                                 dense_output_matrix_view_t<math_t> out,
+                                 math_t* norm_x1,
+                                 math_t* norm_x2)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  // lazy compute norms if not given
+  rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
+  rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
+  if (norm_x1 == nullptr) {
+    tmp_norm_x1.reserve(x1.extent(0), stream);
+    norm_x1 = tmp_norm_x1.data();
+    matrixRowNormL2(handle, x1, norm_x1);
+  }
+  if (norm_x2 == nullptr) {
+    tmp_norm_x2.reserve(x2.extent(0), stream);
+    norm_x2 = tmp_norm_x2.data();
+    matrixRowNormL2(handle, x2, norm_x2);
+  }
+
+  // compute L2expanded
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              norm_x1,
+              norm_x2,
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and | | denotes the Euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void RBFKernel<math_t>::evaluate(raft::resources const& handle,
+                                 csr_input_matrix_view_t<math_t> x1,
+                                 dense_input_matrix_view_t<math_t> x2,
+                                 dense_output_matrix_view_t<math_t> out,
+                                 math_t* norm_x1,
+                                 math_t* norm_x2)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  // lazy compute norms if not given
+  rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
+  rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
+  if (norm_x1 == nullptr) {
+    tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream);
+    norm_x1 = tmp_norm_x1.data();
+    matrixRowNormL2(handle, x1, norm_x1);
+  }
+  if (norm_x2 == nullptr) {
+    tmp_norm_x2.reserve(x2.extent(0), stream);
+    norm_x2 = tmp_norm_x2.data();
+    matrixRowNormL2(handle, x2, norm_x2);
+  }
+
+  // compute L2expanded
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              norm_x1,
+              norm_x2,
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using RBF kernel.
+ *
+ * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is the k-th vector
+ * in the x2 set, and | | denotes the Euclidean distance.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void RBFKernel<math_t>::evaluate(raft::resources const& handle,
+                                 csr_input_matrix_view_t<math_t> x1,
+                                 csr_input_matrix_view_t<math_t> x2,
+                                 dense_output_matrix_view_t<math_t> out,
+                                 math_t* norm_x1,
+                                 math_t* norm_x2)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  // lazy compute norms if not given
+  rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
+  rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
+  if (norm_x1 == nullptr) {
+    tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream);
+    norm_x1 = tmp_norm_x1.data();
+    matrixRowNormL2(handle, x1, norm_x1);
+  }
+  if (norm_x2 == nullptr) {
+    tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream);
+    norm_x2 = tmp_norm_x2.data();
+    matrixRowNormL2(handle, x2, norm_x2);
+  }
+
+  // compute L2expanded
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              norm_x1,
+              norm_x2,
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate the Gram matrix using the legacy interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number of vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number of vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (usually it is n1)
+ * @param ld2 leading dimension of x2 (usually it is n2)
+ * @param ld_out leading dimension of out (usually it is n1)
+ */
+template <typename math_t>
+[[deprecated]] void RBFKernel<math_t>::evaluate(const math_t* x1,
+                                                int n1,
+                                                int n_cols,
+                                                const math_t* x2,
+                                                int n2,
+                                                math_t* out,
+                                                bool is_row_major,
+                                                cudaStream_t stream,
+                                                int ld1,
+                                                int ld2,
+                                                int ld_out)
+{
+  ASSERT(GramMatrixBase<math_t>::legacy_interface,
+         "Legacy interface can only be used with legacy ctor.");
+  int minor1    = is_row_major ? n_cols : n1;
+  int minor2    = is_row_major ? n_cols : n2;
+  int minor_out = is_row_major ? n2 : n1;
+  ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter");
+  ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter");
+  ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter");
+
+  math_t gain   = this->gain;
+  using index_t = int64_t;
+
+  rbf_fin_op fin_op{gain};
+
+  raft::resources handle;
+  raft::resource::set_cuda_stream(handle, stream);
+
+  cuvs::distance::distance<cuvs::distance::DistanceType::L2Unexpanded,
+                           math_t,
+                           math_t,
+                           math_t,
+                           decltype(fin_op),
+                           index_t>(handle,
+                                    const_cast<math_t*>(x1),
+                                    const_cast<math_t*>(x2),
+                                    out,
+                                    n1,
+                                    n2,
+                                    n_cols,
+                                    NULL,
+                                    0,
+                                    fin_op,
+                                    is_row_major);
+}
+
+template class PolynomialKernel<float, int>;
+template class PolynomialKernel<double, int>;
+template class TanhKernel<float>;
+template class TanhKernel<double>;
+template class RBFKernel<float>;
+template class RBFKernel<double>;
+
+};  // end namespace cuvs::distance::kernels
diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cuh b/cpp/src/distance/detail/kernels/kernel_matrices.cuh
deleted file mode 100644
index bff5bda92..000000000
--- a/cpp/src/distance/detail/kernels/kernel_matrices.cuh
+++ /dev/null
@@ -1,777 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "gram_matrix.cuh" - -#include "../detail/kernels/rbf_fin_op.cuh" -#include -#include -#include -#include -#include - -namespace cuvs::distance::kernels::detail { - -/** Epiloge function for polynomial kernel without padding. - * Calculates output = (gain*in + offset)^exponent - * @param inout device vector in column major format, size [len] - * @param len array length - * @param exponent - * @param gain - * @param offset - */ -template -RAFT_KERNEL polynomial_kernel_nopad( - math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = pow(gain * inout[tid] + offset, exponent); - } -} - -/** Epiloge function for polynomial kernel with padding. - * Calculates output = (gain*input + offset)^exponent - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param exponent - * @param gain - * @param offset - */ -template -RAFT_KERNEL polynomial_kernel( - math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent); - } -} - -/** Epiloge function for tanh kernel without padding. - * Calculates output = tanh(gain*input + offset) - * @param inout device vector, size [len] - * @param len length of the input vector - * @param gain - * @param offset - */ -template -RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = tanh(gain * inout[tid] + offset); - } -} - -/** Epiloge function for tanh kernel without padding. - * Calculates output = tanh(gain*input + offset) - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param gain - * @param offset - */ -template -RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset); - } -} - -/** Epiloge function for rbf kernel using expansion. 
- * - * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij)); - * - * Intended usage - * - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk - * - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X - * - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y - * - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param norm_x l2-norm of X's rows - * @param norm_y l2-norm of Y's rows - * @param gain - */ -template -RAFT_KERNEL rbf_kernel_expanded( - math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) { - math_t norm_y_val = norm_y[tidy]; - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = - exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2)); - } - } -} - -namespace { -std::tuple generateLaunchConfig2dElementwiseOp(int n1, int n2) -{ - dim3 block_shape = dim3(32, 4); - const int num_blocks_x = raft::ceildiv(n1, 32); - const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1); - dim3 grid_shape = dim3(num_blocks_x, num_blocks_y); - return std::make_tuple(grid_shape, block_shape); -} -} // namespace - -/** - * Create a kernel matrix using polynomial kernel function. - */ -template -class PolynomialKernel : public GramMatrixBase { - exp_t exponent; - math_t gain; - math_t offset; - - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, exponent, gain, offset); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - polynomial_kernel<<>>( - inout, ld, n1, n2, exponent, gain, offset); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - public: - /** - * Constructs a polynomial kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = (gain* + offset)^exponent - * - * @tparam math_t floating point type - * @tparam exp_t type of exponent - * @param exponent - * @param gain - * @param offset - */ - PolynomialKernel(exp_t exponent, math_t gain, math_t offset) - : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset) - { - } - - [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset) - { - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. 
- */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } -}; - -/** - * Create a kernel matrix using tanh kernel function. - */ -template -class TanhKernel : public GramMatrixBase { - math_t gain, offset; - - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, gain, offset); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - public: - /** - * Constructs a tanh kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = tanh(gain* + offset) - * - * @tparam math_t floating point type - * @param gain - * @param offset - */ - TanhKernel(math_t gain, math_t offset) : GramMatrixBase(), gain(gain), offset(offset) {} - - [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain), offset(offset) - { - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. 
- * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } -}; - -/** - * Create a kernel matrix using RBF kernel function. - */ -template -class RBFKernel : public GramMatrixBase { - math_t gain; - - void applyKernel(math_t* inout, - int ld, - int rows, - int cols, - math_t* norm_x1, - math_t* norm_x2, - bool is_row_major, - cudaStream_t stream) - { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; - math_t* norm_n2 = is_row_major ? 
norm_x1 : norm_x2; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - rbf_kernel_expanded<<>>( - inout, ld, n1, n2, norm_n1, norm_n2, gain); - } - - public: - /** - * Constructs a RBF kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = exp(-gain*|x1_i- x2_k|^2) - * - * @tparam math_t floating point type - * @param gain - */ - RBFKernel(math_t gain) : GramMatrixBase(), gain(gain) {} - - [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain) - { - } - - void matrixRowNormL2(raft::resources const& handle, - dense_input_matrix_view_t matrix, - math_t* target) - { - bool is_row_major = GramMatrixBase::get_is_row_major(matrix); - int minor = is_row_major ? matrix.extent(1) : matrix.extent(0); - int ld = is_row_major ? matrix.stride(0) : matrix.stride(1); - ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter"); - raft::linalg::rowNorm(target, - matrix.data_handle(), - matrix.extent(1), - matrix.extent(0), - raft::linalg::NormType::L2Norm, - is_row_major, - resource::get_cuda_stream(handle)); - } - - void matrixRowNormL2(raft::resources const& handle, - csr_input_matrix_view_t matrix, - math_t* target) - { - auto matrix_structure = matrix.structure_view(); - raft::sparse::linalg::rowNormCsr(handle, - matrix_structure.get_indptr().data(), - matrix.get_elements().data(), - matrix_structure.get_nnz(), - matrix_structure.get_n_rows(), - target, - raft::linalg::NormType::L2Norm); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.extent(0), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.extent(0), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. 
- * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.extent(0), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. 
- * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - int minor1 = is_row_major ? n_cols : n1; - int minor2 = is_row_major ? n_cols : n2; - int minor_out = is_row_major ? n2 : n1; - ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); - ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); - ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); - - math_t gain = this->gain; - using index_t = int64_t; - - rbf_fin_op fin_op{gain}; - - raft::resources handle; - resource::set_cuda_stream(handle, stream); - - cuvs::distance::distance(handle, - const_cast(x1), - const_cast(x2), - out, - n1, - n2, - n_cols, - NULL, - 0, - fin_op, - is_row_major); - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/rbf_fin_op.cuh b/cpp/src/distance/detail/kernels/rbf_fin_op.cuh index 73588baea..53022368d 100644 --- a/cpp/src/distance/detail/kernels/rbf_fin_op.cuh +++ b/cpp/src/distance/detail/kernels/rbf_fin_op.cuh @@ -28,7 +28,7 @@ #include // raft::exp #include // HD -namespace cuvs::distance::kernels::detail { +namespace cuvs::distance::kernels { /** @brief: Final op for Gram matrix with RBF kernel. 
* @@ -48,4 +48,4 @@ struct rbf_fin_op { } }; // struct rbf_fin_op -} // namespace cuvs::distance::kernels::detail +} // namespace cuvs::distance::kernels diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh index edfd7cf5f..49497ab3a 100644 --- a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh @@ -118,9 +118,7 @@ instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default( instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default( cuvs::distance::detail::ops::russel_rao_distance_op, int); instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo( - cuvs::distance::detail::ops::l2_unexp_distance_op, - int64_t, - cuvs::distance::kernels::detail::rbf_fin_op); + cuvs::distance::detail::ops::l2_unexp_distance_op, int64_t, cuvs::distance::kernels::rbf_fin_op); instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default( cuvs::distance::detail::ops::l2_exp_distance_op, int64_t); diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu index 3c8f25109..a2e12b6df 100644 --- a/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu +++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_rbf.cu @@ -50,7 +50,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( float, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_raft_distance_detail_pairwise_matrix_dispatch( @@ -58,7 +58,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( double, double, double, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_raft_distance_detail_pairwise_matrix_dispatch( @@ -66,7 +66,7 @@ instantiate_raft_distance_detail_pairwise_matrix_dispatch( half, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); #undef instantiate_raft_distance_detail_pairwise_matrix_dispatch diff --git a/cpp/src/distance/distance-ext.cuh b/cpp/src/distance/distance-ext.cuh index e623f76ba..a692a62a3 100644 --- a/cpp/src/distance/distance-ext.cuh +++ b/cpp/src/distance/distance-ext.cuh @@ -273,13 +273,13 @@ instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpan float, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpanded, double, double, double, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); #undef instantiate_cuvs_distance_distance_extra diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu index c1d39f360..47e72460f 100644 --- a/cpp/src/distance/distance.cu +++ b/cpp/src/distance/distance.cu @@ -139,13 +139,13 @@ instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpan float, float, float, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); instantiate_cuvs_distance_distance_extra(cuvs::distance::DistanceType::L2Unexpanded, double, double, double, - cuvs::distance::kernels::detail::rbf_fin_op, + cuvs::distance::kernels::rbf_fin_op, int64_t); #undef instantiate_cuvs_distance_distance_extra diff --git a/cpp/src/embed/spectral.cu b/cpp/src/embed/spectral.cu new file mode 100644 index 
000000000..c3d4e3fc7
--- /dev/null
+++ b/cpp/src/embed/spectral.cu
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../sparse/cluster/detail/spectral.cuh"
+#include
+#include
+#include
+
+namespace cuvs::embed::spectral {
+
+/**
+ * Given a COO formatted (symmetric) knn graph, this function computes the spectral embeddings
+ * (lowest n_components eigenvectors), using the Lanczos min-cut algorithm.
+ * @param handle raft handle for managing expensive resources
+ * @param knn_graph COO formatted (symmetric) knn graph connecting the n samples
+ * @param n_components the number of components to project the samples into
+ * @param out output array for the embedding (size n*n_components)
+ * @param seed random seed for the eigensolver
+ */
+void fit(const raft::resources& handle,
+         raft::device_coo_matrix_view knn_graph,
+         int n_components,
+         raft::device_matrix_view out,
+         unsigned long long seed)
+{
+  cuvs::sparse::cluster::spectral::detail::fit_embedding(
+    handle,
+    knn_graph.structure_view().get_rows().data(),
+    knn_graph.structure_view().get_cols().data(),
+    knn_graph.get_elements().data(),
+    knn_graph.structure_view().get_nnz(),
+    knn_graph.structure_view().get_n_rows(),
+    n_components,
+    out.data_handle(),
+    seed);
+}
+};  // namespace cuvs::embed::spectral
diff --git a/cpp/src/sparse/cluster/cluster_solvers.cuh b/cpp/src/sparse/cluster/cluster_solvers.cuh
new file mode 100644
index 000000000..7b4cf6ab3
--- /dev/null
+++ b/cpp/src/sparse/cluster/cluster_solvers.cuh
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
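// A minimal usage sketch for the new public embedding API (illustrative only,
// not part of this diff). The names `res`, `graph`, and `n` are hypothetical,
// and the view template parameters are assumed to match the float/int
// instantiation declared in cuvs/embed/spectral.hpp.
//
// #include <cuvs/embed/spectral.hpp>
// #include <raft/core/device_coo_matrix.hpp>
// #include <raft/core/device_mdarray.hpp>
//
// void embed_knn_graph(raft::resources const& res,
//                      raft::device_coo_matrix_view<float, int, int, int> graph,
//                      int n,
//                      int n_components)
// {
//   // One row per sample, one column per spectral component.
//   auto emb = raft::make_device_matrix<float, int>(res, n, n_components);
//   cuvs::embed::spectral::fit(res, graph, n_components, emb.view(), 1234567ULL);
// }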
+ */
+
+#ifndef __CLUSTER_SOLVERS_H
+#define __CLUSTER_SOLVERS_H
+
+#pragma once
+
+#include
+#include
+#include
+
+#include  // for std::pair
+
+namespace cuvs {
+namespace spectral {
+
+using namespace raft::spectral::matrix;
+
+// aggregate of control params for Cluster Solver:
+//
+template
+struct cluster_solver_config_t {
+  size_type_t n_clusters;
+  size_type_t maxIter;
+
+  value_type_t tol;
+
+  unsigned long long seed{123456};
+};
+
+template
+struct kmeans_solver_t {
+  explicit kmeans_solver_t(
+    cluster_solver_config_t const& config)
+    : config_(config)
+  {
+  }
+
+  std::pair solve(raft::resources const& handle,
+                  size_type_t n_obs_vecs,
+                  size_type_t dim,
+                  value_type_t const* __restrict__ obs,
+                  index_type_t* __restrict__ codes) const
+  {
+    RAFT_EXPECTS(obs != nullptr, "Null obs buffer.");
+    RAFT_EXPECTS(codes != nullptr, "Null codes buffer.");
+    value_type_t residual{};
+    index_type_t iters{};
+    cuvs::cluster::kmeans::params km_params;
+    km_params.n_clusters     = config_.n_clusters;
+    km_params.tol            = config_.tol;
+    km_params.max_iter       = config_.maxIter;
+    km_params.rng_state.seed = config_.seed;
+
+    auto X      = raft::make_device_matrix_view(obs, n_obs_vecs, dim);
+    auto labels = raft::make_device_vector_view(codes, n_obs_vecs);
+    auto centroids =
+      raft::make_device_matrix(handle, config_.n_clusters, dim);
+    auto weight = raft::make_device_vector(handle, n_obs_vecs);
+    thrust::fill(raft::resource::get_thrust_policy(handle),
+                 weight.data_handle(),
+                 weight.data_handle() + n_obs_vecs,
+                 1);
+
+    auto sw = std::make_optional((raft::device_vector_view)weight.view());
+    cuvs::cluster::kmeans::fit_predict(handle,
+                                       km_params,
+                                       X,
+                                       sw,
+                                       centroids.view(),
+                                       labels,
+                                       raft::make_host_scalar_view(&residual),
+                                       raft::make_host_scalar_view(&iters));
+    return std::make_pair(residual, iters);
+  }
+
+  auto const& get_config(void) const { return config_; }
+
+ private:
+  cluster_solver_config_t config_;
+};
+
+}  // namespace spectral
+}  // namespace cuvs
+
+#endif
\ No newline at end of file
diff --git a/cpp/src/sparse/cluster/detail/spectral.cuh b/cpp/src/sparse/cluster/detail/spectral.cuh
new file mode 100644
index 000000000..571d92bf5
--- /dev/null
+++ b/cpp/src/sparse/cluster/detail/spectral.cuh
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
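// A configuration sketch for the k-means cluster solver above, mirroring the
// unit tests in this PR (values are arbitrary; `handle`, `d_obs`, `d_codes`,
// `n_obs`, and `dim` are hypothetical names):
//
//   cuvs::spectral::cluster_solver_config_t<int, double> cfg{5, 100, 1.0e-10, 42ULL};
//   cuvs::spectral::kmeans_solver_t<int, double> solver{cfg};
//   auto [residual, iters] = solver.solve(handle, n_obs, dim, d_obs, d_codes);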
+ */ + +#include "../cluster_solvers.cuh" +#include "../eigen_solvers.cuh" +#include "../partition.cuh" +#include +#include +#include +#include +#include + +#include + +namespace cuvs::sparse::cluster::spectral::detail { + +template +void fit_embedding(raft::resources const& handle, + int* rows, + int* cols, + T* vals, + int nnz, + int n, + int n_components, + T* out, + unsigned long long seed = 1234567) +{ + auto stream = raft::resource::get_cuda_stream(handle); + rmm::device_uvector src_offsets(n + 1, stream); + rmm::device_uvector dst_cols(nnz, stream); + rmm::device_uvector dst_vals(nnz, stream); + raft::sparse::convert::coo_to_csr( + handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data()); + + rmm::device_uvector eigVals(n_components + 1, stream); + rmm::device_uvector eigVecs(n * (n_components + 1), stream); + rmm::device_uvector labels(n, stream); + + raft::resource::sync_stream(handle, stream); + + /** + * Raft spectral clustering + */ + using index_type = int; + using value_type = T; + + index_type* ro = src_offsets.data(); + index_type* ci = dst_cols.data(); + value_type* vs = dst_vals.data(); + + raft::spectral::matrix::sparse_matrix_t const r_csr_m{ + handle, ro, ci, vs, n, nnz}; + + index_type neigvs = n_components + 1; + index_type maxiter = 4000; // default reset value (when set to 0); + value_type tol = 0.01; + index_type restart_iter = 15 + neigvs; // what cugraph is using + + cuvs::spectral::eigen_solver_config_t cfg{ + neigvs, maxiter, restart_iter, tol}; + + cfg.seed = seed; + + cuvs::spectral::lanczos_solver_t eig_solver{cfg}; + + // cluster computation here is irrelevant, + // hence define a no-op such solver to + // feed partition(): + // + struct no_op_cluster_solver_t { + using index_type_t = index_type; + using size_type_t = index_type; + using value_type_t = value_type; + + std::pair solve(raft::resources const& handle, + size_type_t n_obs_vecs, + size_type_t dim, + value_type_t const* __restrict__ obs, + index_type_t* __restrict__ codes) const + { + return std::make_pair(0, 0); + } + }; + + cuvs::spectral::partition(handle, + r_csr_m, + eig_solver, + no_op_cluster_solver_t{}, + labels.data(), + eigVals.data(), + eigVecs.data()); + + raft::copy(out, eigVecs.data() + n, n * n_components, stream); + + RAFT_CUDA_TRY(cudaGetLastError()); +} + +}; // namespace cuvs::sparse::cluster::spectral::detail \ No newline at end of file diff --git a/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp b/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp new file mode 100644 index 000000000..a42ad2dc1 --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral/modularity_maximization.hpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
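// Parameter mapping for the detail entry point above (a sketch; `d_rows`,
// `d_cols`, `d_vals` are hypothetical device COO buffers and `d_out` is a
// device buffer of size n * n_components):
//
//   cuvs::sparse::cluster::spectral::detail::fit_embedding<float>(
//     handle, d_rows, d_cols, d_vals, nnz, n, n_components, d_out, /*seed=*/1234567ULL);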
+ */ + +#pragma once + +#include +#include + +// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS +#include "../../cluster_solvers.cuh" +#include "../../eigen_solvers.cuh" +#include "spectral_util.cuh" +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs { +namespace spectral { +namespace detail { + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. + * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param clusters (Output, device memory, n entries) Cluster + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return error flag. + */ +template +std::tuple modularity_maximization( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + + std::tuple + stats; // # iters eigen solver, cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // Compute eigenvectors of Modularity Matrix + + // Initialize Modularity Matrix + raft::spectral::matrix::modularity_matrix_t B{handle, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute eigenvectors corresponding to largest eigenvalues + std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); + + // notice that at this point the matrix has already been transposed, so we are scaling + // columns + auto dataset_view = raft::make_device_matrix_view(eigVecs, nEigVecs, n); + raft::linalg::row_normalize( + handle, raft::make_const_mdspan(dataset_view), dataset_view, raft::linalg::L2Norm); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} +//=================================================== +// Analysis of graph partition +// ========================================================= + +/// Compute modularity +/** This function determines the modularity based on a graph and cluster assignments + * @param G Weighted graph in 
CSR format + * @param nClusters Number of clusters. + * @param clusters (Input, device memory, n entries) Cluster assignments. + * @param modularity On exit, modularity + */ +template +void analyzeModularity(raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + vertex_t nClusters, + vertex_t const* __restrict__ clusters, + weight_t& modularity) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + weight_t partModularity, clustersize; + + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto stream = raft::resource::get_cuda_stream(handle); + + // Device memory + raft::spectral::matrix::vector_t part_i(handle, n); + raft::spectral::matrix::vector_t Bx(handle, n); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Modularity + raft::spectral::matrix::modularity_matrix_t B{handle, csr_m}; + + // Initialize output + modularity = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { + WARNING("empty partition"); + continue; + } + + // Record results + modularity += partModularity; + } + + modularity = modularity / B.diagonal_.nrm1(); +} + +} // namespace detail +} // namespace spectral +} // namespace cuvs diff --git a/cpp/src/sparse/cluster/detail/spectral/partition.hpp b/cpp/src/sparse/cluster/detail/spectral/partition.hpp new file mode 100644 index 000000000..77e83c17d --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral/partition.hpp @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS +#include + +#include "../../cluster_solvers.cuh" +#include "../../eigen_solvers.cuh" +#include "spectral_util.cuh" +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs { +namespace spectral { +namespace detail { + +// ========================================================= +// Spectral partitioner +// ========================================================= + +/// Compute spectral graph partition +/** Compute partition for a weighted undirected graph. This + * partition attempts to minimize the cost function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param nEigVecs Number of eigenvectors to compute. + * @param maxIter_lanczos Maximum number of Lanczos iterations. + * @param restartIter_lanczos Maximum size of Lanczos system before + * implicit restart. + * @param tol_lanczos Convergence tolerance for Lanczos method. + * @param maxIter_kmeans Maximum number of k-means iterations. 
+ * @param tol_kmeans Convergence tolerance for k-means algorithm. + * @param clusters (Output, device memory, n entries) Partition + * assignments. + * @param iters_lanczos On exit, number of Lanczos iterations + * performed. + * @param iters_kmeans On exit, number of k-means iterations + * performed. + * @return statistics: number of eigensolver iterations, . + */ +template +std::tuple partition( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + EigenSolver const& eigen_solver, + ClusterSolver const& cluster_solver, + vertex_t* __restrict__ clusters, + weight_t* eigVals, + weight_t* eigVecs) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + + std::tuple + stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, + // cluster solver residual, # iters cluster solver + + vertex_t n = csr_m.nrows_; + + // ------------------------------------------------------- + // Spectral partitioner + // ------------------------------------------------------- + + // Compute eigenvectors of Laplacian + + // Initialize Laplacian + /// sparse_matrix_t A{handle, graph}; + raft::spectral::matrix::laplacian_matrix_t L{handle, csr_m}; + + auto eigen_config = eigen_solver.get_config(); + auto nEigVecs = eigen_config.n_eigVecs; + + // Compute smallest eigenvalues and eigenvectors + std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); + + // Whiten eigenvector matrix + transform_eigen_matrix(handle, n, nEigVecs, eigVecs); + + // Find partition clustering + auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); + + std::get<1>(stats) = pair_cluster.first; + std::get<2>(stats) = pair_cluster.second; + + return stats; +} + +// ========================================================= +// Analysis of graph partition +// ========================================================= + +/// Compute cost function for partition +/** This function determines the edges cut by a partition and a cost + * function: + * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) + * Graph is assumed to be weighted and undirected. + * + * @param G Weighted graph in CSR format + * @param nClusters Number of partitions. + * @param clusters (Input, device memory, n entries) Partition + * assignments. + * @param edgeCut On exit, weight of edges cut by partition. + * @param cost On exit, partition cost function. + * @return error flag. 
+ */ +template +void analyzePartition(raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& csr_m, + vertex_t nClusters, + const vertex_t* __restrict__ clusters, + weight_t& edgeCut, + weight_t& cost) +{ + RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); + + vertex_t i; + vertex_t n = csr_m.nrows_; + + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + + weight_t partEdgesCut, clustersize; + + // Device memory + raft::spectral::matrix::vector_t part_i(handle, n); + raft::spectral::matrix::vector_t Lx(handle, n); + + // Initialize cuBLAS + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // Initialize Laplacian + /// sparse_matrix_t A{handle, graph}; + raft::spectral::matrix::laplacian_matrix_t L{handle, csr_m}; + + // Initialize output + cost = 0; + edgeCut = 0; + + // Iterate through partitions + for (i = 0; i < nClusters; ++i) { + // Construct indicator vector for ith partition + if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { + WARNING("empty partition"); + continue; + } + + // Record results + cost += partEdgesCut / clustersize; + edgeCut += partEdgesCut / 2; + } +} + +} // namespace detail +} // namespace spectral +} // namespace cuvs diff --git a/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh b/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh new file mode 100644 index 000000000..1d2e58e2a --- /dev/null +++ b/cpp/src/sparse/cluster/detail/spectral/spectral_util.cuh @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
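// Reading the analysis outputs above (a sketch; `csr_m`, `n_clusters`, and the
// device buffer `d_clusters` are hypothetical, with float edge weights):
//
//   float edge_cut{0}, cost{0};
//   cuvs::spectral::detail::analyzePartition(handle, csr_m, n_clusters, d_clusters,
//                                            edge_cut, cost);
//
// edge_cut is the total weight of edges crossing partitions (each undirected
// edge counted once, hence the partEdgesCut / 2 above); cost additionally
// normalizes each partition's cut by its size.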
+ */ + +#pragma once + +#include +#include +#include +#include + +// TODO: Expose needed wrappers in RAFT's public API so we don't need to call detail APIs in cuVS +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cuvs { +namespace spectral { + +template +void transform_eigen_matrix(raft::resources const& handle, + edge_t n, + vertex_t nEigVecs, + weight_t* eigVecs) +{ + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto thrust_exec_policy = raft::resource::get_thrust_policy(handle); + + const weight_t zero{0.0}; + const weight_t one{1.0}; + + // Whiten eigenvector matrix + for (auto i = 0; i < nEigVecs; ++i) { + weight_t mean, std; + + mean = thrust::reduce(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); + RAFT_CHECK_CUDA(stream); + mean /= n; + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(mean), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::minus()); + RAFT_CHECK_CUDA(stream); + + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); + + std /= std::sqrt(static_cast(n)); + + thrust::transform(thrust_exec_policy, + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), + thrust::make_constant_iterator(std), + thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), + thrust::divides()); + RAFT_CHECK_CUDA(stream); + } + + // Transpose eigenvector matrix + // TODO: in-place transpose + { + raft::spectral::matrix::vector_t work(handle, nEigVecs * n); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); + + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, + CUBLAS_OP_T, + CUBLAS_OP_N, + nEigVecs, + n, + &one, + eigVecs, + n, + &zero, + (weight_t*)NULL, + nEigVecs, + work.raw(), + nEigVecs, + stream)); + + RAFT_CUDA_TRY(cudaMemcpyAsync( + eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); + } +} + +namespace { +/// Functor to generate indicator vectors +/** For use in Thrust transform + */ +template +struct equal_to_i_op { + const index_type_t i; + + public: + equal_to_i_op(index_type_t _i) : i(_i) {} + template + __host__ __device__ void operator()(Tuple_ t) + { + thrust::get<1>(t) = (thrust::get<0>(t) == i) ? 
(value_type_t)1.0 : (value_type_t)0.0; + } +}; +} // namespace + +// Construct indicator vector for ith partition +// +template +bool construct_indicator(raft::resources const& handle, + edge_t index, + edge_t n, + weight_t& clustersize, + weight_t& partStats, + vertex_t const* __restrict__ clusters, + raft::spectral::matrix::vector_t& part_i, + raft::spectral::matrix::vector_t& Bx, + raft::spectral::matrix::laplacian_matrix_t const& B) +{ + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto thrust_exec_policy = raft::resource::get_thrust_policy(handle); + + thrust::for_each( + thrust_exec_policy, + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters), + thrust::device_pointer_cast(part_i.raw()))), + thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n), + thrust::device_pointer_cast(part_i.raw() + n))), + equal_to_i_op(index)); + RAFT_CHECK_CUDA(stream); + + // Compute size of ith partition + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot( + cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); + + clustersize = round(clustersize); + if (clustersize < 0.5) { return false; } + + // Compute part stats + B.mv(1, part_i.raw(), 0, Bx.raw()); + // TODO: Call from public API when ready + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); + + return true; +} + +} // namespace spectral +} // namespace cuvs diff --git a/cpp/src/sparse/cluster/eigen_solvers.cuh b/cpp/src/sparse/cluster/eigen_solvers.cuh new file mode 100644 index 000000000..1b2501d68 --- /dev/null +++ b/cpp/src/sparse/cluster/eigen_solvers.cuh @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
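// A tiny illustration of the indicator construction above (a sketch, usable
// only within this translation unit since equal_to_i_op sits in an anonymous
// namespace; requires thrust/device_vector.h): for clusters = {0, 1, 1, 0} and
// i = 1, part_i becomes {0, 1, 1, 0}, so dot(part_i, part_i) recovers the
// partition size (here 2).
//
//   thrust::device_vector<int> clusters_ex{0, 1, 1, 0};
//   thrust::device_vector<double> part_ex(4);
//   thrust::for_each(
//     thrust::make_zip_iterator(thrust::make_tuple(clusters_ex.begin(), part_ex.begin())),
//     thrust::make_zip_iterator(thrust::make_tuple(clusters_ex.end(), part_ex.end())),
//     equal_to_i_op<int, double>(1));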
+ */ +#ifndef __EIGEN_SOLVERS_H +#define __EIGEN_SOLVERS_H + +#pragma once + +#include +#include + +namespace cuvs { +namespace spectral { + +// aggregate of control params for Eigen Solver: +// +template +struct eigen_solver_config_t { + size_type_t n_eigVecs; + size_type_t maxIter; + + size_type_t restartIter; + value_type_t tol; + + bool reorthogonalize{false}; + unsigned long long seed{ + 1234567}; // CAVEAT: this default value is now common to all instances of using seed in + // Lanczos; was not the case before: there were places where a default seed = 123456 + // was used; this may trigger slightly different # solver iterations +}; + +template +struct lanczos_solver_t { + explicit lanczos_solver_t( + eigen_solver_config_t const& config) + : config_(config) + { + } + + index_type_t solve_smallest_eigenvectors( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + index_type_t iters{}; + raft::sparse::solver::computeSmallestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); + return iters; + } + + index_type_t solve_largest_eigenvectors( + raft::resources const& handle, + raft::spectral::matrix::sparse_matrix_t const& A, + value_type_t* __restrict__ eigVals, + value_type_t* __restrict__ eigVecs) const + { + RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); + RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); + index_type_t iters{}; + raft::sparse::solver::computeLargestEigenvectors(handle, + A, + config_.n_eigVecs, + config_.maxIter, + config_.restartIter, + config_.tol, + config_.reorthogonalize, + iters, + eigVals, + eigVecs, + config_.seed); + return iters; + } + + auto const& get_config(void) const { return config_; } + + private: + eigen_solver_config_t config_; +}; + +} // namespace spectral +} // namespace cuvs + +#endif diff --git a/cpp/src/sparse/cluster/modularity_maximization.cuh b/cpp/src/sparse/cluster/modularity_maximization.cuh new file mode 100644 index 000000000..71cba6927 --- /dev/null +++ b/cpp/src/sparse/cluster/modularity_maximization.cuh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __MODULARITY_MAXIMIZATION_H +#define __MODULARITY_MAXIMIZATION_H + +#pragma once + +#include "detail/spectral/modularity_maximization.hpp" + +#include + +namespace cuvs { +namespace spectral { + +// ========================================================= +// Spectral modularity_maximization +// ========================================================= + +/** Compute partition for a weighted undirected graph. 
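// A configuration sketch for the Lanczos eigensolver above, mirroring the unit
// tests in this PR (values are illustrative; `A`, `d_eigvals`, `d_eigvecs` are
// hypothetical device-side names):
//
//   cuvs::spectral::eigen_solver_config_t<int, double> eig_cfg{
//     /*n_eigVecs=*/10, /*maxIter=*/100, /*restartIter=*/25, /*tol=*/1.0e-10};
//   cuvs::spectral::lanczos_solver_t<int, double> eig_solver{eig_cfg};
//   int iters = eig_solver.solve_smallest_eigenvectors(handle, A, d_eigvals, d_eigvecs);
//   // solve_largest_eigenvectors has the same signature; both return the
//   // number of Lanczos iterations performed.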
This
+ * partition attempts to maximize the modularity of the graph.
+ *
+ * @param handle raft handle for managing expensive resources
+ * @param csr_m Weighted graph in CSR format
+ * @param eigen_solver Eigensolver implementation
+ * @param cluster_solver Cluster solver implementation
+ * @param clusters (Output, device memory, n entries) Partition
+ * assignments.
+ * @param eigVals Output eigenvalue array pointer on device
+ * @param eigVecs Output eigenvector array pointer on device
+ * @return statistics: number of eigensolver iterations, cluster solver
+ * residual, and number of cluster solver iterations.
+ */
+template
+std::tuple modularity_maximization(
+  raft::resources const& handle,
+  raft::spectral::matrix::sparse_matrix_t const& csr_m,
+  EigenSolver const& eigen_solver,
+  ClusterSolver const& cluster_solver,
+  vertex_t* __restrict__ clusters,
+  weight_t* eigVals,
+  weight_t* eigVecs)
+{
+  return cuvs::spectral::detail::
+    modularity_maximization(
+      handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs);
+}
+// =========================================================
+// Analysis of graph partition
+// =========================================================
+
+/// Compute modularity
+/** This function determines the modularity based on a graph and cluster assignments
+ * @param handle raft handle for managing expensive resources
+ * @param csr_m Weighted graph in CSR format
+ * @param nClusters Number of clusters.
+ * @param clusters (Input, device memory, n entries) Cluster assignments.
+ * @param modularity On exit, modularity
+ */
+template
+void analyzeModularity(raft::resources const& handle,
+                       raft::spectral::matrix::sparse_matrix_t const& csr_m,
+                       vertex_t nClusters,
+                       vertex_t const* __restrict__ clusters,
+                       weight_t& modularity)
+{
+  cuvs::spectral::detail::analyzeModularity(
+    handle, csr_m, nClusters, clusters, modularity);
+}
+
+}  // namespace spectral
+}  // namespace cuvs
+
+#endif
\ No newline at end of file
diff --git a/cpp/src/sparse/cluster/partition.cuh b/cpp/src/sparse/cluster/partition.cuh
new file mode 100644
index 000000000..df78a8a2d
--- /dev/null
+++ b/cpp/src/sparse/cluster/partition.cuh
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __PARTITION_H
+#define __PARTITION_H
+
+#pragma once
+
+#include "detail/spectral/partition.hpp"
+
+#include
+
+namespace cuvs {
+namespace spectral {
+
+// =========================================================
+// Spectral partitioner
+// =========================================================
+
+/// Compute spectral graph partition
+/** Compute partition for a weighted undirected graph.
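// Putting the wrapper above together with the solver sketches shown earlier
// (illustrative only; `matrix`, `n_clusters`, `d_clusters`, `d_eigvals`, and
// `d_eigvecs` are hypothetical, and the solvers' value type must match the
// matrix's weight type):
//
//   auto [eig_iters, km_residual, km_iters] = cuvs::spectral::modularity_maximization(
//     handle, matrix, eig_solver, cluster_solver, d_clusters, d_eigvals, d_eigvecs);
//   float modularity{0};
//   cuvs::spectral::analyzeModularity(handle, matrix, n_clusters, d_clusters, modularity);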
This
+ * partition attempts to minimize the cost function:
+ * Cost = \f$\sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition)
+ *
+ * @param handle raft handle for managing expensive resources
+ * @param csr_m Weighted graph in CSR format
+ * @param eigen_solver Eigensolver implementation
+ * @param cluster_solver Cluster solver implementation
+ * @param clusters (Output, device memory, n entries) Partition
+ * assignments.
+ * @param eigVals Output eigenvalue array pointer on device
+ * @param eigVecs Output eigenvector array pointer on device
+ * @return statistics: number of eigensolver iterations, cluster solver
+ * residual, and number of cluster solver iterations.
+ */
+template
+std::tuple partition(
+  raft::resources const& handle,
+  raft::spectral::matrix::sparse_matrix_t const& csr_m,
+  EigenSolver const& eigen_solver,
+  ClusterSolver const& cluster_solver,
+  vertex_t* __restrict__ clusters,
+  weight_t* eigVals,
+  weight_t* eigVecs)
+{
+  return cuvs::spectral::detail::partition(
+    handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs);
+}
+
+// =========================================================
+// Analysis of graph partition
+// =========================================================
+
+/// Compute cost function for partition
+/** This function determines the edges cut by a partition and a cost
+ * function:
+ * Cost = \f$\sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition)
+ * Graph is assumed to be weighted and undirected.
+ *
+ * @param handle raft handle for managing expensive resources
+ * @param csr_m Weighted graph in CSR format
+ * @param nClusters Number of partitions.
+ * @param clusters (Input, device memory, n entries) Partition
+ * assignments.
+ * @param edgeCut On exit, weight of edges cut by partition.
+ * @param cost On exit, partition cost function.
+ */
+template
+void analyzePartition(raft::resources const& handle,
+                      raft::spectral::matrix::sparse_matrix_t const& csr_m,
+                      vertex_t nClusters,
+                      const vertex_t* __restrict__ clusters,
+                      weight_t& edgeCut,
+                      weight_t& cost)
+{
+  cuvs::spectral::detail::analyzePartition(
+    handle, csr_m, nClusters, clusters, edgeCut, cost);
+}
+
+}  // namespace spectral
+}  // namespace cuvs
+
+#endif
\ No newline at end of file
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 0ecac6ec2..9224e88d8 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -218,6 +218,7 @@ if(BUILD_TESTS)
     distance/dist_l_inf.cu
     distance/dist_lp_unexp.cu
     distance/dist_russell_rao.cu
+    distance/gram.cu
     distance/masked_nn.cu
     distance/sparse_distance.cu
     sparse/neighbors/cross_component_nn.cu
@@ -227,6 +228,11 @@ if(BUILD_TESTS)
     100
   )
 
+  ConfigureTest(
+    NAME SPARSE_TEST PATH sparse/cluster/cluster_solvers.cu sparse/cluster/eigen_solvers.cu
+    sparse/cluster/spectral.cu GPUS 1 PERCENT 100
+  )
+
   ConfigureTest(
     NAME PREPROCESSING_TEST PATH preprocessing/scalar_quantization.cu GPUS 1 PERCENT 100
   )
diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu
new file mode 100644
index 000000000..89b1525ea
--- /dev/null
+++ b/cpp/test/distance/gram.cu
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
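// The equivalent sketch for the min-cut path (same hypothetical names as in
// the modularity sketch above; analyzePartition mirrors the detail call shown
// earlier):
//
//   auto [eig_iters, km_residual, km_iters] = cuvs::spectral::partition(
//     handle, matrix, eig_solver, cluster_solver, d_clusters, d_eigvals, d_eigvecs);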
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" +#include "gram_base.cuh" + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuvs::distance::kernels { + +struct GramMatrixInputs { + int n1; // feature vectors in matrix 1 + int n2; // featuer vectors in matrix 2 + int n_cols; // number of elements in a feature vector + bool is_row_major; + KernelParams kernel; + int ld1; + int ld2; + int ld_out; + // We will generate random input using the dimensions given here. + // The reference output is calculated by a custom kernel. +}; + +std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p) +{ + std::vector kernel_names{"linear", "poly", "rbf", "tanh"}; + os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/" + << (p.is_row_major ? "RowMajor/" : "ColMajor/") << kernel_names[p.kernel.kernel] << "/ld_" + << p.ld1 << "x" << p.ld2 << "x" << p.ld_out; + return os; +} + +const std::vector inputs = { + {42, 137, 2, false, {KernelType::LINEAR}}, + {42, 137, 2, true, {KernelType::LINEAR}}, + {42, 137, 2, false, {KernelType::LINEAR}, 64, 179, 181}, + {42, 137, 2, true, {KernelType::LINEAR}, 64, 179, 181}, + {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}}, + {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, + {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144}, + {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}}, + {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 49}, + {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 143}, + {3, 4, 2, false, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, false, {KernelType::RBF, 0, 0.5}}, + {42, 137, 2, true, {KernelType::RBF, 0, 0.5}}, + // Distance kernel does not support LD parameter yet. + //{42, 137, 2, false, {KernelType::RBF, 0, 0.5}, 64, 155, 49}, + // {42, 137, 2, true, {KernelType::RBF, 0, 0.5}, 64, 155, 143}, +}; + +template +class GramMatrixTest : public ::testing::TestWithParam { + protected: + GramMatrixTest() + : params(GetParam()), + handle(), + x1(0, raft::resource::get_cuda_stream(handle)), + x2(0, raft::resource::get_cuda_stream(handle)), + gram(0, raft::resource::get_cuda_stream(handle)), + gram_host(0) + { + auto stream = raft::resource::get_cuda_stream(handle); + + if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; } + if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; } + if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; } + // Derive the size of the output from the offset of the last element. 
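// e.g. for column-major data with n1 = 3, n_cols = 2, and a padded ld1 = 4,
// the last element [2, 1] sits at offset 2 + 1 * 4 = 6, so 7 elements are
// allocated rather than n1 * n_cols = 6.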
+ size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1; + x1.resize(size, stream); + size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1; + x2.resize(size, stream); + size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1; + + gram.resize(size, stream); + RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream)); + gram_host.resize(gram.size()); + std::fill(gram_host.begin(), gram_host.end(), 0); + + raft::random::RngState rng(42137ULL); + raft::random::uniform(handle, rng, x1.data(), x1.size(), math_t(0), math_t(1)); + raft::random::uniform(handle, rng, x2.data(), x2.size(), math_t(0), math_t(1)); + } + + ~GramMatrixTest() override {} + + void runTest() + { + std::unique_ptr> kernel = + std::unique_ptr>(KernelFactory::create(params.kernel)); + + auto x1_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1) + : raft::make_device_strided_matrix_view( + x1.data(), params.n1, params.n_cols, params.ld1); + auto x2_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2) + : raft::make_device_strided_matrix_view( + x2.data(), params.n2, params.n_cols, params.ld2); + auto out_span = + params.is_row_major + ? raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out) + : raft::make_device_strided_matrix_view( + gram.data(), params.n1, params.n2, params.ld_out); + + (*kernel)(handle, x1_span, x2_span, out_span); + + auto stream = raft::resource::get_cuda_stream(handle); + naiveGramMatrixKernel(params.n1, + params.n2, + params.n_cols, + x1, + x2, + gram_host.data(), + params.ld1, + params.ld2, + params.ld_out, + params.is_row_major, + params.kernel, + stream, + handle); + + ASSERT_TRUE(cuvs::devArrMatchHost( + gram_host.data(), gram.data(), gram.size(), cuvs::CompareApprox(1e-6f), stream)); + } + + GramMatrixInputs params; + raft::resources handle; + + rmm::device_uvector x1; + rmm::device_uvector x2; + rmm::device_uvector gram; + + std::vector gram_host; +}; + +typedef GramMatrixTest GramMatrixTestFloat; +typedef GramMatrixTest GramMatrixTestDouble; + +TEST_P(GramMatrixTestFloat, Gram) { runTest(); } + +INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloat, ::testing::ValuesIn(inputs)); +}; // namespace cuvs::distance::kernels \ No newline at end of file diff --git a/cpp/test/distance/gram_base.cuh b/cpp/test/distance/gram_base.cuh new file mode 100644 index 000000000..326cdb4f8 --- /dev/null +++ b/cpp/test/distance/gram_base.cuh @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace distance { +namespace kernels { + +// Get the offset of element [i,k]. 
+HDI int get_offset(int i, int k, int ld, bool is_row_major) +{ + return is_row_major ? i * ld + k : i + k * ld; +} + +// Calculate the Gram matrix on the host. +template +void naiveGramMatrixKernel(int n1, + int n2, + int n_cols, + const rmm::device_uvector& x1, + const rmm::device_uvector& x2, + math_t* gram_host, + int ld1, + int ld2, + int ld_out, + bool is_row_major, + KernelParams kernel, + cudaStream_t stream, + const raft::resources& handle) +{ + std::vector x1_host(x1.size()); + raft::update_host(x1_host.data(), x1.data(), x1.size(), stream); + std::vector x2_host(x2.size()); + raft::update_host(x2_host.data(), x2.data(), x2.size(), stream); + raft::resource::sync_stream(handle, stream); + + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + float d = 0; + for (int k = 0; k < n_cols; k++) { + if (kernel.kernel == KernelType::RBF) { + math_t diff = x1_host[get_offset(i, k, ld1, is_row_major)] - + x2_host[get_offset(j, k, ld2, is_row_major)]; + d += diff * diff; + } else { + d += x1_host[get_offset(i, k, ld1, is_row_major)] * + x2_host[get_offset(j, k, ld2, is_row_major)]; + } + } + int idx = get_offset(i, j, ld_out, is_row_major); + math_t v = 0; + switch (kernel.kernel) { + case (KernelType::LINEAR): gram_host[idx] = d; break; + case (KernelType::POLYNOMIAL): + v = kernel.gamma * d + kernel.coef0; + gram_host[idx] = std::pow(v, kernel.degree); + break; + case (KernelType::TANH): gram_host[idx] = std::tanh(kernel.gamma * d + kernel.coef0); break; + case (KernelType::RBF): gram_host[idx] = exp(-kernel.gamma * d); break; + } + } + } +} + +} // namespace kernels +} // namespace distance +} // namespace cuvs \ No newline at end of file diff --git a/cpp/test/sparse/cluster/cluster_solvers.cu b/cpp/test/sparse/cluster/cluster_solvers.cu new file mode 100644 index 000000000..c0b6c1a78 --- /dev/null +++ b/cpp/test/sparse/cluster/cluster_solvers.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
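// Spot-check of the RBF branch above (host-side, a sketch): for rows
// x1 = {1, 2} and x2 = {2, 4} with gamma = 0.5,
//
//   double d     = (1.0 - 2.0) * (1.0 - 2.0) + (2.0 - 4.0) * (2.0 - 4.0);  // 5.0
//   double k_rbf = std::exp(-0.5 * d);                                     // exp(-2.5) ≈ 0.0821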
+ */ + +#include "../../../src/sparse/cluster/cluster_solvers.cuh" +#include "../../../src/sparse/cluster/eigen_solvers.cuh" +#include "../../../src/sparse/cluster/modularity_maximization.cuh" +#include +#include +#include + +#include + +#include +#include + +namespace cuvs { +namespace spectral { + +TEST(Raft, ClusterSolvers) +{ + using namespace raft::spectral::matrix; + using index_type = int; + using value_type = double; + + raft::resources h; + + index_type maxiter{100}; + value_type tol{1.0e-10}; + unsigned long long seed{100110021003}; + + auto stream = raft::resource::get_cuda_stream(h); + + index_type n{100}; + index_type d{10}; + index_type k{5}; + + // nullptr expected to trigger exceptions: + // + value_type* eigvecs{nullptr}; + index_type* codes{nullptr}; + + cluster_solver_config_t cfg{k, maxiter, tol, seed}; + + kmeans_solver_t cluster_solver{cfg}; + + EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes)); +} + +TEST(Raft, ModularitySolvers) +{ + using namespace raft::spectral::matrix; + using index_type = int; + using value_type = double; + + raft::resources h; + ASSERT_EQ(0, raft::resource::get_device_id(h)); + + index_type neigvs{10}; + index_type maxiter{100}; + index_type restart_iter{10}; + value_type tol{1.0e-10}; + bool reorthog{true}; + + // nullptr expected to trigger exceptions: + // + index_type* clusters{nullptr}; + value_type* eigvals{nullptr}; + value_type* eigvecs{nullptr}; + + unsigned long long seed{100110021003}; + + eigen_solver_config_t eig_cfg{ + neigvs, maxiter, restart_iter, tol, reorthog, seed}; + lanczos_solver_t eig_solver{eig_cfg}; + + index_type k{5}; + + cluster_solver_config_t clust_cfg{k, maxiter, tol, seed}; + kmeans_solver_t cluster_solver{clust_cfg}; + + auto stream = raft::resource::get_cuda_stream(h); + sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; + + EXPECT_ANY_THROW(cuvs::spectral::modularity_maximization( + h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); + + value_type modularity{0}; + EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity)); +} + +} // namespace spectral +} // namespace cuvs diff --git a/cpp/test/sparse/cluster/eigen_solvers.cu b/cpp/test/sparse/cluster/eigen_solvers.cu new file mode 100644 index 000000000..8de0b49e7 --- /dev/null +++ b/cpp/test/sparse/cluster/eigen_solvers.cu @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "../../../src/sparse/cluster/eigen_solvers.cuh"
+#include "../../../src/sparse/cluster/partition.cuh"
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace cuvs {
+namespace spectral {
+
+TEST(Raft, EigenSolvers)
+{
+  raft::common::nvtx::range fun_scope("test::EigenSolvers");
+  using namespace raft::spectral::matrix;
+  using index_type = int;
+  using value_type = double;
+
+  raft::resources h;
+  ASSERT_EQ(0, raft::resource::get_device_id(h));
+
+  index_type* ro{nullptr};
+  index_type* ci{nullptr};
+  value_type* vs{nullptr};
+  index_type nnz   = 0;
+  index_type nrows = 0;
+
+  sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz};
+  ASSERT_EQ(nullptr, sm1.row_offsets_);
+
+  index_type neigvs{10};
+  index_type maxiter{100};
+  index_type restart_iter{10};
+  value_type tol{1.0e-10};
+  bool reorthog{true};
+
+  // nullptr expected to trigger exceptions:
+  //
+  value_type* eigvals{nullptr};
+  value_type* eigvecs{nullptr};
+  std::uint64_t seed{100110021003};
+
+  eigen_solver_config_t cfg{
+    neigvs, maxiter, restart_iter, tol, reorthog, seed};
+
+  lanczos_solver_t eig_solver{cfg};
+
+  EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs));
+
+  EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs));
+}
+
+TEST(Raft, SpectralSolvers)
+{
+  raft::common::nvtx::range fun_scope("test::SpectralSolvers");
+  using namespace raft::spectral::matrix;
+  using index_type = int;
+  using value_type = double;
+
+  raft::resources h;
+  ASSERT_EQ(0, raft::resource::get_device_id(h));
+
+  index_type neigvs{10};
+  index_type maxiter{100};
+  index_type restart_iter{10};
+  value_type tol{1.0e-10};
+  bool reorthog{true};
+
+  // nullptr expected to trigger exceptions:
+  //
+  index_type* clusters{nullptr};
+  value_type* eigvals{nullptr};
+  value_type* eigvecs{nullptr};
+
+  unsigned long long seed{100110021003};
+
+  eigen_solver_config_t eig_cfg{
+    neigvs, maxiter, restart_iter, tol, reorthog, seed};
+  lanczos_solver_t eig_solver{eig_cfg};
+
+  index_type k{5};
+
+  cluster_solver_config_t clust_cfg{k, maxiter, tol, seed};
+  kmeans_solver_t cluster_solver{clust_cfg};
+
+  sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0};
+  EXPECT_ANY_THROW(
+    spectral::partition(h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
+
+  value_type edgeCut{0};
+  value_type cost{0};
+  EXPECT_ANY_THROW(spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost));
+}
+
+}  // namespace spectral
+}  // namespace cuvs
diff --git a/cpp/test/sparse/cluster/spectral.cu b/cpp/test/sparse/cluster/spectral.cu
new file mode 100644
index 000000000..7d0cdef9d
--- /dev/null
+++ b/cpp/test/sparse/cluster/spectral.cu
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "../../test_utils.cuh" + +#include "../../../src/sparse/cluster/modularity_maximization.cuh" +#include "../../../src/sparse/cluster/partition.cuh" +#include + +#include + +#include +#include + +namespace cuvs { +namespace cluster { + +/** + * Warning: There appears to be a CUDA 12.2 bug in cusparse that causes an + * alignment issue. We've fixed the bug in our code through a workaround + * (see raft/sparse/linalg/spmm.hpp for fix). This test is meant to fail + * in the case where the fix is accidentally reverted, so that it doesn't + * break any downstream libraries that depend on RAFT + */ +TEST(Raft, Spectral) +{ + raft::handle_t handle; + + std::vector h_offsets({0, 2, 4, 7, 10, 12, 14}); + std::vector h_indices({1, 2, 0, 2, 0, 1, 3, 2, 4, 5, 3, 5, 3, 4}); + std::vector h_values( + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + std::vector expected_clustering({1, 1, 1, 0, 0, 0}); + + int32_t n_clusters{2}; + int32_t n_eigenvectors{2}; + int32_t evs_max_it{100}; + int32_t kmean_max_it{100}; + int32_t restartIter_lanczos = 15 + n_eigenvectors; + float evs_tol{0.001}; + float kmean_tol{0.001}; + unsigned long long seed1{1234567}; + unsigned long long seed2{12345678}; + bool reorthog{false}; + + rmm::device_uvector offsets(h_offsets.size(), handle.get_stream()); + rmm::device_uvector indices(h_indices.size(), handle.get_stream()); + rmm::device_uvector values(h_indices.size(), handle.get_stream()); + rmm::device_uvector clustering(expected_clustering.size(), handle.get_stream()); + rmm::device_uvector eigenvalues(n_eigenvectors, handle.get_stream()); + rmm::device_uvector eigenvectors(n_eigenvectors * expected_clustering.size(), + handle.get_stream()); + + rmm::device_uvector exp_dev(expected_clustering.size(), handle.get_stream()); + + raft::update_device( + exp_dev.data(), expected_clustering.data(), expected_clustering.size(), handle.get_stream()); + + raft::update_device(offsets.data(), h_offsets.data(), h_offsets.size(), handle.get_stream()); + raft::update_device(indices.data(), h_indices.data(), h_indices.size(), handle.get_stream()); + raft::update_device(values.data(), h_values.data(), h_values.size(), handle.get_stream()); + + raft::spectral::matrix::sparse_matrix_t const matrix{ + handle, + offsets.data(), + indices.data(), + values.data(), + static_cast(offsets.size() - 1), + static_cast(indices.size())}; + + cuvs::spectral::eigen_solver_config_t eig_cfg{ + n_eigenvectors, evs_max_it, restartIter_lanczos, evs_tol, reorthog, seed1}; + cuvs::spectral::lanczos_solver_t eig_solver{eig_cfg}; + + cuvs::spectral::cluster_solver_config_t clust_cfg{ + n_clusters, kmean_max_it, kmean_tol, seed2}; + cuvs::spectral::kmeans_solver_t cluster_solver{clust_cfg}; + + cuvs::spectral::partition(handle, + matrix, + eig_solver, + cluster_solver, + clustering.data(), + eigenvalues.data(), + eigenvectors.data()); + + ASSERT_TRUE(devArrMatch(expected_clustering.data(), + exp_dev.data(), + exp_dev.size(), + 1, + cuvs::Compare(), + handle.get_stream())); +} + +} // namespace cluster +} // namespace cuvs \ No newline at end of file diff --git a/cpp/test/sparse/cluster/spectral_matrix.cu b/cpp/test/sparse/cluster/spectral_matrix.cu new file mode 100644 index 000000000..37a4202b8 --- /dev/null +++ b/cpp/test/sparse/cluster/spectral_matrix.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
diff --git a/cpp/test/sparse/cluster/spectral_matrix.cu b/cpp/test/sparse/cluster/spectral_matrix.cu
new file mode 100644
index 000000000..37a4202b8
--- /dev/null
+++ b/cpp/test/sparse/cluster/spectral_matrix.cu
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+namespace cuvs {
+namespace spectral {
+namespace matrix {
+namespace {
+template
+struct csr_view_t {
+  index_type* offsets;
+  index_type* indices;
+  value_type* edge_data;
+  index_type number_of_vertices;
+  index_type number_of_edges;
+};
+} // namespace
+TEST(Raft, SpectralMatrices)
+{
+  using index_type = int;
+  using value_type = double;
+  using namespace raft::spectral::matrix;
+
+  raft::resources h;
+  ASSERT_EQ(0, raft::resource::get_device_id(h));
+
+  csr_view_t csr_v{nullptr, nullptr, nullptr, 0, 0};
+
+  int const sz = 10;
+  vector_t d_v{h, sz};
+
+  index_type* ro{nullptr};
+  index_type* ci{nullptr};
+  value_type* vs{nullptr};
+  index_type nnz = 0;
+  index_type nrows = 0;
+  sparse_matrix_t sm1{h, ro, ci, vs, nrows, nnz};
+  sparse_matrix_t sm2{h, csr_v};
+  ASSERT_EQ(nullptr, sm1.row_offsets_);
+  ASSERT_EQ(nullptr, sm2.row_offsets_);
+
+  auto stream = raft::resource::get_cuda_stream(h);
+
+  auto cnstr_lm1 = [&h, ro, ci, vs, nrows, nnz](void) {
+    laplacian_matrix_t lm1{h, ro, ci, vs, nrows, nnz};
+  };
+  EXPECT_ANY_THROW(cnstr_lm1()); // because of nullptr ptr args
+
+  auto cnstr_lm2 = [&h, &sm2](void) { laplacian_matrix_t lm2{h, sm2}; };
+  EXPECT_ANY_THROW(cnstr_lm2()); // because of nullptr ptr args
+
+  auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) {
+    modularity_matrix_t mm1{h, ro, ci, vs, nrows, nnz};
+  };
+  EXPECT_ANY_THROW(cnstr_mm1()); // because of nullptr ptr args
+
+  auto cnstr_mm2 = [&h, &sm2](void) { modularity_matrix_t mm2{h, sm2}; };
+  EXPECT_ANY_THROW(cnstr_mm2()); // because of nullptr ptr args
+}
+
+} // namespace matrix
+} // namespace spectral
+} // namespace cuvs
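
The constructors above are expected to throw because building the wrappers
touches the (here nullptr) CSR arrays, e.g. to accumulate the degree vector.
With a populated matrix the same constructors succeed; a minimal sketch,
assuming `sm` wraps a valid CSR adjacency matrix:

  raft::spectral::matrix::laplacian_matrix_t<int, double> lm{h, sm};   // L = D - A
  raft::spectral::matrix::modularity_matrix_t<int, double> mm{h, sm};  // B = A - d d^T / (2m)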
diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu
new file mode 100644
index 000000000..d7af30a1c
--- /dev/null
+++ b/cpp/test/sparse/gram.cu
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include "../distance/gram_base.cuh"
+#include "../test_utils.cuh"
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include
+#include
+
+namespace cuvs::distance::kernels::sparse {
+
+/**
+ * Structure describing the sparsity of the input matrices:
+ * - DENSE: dense, dense
+ * - MIX: CSR, dense
+ * - CSR: CSR, CSR
+ */
+enum SparseType { DENSE, MIX, CSR };
+
+struct GramMatrixInputs {
+  int n1;      // feature vectors in matrix 1
+  int n2;      // feature vectors in matrix 2
+  int n_cols;  // number of elements in a feature vector
+  bool is_row_major;
+  SparseType sparse_input;
+  KernelParams kernel;
+  int ld1;
+  int ld2;
+  int ld_out;
+  // We will generate random input using the dimensions given here.
+  // The reference output is calculated by a custom kernel.
+};
+
+std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p)
+{
+  std::vector kernel_names{"linear", "poly", "rbf", "tanh"};
+  os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/"
+     << (p.is_row_major ? "RowMajor/" : "ColMajor/")
+     << (p.sparse_input == SparseType::DENSE
+           ? "DenseDense/"
+           : (p.sparse_input == SparseType::MIX ? "CsrDense/" : "CsrCsr/"))
+     << kernel_names[p.kernel.kernel] << "/ld_" << p.ld1 << "x" << p.ld2 << "x" << p.ld_out;
+  return os;
+}
+
+/*struct KernelParams {
+  // Kernel function parameters
+  KernelType kernel;  //!< Type of the kernel function
+  int degree;         //!< Degree of polynomial kernel (ignored by others)
+  double gamma;       //!< multiplier in the
+  double coef0;       //!< additive constant in poly and tanh kernels
+};*/
+
+// const KernelParams linear_kernel_params{.kernel=KernelType::LINEAR};
+
+// {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, {KernelType::TANH, 0, 0.5, 2.4}, {KernelType::RBF, 0, 0.5}
+const std::vector inputs = raft::util::itertools::product(
+  {42},
+  {137},
+  {2},
+  {true, false},
+  {SparseType::DENSE, SparseType::MIX, SparseType::CSR},
+  {KernelParams{KernelType::LINEAR},
+   KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4},
+   KernelParams{KernelType::TANH, 0, 0.5, 2.4},
+   KernelParams{KernelType::RBF, 0, 0.5}});
+
+// (ld_1, ld_2, ld_out) not supported by RBF and CSR
+const std::vector inputs_ld = raft::util::itertools::product(
+  {137},
+  {42},
+  {2},
+  {true, false},
+  {SparseType::DENSE, SparseType::MIX},
+  {KernelParams{KernelType::LINEAR},
+   KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4},
+   KernelParams{KernelType::TANH, 0, 0.5, 2.4}},
+  {159},
+  {73},
+  {144});
+
+// (ld_1, ld_2) are supported by CSR
+const std::vector inputs_ld_csr =
+  raft::util::itertools::product(
+    {42},
+    {137},
+    {2},
+    {true, false},
+    {SparseType::CSR, SparseType::MIX},
+    {KernelParams{KernelType::LINEAR},
+     KernelParams{KernelType::POLYNOMIAL, 2, 0.5, 2.4},
+     KernelParams{KernelType::TANH, 0, 0.5, 2.4}},
+    {64},
+    {155},
+    {0});
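+
+// With itertools::product, each brace-enclosed list above contributes one axis
+// of the parameter grid: `inputs` expands to 2 * 3 * 4 = 24 cases, while
+// `inputs_ld` and `inputs_ld_csr` each expand to 2 * 2 * 3 = 12 cases with
+// explicit leading dimensions.
+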
+template
+class GramMatrixTest : public ::testing::TestWithParam {
+ protected:
+  GramMatrixTest()
+    : params(GetParam()),
+      stream(raft::resource::get_cuda_stream(handle)),
+      x1(0, stream),
+      x2(0, stream),
+      x1_csr_indptr(0, stream),
+      x1_csr_indices(0, stream),
+      x1_csr_data(0, stream),
+      x2_csr_indptr(0, stream),
+      x2_csr_indices(0, stream),
+      x2_csr_data(0, stream),
+      gram(0, stream),
+      gram_host(0)
+  {
+    if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; }
+    if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; }
+    if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; }
+    // Derive each buffer size from the offset of its last element.
+    size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1;
+    x1.resize(size, stream);
+    size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1;
+    x2.resize(size, stream);
+    size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1;
+
+    gram.resize(size, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream));
+    gram_host.resize(gram.size());
+    std::fill(gram_host.begin(), gram_host.end(), 0);
+
+    raft::random::RngState r(42137ULL);
+    raft::random::uniform(handle, r, x1.data(), x1.size(), math_t(0), math_t(1));
+    raft::random::uniform(handle, r, x2.data(), x2.size(), math_t(0), math_t(1));
+
+    RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
+  }
+
+  ~GramMatrixTest() override {}
+
+  int prepareCsr(math_t* dense, int n_rows, int ld, int* indptr, int* indices, math_t* data)
+  {
+    int nnz = 0;
+    double eps = 1e-6;
+    int n_cols = params.n_cols;
+    bool is_row_major = params.is_row_major;
+    size_t dense_size = get_offset(n_rows - 1, n_cols - 1, ld, is_row_major) + 1;
+
+    std::vector dense_host(dense_size);
+    raft::update_host(dense_host.data(), dense, dense_size, stream);
+    raft::resource::sync_stream(handle, stream);
+
+    std::vector indptr_host(n_rows + 1);
+    std::vector indices_host(n_rows * n_cols);
+    std::vector data_host(n_rows * n_cols);
+
+    // create csr matrix from dense (with threshold)
+    for (int i = 0; i < n_rows; ++i) {
+      indptr_host[i] = nnz;
+      for (int j = 0; j < n_cols; ++j) {
+        math_t value = dense_host[get_offset(i, j, ld, is_row_major)];
+        if (value > eps) {
+          indices_host[nnz] = j;
+          data_host[nnz] = value;
+          nnz++;
+        }
+      }
+    }
+    indptr_host[n_rows] = nnz;
+
+    // fill back dense matrix from CSR
+    std::fill(dense_host.data(), dense_host.data() + dense_size, 0);
+    for (int i = 0; i < n_rows; ++i) {
+      for (int idx = indptr_host[i]; idx < indptr_host[i + 1]; ++idx) {
+        dense_host[get_offset(i, indices_host[idx], ld, is_row_major)] = data_host[idx];
+      }
+    }
+
+    raft::update_device(dense, dense_host.data(), dense_size, stream);
+    raft::update_device(indptr, indptr_host.data(), n_rows + 1, stream);
+    raft::update_device(indices, indices_host.data(), nnz, stream);
+    raft::update_device(data, data_host.data(), nnz, stream);
+    raft::resource::sync_stream(handle, stream);
+    return nnz;
+  }
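+
+  // Note: prepareCsr() drops values <= eps from the CSR arrays *and* zeroes
+  // them in the dense copy it writes back, so the dense and CSR code paths in
+  // runTest() below operate on exactly the same matrix. get_offset() (from
+  // gram_base.cuh) maps (row, col, ld) to the strided linear index.
+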
+  void runTest()
+  {
+    std::unique_ptr> kernel =
+      std::unique_ptr>(KernelFactory::create(params.kernel));
+
+    auto x1_span =
+      params.is_row_major
+        ? raft::make_device_strided_matrix_view(
+            x1.data(), params.n1, params.n_cols, params.ld1)
+        : raft::make_device_strided_matrix_view(
+            x1.data(), params.n1, params.n_cols, params.ld1);
+    auto x2_span =
+      params.is_row_major
+        ? raft::make_device_strided_matrix_view(
+            x2.data(), params.n2, params.n_cols, params.ld2)
+        : raft::make_device_strided_matrix_view(
+            x2.data(), params.n2, params.n_cols, params.ld2);
+    auto out_span =
+      params.is_row_major
+        ? raft::make_device_strided_matrix_view(
+            gram.data(), params.n1, params.n2, params.ld_out)
+        : raft::make_device_strided_matrix_view(
+            gram.data(), params.n1, params.n2, params.ld_out);
+
+    if (params.sparse_input == SparseType::DENSE) {
+      (*kernel)(handle, x1_span, x2_span, out_span);
+    } else {
+      x1_csr_indptr.reserve(params.n1 + 1, stream);
+      x1_csr_indices.reserve(params.n1 * params.n_cols, stream);
+      x1_csr_data.reserve(params.n1 * params.n_cols, stream);
+      int x1_nnz = prepareCsr(x1.data(),
+                              params.n1,
+                              params.ld1,
+                              x1_csr_indptr.data(),
+                              x1_csr_indices.data(),
+                              x1_csr_data.data());
+
+      auto x1_csr_structure = raft::make_device_compressed_structure_view(
+        x1_csr_indptr.data(), x1_csr_indices.data(), params.n1, params.n_cols, x1_nnz);
+      auto x1_csr = raft::device_csr_matrix_view(
+        raft::device_span(x1_csr_data.data(), x1_csr_structure.get_nnz()),
+        x1_csr_structure);
+
+      if (params.sparse_input == SparseType::MIX) {
+        (*kernel)(handle, x1_csr, x2_span, out_span);
+      } else {
+        x2_csr_indptr.reserve(params.n2 + 1, stream);
+        x2_csr_indices.reserve(params.n2 * params.n_cols, stream);
+        x2_csr_data.reserve(params.n2 * params.n_cols, stream);
+        int x2_nnz = prepareCsr(x2.data(),
+                                params.n2,
+                                params.ld2,
+                                x2_csr_indptr.data(),
+                                x2_csr_indices.data(),
+                                x2_csr_data.data());
+
+        auto x2_csr_structure = raft::make_device_compressed_structure_view(
+          x2_csr_indptr.data(), x2_csr_indices.data(), params.n2, params.n_cols, x2_nnz);
+        auto x2_csr = raft::device_csr_matrix_view(
+          raft::device_span(x2_csr_data.data(), x2_csr_structure.get_nnz()),
+          x2_csr_structure);
+
+        (*kernel)(handle, x1_csr, x2_csr, out_span);
+      }
+    }
+    // Some of the gram computation may run on a stream other than 'stream',
+    // so a full device sync is required.
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    naiveGramMatrixKernel(params.n1,
+                          params.n2,
+                          params.n_cols,
+                          x1,
+                          x2,
+                          gram_host.data(),
+                          params.ld1,
+                          params.ld2,
+                          params.ld_out,
+                          params.is_row_major,
+                          params.kernel,
+                          stream,
+                          handle);
+    raft::resource::sync_stream(handle, stream);
+
+    ASSERT_TRUE(cuvs::devArrMatchHost(
+      gram_host.data(), gram.data(), gram.size(), cuvs::CompareApprox(1e-6f), stream));
+  }
+
+  raft::resources handle;
+  cudaStream_t stream = 0;
+  GramMatrixInputs params;
+
+  rmm::device_uvector x1;
+  rmm::device_uvector x2;
+
+  rmm::device_uvector x1_csr_indptr;
+  rmm::device_uvector x1_csr_indices;
+  rmm::device_uvector x1_csr_data;
+  rmm::device_uvector x2_csr_indptr;
+  rmm::device_uvector x2_csr_indices;
+  rmm::device_uvector x2_csr_data;
+
+  rmm::device_uvector gram;
+  std::vector gram_host;
+};
+
+typedef GramMatrixTest GramMatrixTestFloatStandard;
+typedef GramMatrixTest GramMatrixTestFloatLd;
+typedef GramMatrixTest GramMatrixTestFloatLdCsr;
+
+TEST_P(GramMatrixTestFloatStandard, Gram) { runTest(); }
+TEST_P(GramMatrixTestFloatLd, Gram) { runTest(); }
+TEST_P(GramMatrixTestFloatLdCsr, Gram) { runTest(); }
+
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloatStandard, ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloatLd, ::testing::ValuesIn(inputs_ld));
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests,
+                         GramMatrixTestFloatLdCsr,
+                         ::testing::ValuesIn(inputs_ld_csr));
+} // namespace cuvs::distance::kernels::sparse
\ No newline at end of file
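
Taken together, the tests drive the gram API the way downstream code would; a
minimal dense-only sketch, assuming the factory and base class names
(`KernelFactory`, `GramMatrixBase`) carry over from the RAFT originals, with
hypothetical device buffers `d_x1` (n1 x n_cols), `d_x2` (n2 x n_cols) and
`d_out` (n1 x n2), all row-major:

  raft::resources handle;
  auto kernel = std::unique_ptr<GramMatrixBase<float>>(
    KernelFactory<float>::create(KernelParams{KernelType::RBF, 0, 0.5}));
  auto x1 = raft::make_device_strided_matrix_view<const float, int, raft::layout_c_contiguous>(
    d_x1, n1, n_cols, n_cols);
  auto x2 = raft::make_device_strided_matrix_view<const float, int, raft::layout_c_contiguous>(
    d_x2, n2, n_cols, n_cols);
  auto out = raft::make_device_strided_matrix_view<float, int, raft::layout_c_contiguous>(
    d_out, n1, n2, n2);
  (*kernel)(handle, x1, x2, out);  // out(i, j) = k(x1_i, x2_j)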