diff --git a/build.sh b/build.sh index c4b7a7bf7..ba71e5f93 100755 --- a/build.sh +++ b/build.sh @@ -24,7 +24,7 @@ HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool= -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace cuvs::cluster::detail { -template -class UnionFind { - public: - value_idx next_label; - std::vector parent; - std::vector size; - - value_idx n_indices; - - UnionFind(value_idx N_) - : n_indices(2 * N_ - 1), parent(2 * N_ - 1, -1), size(2 * N_ - 1, 1), next_label(N_) - { - memset(size.data() + N_, 0, (size.size() - N_) * sizeof(value_idx)); - } - - value_idx find(value_idx n) - { - value_idx p; - p = n; - - while (parent[n] != -1) - n = parent[n]; - - // path compression - while (parent[p] != n) { - p = parent[p == -1 ? n_indices - 1 : p]; - parent[p == -1 ? n_indices - 1 : p] = n; - } - return n; - } - - void perform_union(value_idx m, value_idx n) - { - size[next_label] = size[m] + size[n]; - parent[m] = next_label; - parent[n] = next_label; - - next_label += 1; - } -}; - -/** - * Agglomerative labeling on host. This has not been found to be a bottleneck - * in the algorithm. A parallel version of this can be done using a parallel - * variant of Kruskal's MST algorithm - * (ref http://cucis.ece.northwestern.edu/publications/pdf/HenPat12.pdf), - * which breaks apart the sorted MST results into overlapping subsets and - * independently runs Kruskal's algorithm on each subset, merging them back - * together into a single hierarchy when complete. Unfortunately, - * this is nontrivial and the speedup wouldn't be useful until this - * becomes a bottleneck. - * - * @tparam value_idx - * @tparam value_t - * @param[in] handle the raft handle - * @param[in] rows src edges of the sorted MST - * @param[in] cols dst edges of the sorted MST - * @param[in] nnz the number of edges in the sorted MST - * @param[out] out_src parents of output - * @param[out] out_dst children of output - * @param[out] out_delta distances of output - * @param[out] out_size cluster sizes of output - */ -template -void build_dendrogram_host(raft::resources const& handle, - const value_idx* rows, - const value_idx* cols, - const value_t* data, - size_t nnz, - value_idx* children, - value_t* out_delta, - value_idx* out_size) -{ - auto stream = resource::get_cuda_stream(handle); - - value_idx n_edges = nnz; - - std::vector mst_src_h(n_edges); - std::vector mst_dst_h(n_edges); - std::vector mst_weights_h(n_edges); - - update_host(mst_src_h.data(), rows, n_edges, stream); - update_host(mst_dst_h.data(), cols, n_edges, stream); - update_host(mst_weights_h.data(), data, n_edges, stream); - - resource::sync_stream(handle, stream); - - std::vector children_h(n_edges * 2); - std::vector out_size_h(n_edges); - std::vector out_delta_h(n_edges); - - UnionFind U(nnz + 1); - - for (std::size_t i = 0; i < nnz; i++) { - value_idx a = mst_src_h[i]; - value_idx b = mst_dst_h[i]; - value_t delta = mst_weights_h[i]; - - value_idx aa = U.find(a); - value_idx bb = U.find(b); - - value_idx children_idx = i * 2; - - children_h[children_idx] = aa; - children_h[children_idx + 1] = bb; - out_delta_h[i] = delta; - out_size_h[i] = U.size[aa] + U.size[bb]; - - U.perform_union(aa, bb); - } - - raft::update_device(children, children_h.data(), n_edges * 2, stream); - raft::update_device(out_size, out_size_h.data(), n_edges, stream); - raft::update_device(out_delta, out_delta_h.data(), n_edges, stream); -} - -template -RAFT_KERNEL write_levels_kernel(const value_idx* children, value_idx* parents, value_idx n_vertices) -{ - value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; - if (tid < n_vertices) { - value_idx level = tid / 2; - value_idx child = children[tid]; - parents[child] = level; - } -} - -/** - * Instead of propagating a label from roots to children, - * the children each iterate up the tree until they find - * the label of their parent. This increases the potential - * parallelism. - * @tparam value_idx - * @param children - * @param parents - * @param n_leaves - * @param labels - */ -template -RAFT_KERNEL inherit_labels(const value_idx* children, - const value_idx* levels, - std::size_t n_leaves, - value_idx* labels, - int cut_level, - value_idx n_vertices) -{ - value_idx tid = blockDim.x * blockIdx.x + threadIdx.x; - - if (tid < n_vertices) { - value_idx node = children[tid]; - value_idx cur_level = tid / 2; - - /** - * Any roots above the cut level should be ignored. - * Any leaves at the cut level should already be labeled - */ - if (cur_level > cut_level) return; - - value_idx cur_parent = node; - value_idx label = labels[cur_parent]; - - while (label == -1) { - cur_parent = cur_level + n_leaves; - cur_level = levels[cur_parent]; - label = labels[cur_parent]; - } - - labels[node] = label; - } -} - -template -struct init_label_roots { - init_label_roots(value_idx* labels_) : labels(labels_) {} - - template - __host__ __device__ void operator()(Tuple t) - { - labels[thrust::get<1>(t)] = thrust::get<0>(t); - } - - private: - value_idx* labels; -}; - -/** - * Cuts the dendrogram at a particular level where the number of nodes - * is equal to n_clusters, then propagates the resulting labels - * to all the children. - * - * @tparam value_idx - * @param handle - * @param labels - * @param children - * @param n_clusters - * @param n_leaves - */ -template -void extract_flattened_clusters(raft::resources const& handle, - value_idx* labels, - const value_idx* children, - size_t n_clusters, - size_t n_leaves) -{ - auto stream = resource::get_cuda_stream(handle); - auto thrust_policy = resource::get_thrust_policy(handle); - - // Handle special case where n_clusters == 1 - if (n_clusters == 1) { - thrust::fill(thrust_policy, labels, labels + n_leaves, 0); - } else { - /** - * Compute levels for each node - * - * 1. Initialize "levels" array of size n_leaves * 2 - * - * 2. For each entry in children, write parent - * out for each of the children - */ - - auto n_edges = (n_leaves - 1) * 2; - - thrust::device_ptr d_ptr = thrust::device_pointer_cast(children); - value_idx n_vertices = *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1; - - // Prevent potential infinite loop from labeling disconnected - // connectivities graph. - RAFT_EXPECTS(n_leaves > 0, "n_leaves must be positive"); - RAFT_EXPECTS( - static_cast(n_vertices) == static_cast((n_leaves - 1) * 2), - "Multiple components found in MST or MST is invalid. " - "Cannot find single-linkage solution."); - - rmm::device_uvector levels(n_vertices, stream); - - value_idx n_blocks = ceildiv(n_vertices, (value_idx)tpb); - write_levels_kernel<<>>(children, levels.data(), n_vertices); - /** - * Step 1: Find label roots: - * - * 1. Copying children[children.size()-(n_clusters-1):] entries to - * separate arrayo - * 2. sort array - * 3. take first n_clusters entries - */ - - value_idx child_size = (n_clusters - 1) * 2; - rmm::device_uvector label_roots(child_size, stream); - - value_idx children_cpy_start = n_edges - child_size; - raft::copy_async(label_roots.data(), children + children_cpy_start, child_size, stream); - - thrust::sort(thrust_policy, - label_roots.data(), - label_roots.data() + (child_size), - thrust::greater()); - - rmm::device_uvector tmp_labels(n_vertices, stream); - - // Init labels to -1 - thrust::fill(thrust_policy, tmp_labels.data(), tmp_labels.data() + n_vertices, -1); - - // Write labels for cluster roots to "labels" - thrust::counting_iterator first(0); - - auto z_iter = thrust::make_zip_iterator( - thrust::make_tuple(first, label_roots.data() + (label_roots.size() - n_clusters))); - - thrust::for_each( - thrust_policy, z_iter, z_iter + n_clusters, init_label_roots(tmp_labels.data())); - - /** - * Step 2: Propagate labels by having children iterate through their parents - * 1. Initialize labels to -1 - * 2. For each element in levels array, propagate until parent's - * label is !=-1 - */ - value_idx cut_level = (n_edges / 2) - (n_clusters - 1); - - inherit_labels<<>>( - children, levels.data(), n_leaves, tmp_labels.data(), cut_level, n_vertices); - - // copy tmp labels to actual labels - raft::copy_async(labels, tmp_labels.data(), n_leaves, stream); - } -} - -}; // namespace cuvs::cluster::detail diff --git a/cpp/include/cuvs/cluster/detail/connectivities.cuh b/cpp/include/cuvs/cluster/detail/connectivities.cuh deleted file mode 100644 index 165058dbd..000000000 --- a/cpp/include/cuvs/cluster/detail/connectivities.cuh +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -namespace cuvs::cluster::detail { - -template -struct distance_graph_impl { - void run(raft::resources const& handle, - const value_t* X, - size_t m, - size_t n, - cuvs::distance::DistanceType metric, - rmm::device_uvector& indptr, - rmm::device_uvector& indices, - rmm::device_uvector& data, - int c); -}; - -/** - * Connectivities specialization to build a knn graph - * @tparam value_idx - * @tparam value_t - */ -template -struct distance_graph_impl { - void run(raft::resources const& handle, - const value_t* X, - size_t m, - size_t n, - cuvs::distance::DistanceType metric, - rmm::device_uvector& indptr, - rmm::device_uvector& indices, - rmm::device_uvector& data, - int c) - { - auto stream = resource::get_cuda_stream(handle); - auto thrust_policy = resource::get_thrust_policy(handle); - - // Need to symmetrize knn into undirected graph - raft::sparse::COO knn_graph_coo(stream); - - raft::sparse::neighbors::knn_graph(handle, X, m, n, metric, knn_graph_coo, c); - - indices.resize(knn_graph_coo.nnz, stream); - data.resize(knn_graph_coo.nnz, stream); - - // self-loops get max distance - auto transform_in = thrust::make_zip_iterator( - thrust::make_tuple(knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals())); - - thrust::transform(thrust_policy, - transform_in, - transform_in + knn_graph_coo.nnz, - knn_graph_coo.vals(), - [=] __device__(const thrust::tuple& tup) { - bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup); - return (self_loop * std::numeric_limits::max()) + - (!self_loop * thrust::get<2>(tup)); - }); - - raft::sparse::convert::sorted_coo_to_csr( - knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream); - - // TODO: Wouldn't need to copy here if we could compute knn - // graph directly on the device uvectors - // ref: https://github.com/rapidsai/raft/issues/227 - raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz, stream); - raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz, stream); - } -}; - -template -RAFT_KERNEL fill_indices2(value_idx* indices, size_t m, size_t nnz) -{ - value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid >= nnz) return; - value_idx v = tid % m; - indices[tid] = v; -} - -/** - * Compute connected CSR of pairwise distances - * @tparam value_idx - * @tparam value_t - * @param handle - * @param X - * @param m - * @param n - * @param metric - * @param[out] indptr - * @param[out] indices - * @param[out] data - */ -template -void pairwise_distances(const raft::resources& handle, - const value_t* X, - size_t m, - size_t n, - cuvs::distance::DistanceType metric, - value_idx* indptr, - value_idx* indices, - value_t* data) -{ - auto stream = resource::get_cuda_stream(handle); - auto exec_policy = resource::get_thrust_policy(handle); - - value_idx nnz = m * m; - - value_idx blocks = raft::ceildiv(nnz, (value_idx)256); - fill_indices2<<>>(indices, m, nnz); - - thrust::sequence(exec_policy, indptr, indptr + m, 0, (int)m); - - raft::update_device(indptr + m, &nnz, 1, stream); - - // TODO: It would ultimately be nice if the MST could accept - // dense inputs directly so we don't need to double the memory - // usage to hand it a sparse array here. - distance::pairwise_distance(handle, X, X, data, m, m, n, metric); - // self-loops get max distance - auto transform_in = - thrust::make_zip_iterator(thrust::make_tuple(thrust::make_counting_iterator(0), data)); - - thrust::transform(exec_policy, - transform_in, - transform_in + nnz, - data, - [=] __device__(const thrust::tuple& tup) { - value_idx idx = thrust::get<0>(tup); - bool self_loop = idx % m == idx / m; - return (self_loop * std::numeric_limits::max()) + - (!self_loop * thrust::get<1>(tup)); - }); -} - -/** - * Connectivities specialization for pairwise distances - * @tparam value_idx - * @tparam value_t - */ -template -struct distance_graph_impl { - void run(const raft::resources& handle, - const value_t* X, - size_t m, - size_t n, - cuvs::distance::DistanceType metric, - rmm::device_uvector& indptr, - rmm::device_uvector& indices, - rmm::device_uvector& data, - int c) - { - auto stream = resource::get_cuda_stream(handle); - - size_t nnz = m * m; - - indices.resize(nnz, stream); - data.resize(nnz, stream); - - pairwise_distances(handle, X, m, n, metric, indptr.data(), indices.data(), data.data()); - } -}; - -/** - * Returns a CSR connectivities graph based on the given linkage distance. - * @tparam value_idx - * @tparam value_t - * @tparam dist_type - * @param[in] handle raft handle - * @param[in] X dense data for which to construct connectivites - * @param[in] m number of rows in X - * @param[in] n number of columns in X - * @param[in] metric distance metric to use - * @param[out] indptr indptr array of connectivities graph - * @param[out] indices column indices array of connectivities graph - * @param[out] data distances array of connectivities graph - * @param[out] c constant 'c' used for nearest neighbors-based distances - * which will guarantee k <= log(n) + c - */ -template -void get_distance_graph(raft::resources const& handle, - const value_t* X, - size_t m, - size_t n, - cuvs::distance::DistanceType metric, - rmm::device_uvector& indptr, - rmm::device_uvector& indices, - rmm::device_uvector& data, - int c) -{ - auto stream = resource::get_cuda_stream(handle); - - indptr.resize(m + 1, stream); - - distance_graph_impl dist_graph; - dist_graph.run(handle, X, m, n, metric, indptr, indices, data, c); -} - -}; // namespace cuvs::cluster::detail diff --git a/cpp/include/cuvs/cluster/detail/kmeans.cuh b/cpp/include/cuvs/cluster/detail/kmeans.cuh deleted file mode 100644 index 1ed9f4ccd..000000000 --- a/cpp/include/cuvs/cluster/detail/kmeans.cuh +++ /dev/null @@ -1,1255 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cuvs { -namespace cluster { -namespace detail { - -// ========================================================= -// Init functions -// ========================================================= - -// Selects 'n_clusters' samples randomly from X -template -void initRandom(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids) -{ - raft::common::nvtx::range fun_scope("initRandom"); - cudaStream_t stream = resource::get_cuda_stream(handle); - auto n_clusters = params.n_clusters; - detail::shuffleAndGather(handle, X, centroids, n_clusters, params.rng_state.seed); -} - -/* - * @brief Selects 'n_clusters' samples from the input X using kmeans++ algorithm. - - * @note This is the algorithm described in - * "k-means++: the advantages of careful seeding". 2007, Arthur, D. and Vassilvitskii, S. - * ACM-SIAM symposium on Discrete algorithms. - * - * Scalable kmeans++ pseudocode - * 1: C = sample a point uniformly at random from X - * 2: while |C| < k - * 3: Sample x in X with probability p_x = d^2(x, C) / phi_X (C) - * 4: C = C U {x} - * 5: end for - */ -template -void kmeansPlusPlus(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroidsRawData, - rmm::device_uvector& workspace) -{ - raft::common::nvtx::range fun_scope("kmeansPlusPlus"); - cudaStream_t stream = resource::get_cuda_stream(handle); - auto n_samples = X.extent(0); - auto n_features = X.extent(1); - auto n_clusters = params.n_clusters; - auto metric = params.metric; - - // number of seeding trials for each center (except the first) - auto n_trials = 2 + static_cast(std::ceil(log(n_clusters))); - - RAFT_LOG_DEBUG( - "Run sequential k-means++ to select %d centroids from %d input samples " - "(%d seeding trials per iterations)", - n_clusters, - n_samples, - n_trials); - - auto dataBatchSize = getDataBatchSize(params.batch_samples, n_samples); - - // temporary buffers - auto indices = raft::make_device_vector(handle, n_trials); - auto centroidCandidates = raft::make_device_matrix(handle, n_trials, n_features); - auto costPerCandidate = raft::make_device_vector(handle, n_trials); - auto minClusterDistance = raft::make_device_vector(handle, n_samples); - auto distBuffer = raft::make_device_matrix(handle, n_trials, n_samples); - - rmm::device_uvector L2NormBuf_OR_DistBuf(0, stream); - rmm::device_scalar clusterCost(stream); - rmm::device_scalar> minClusterIndexAndDistance(stream); - - // Device and matrix views - raft::device_vector_view indices_view(indices.data_handle(), n_trials); - auto const_weights_view = - raft::make_device_vector_view(minClusterDistance.data_handle(), n_samples); - auto const_indices_view = - raft::make_device_vector_view(indices.data_handle(), n_trials); - auto const_X_view = - raft::make_device_matrix_view(X.data_handle(), n_samples, n_features); - raft::device_matrix_view candidates_view( - centroidCandidates.data_handle(), n_trials, n_features); - - // L2 norm of X: ||c||^2 - auto L2NormX = raft::make_device_vector(handle, n_samples); - - if (metric == cuvs::distance::DistanceType::L2Expanded || - metric == cuvs::distance::DistanceType::L2SqrtExpanded) { - raft::linalg::rowNorm(L2NormX.data_handle(), - X.data_handle(), - X.extent(1), - X.extent(0), - raft::linalg::L2Norm, - true, - stream); - } - - raft::random::RngState rng(params.rng_state.seed, params.rng_state.type); - std::mt19937 gen(params.rng_state.seed); - std::uniform_int_distribution<> dis(0, n_samples - 1); - - // <<< Step-1 >>>: C <-- sample a point uniformly at random from X - auto initialCentroid = raft::make_device_matrix_view( - X.data_handle() + dis(gen) * n_features, 1, n_features); - int n_clusters_picked = 1; - - // store the chosen centroid in the buffer - raft::copy( - centroidsRawData.data_handle(), initialCentroid.data_handle(), initialCentroid.size(), stream); - - // C = initial set of centroids - auto centroids = raft::make_device_matrix_view( - centroidsRawData.data_handle(), initialCentroid.extent(0), initialCentroid.extent(1)); - // <<< End of Step-1 >>> - - // Calculate cluster distance, d^2(x, C), for all the points x in X to the nearest centroid - detail::minClusterDistanceCompute(handle, - X, - centroids, - minClusterDistance.view(), - L2NormX.view(), - L2NormBuf_OR_DistBuf, - params.metric, - params.batch_samples, - params.batch_centroids, - workspace); - - RAFT_LOG_DEBUG(" k-means++ - Sampled %d/%d centroids", n_clusters_picked, n_clusters); - - // <<<< Step-2 >>> : while |C| < k - while (n_clusters_picked < n_clusters) { - // <<< Step-3 >>> : Sample x in X with probability p_x = d^2(x, C) / phi_X (C) - // Choose 'n_trials' centroid candidates from X with probability proportional to the squared - // distance to the nearest existing cluster - - raft::random::discrete(handle, rng, indices_view, const_weights_view); - raft::matrix::gather(handle, const_X_view, const_indices_view, candidates_view); - - // Calculate pairwise distance between X and the centroid candidates - // Output - pwd [n_trials x n_samples] - auto pwd = distBuffer.view(); - detail::pairwise_distance_kmeans( - handle, centroidCandidates.view(), X, pwd, workspace, metric); - - // Update nearest cluster distance for each centroid candidate - // Note pwd and minDistBuf points to same buffer which currently holds pairwise distance values. - // Outputs minDistanceBuf[n_trials x n_samples] where minDistance[i, :] contains updated - // minClusterDistance that includes candidate-i - auto minDistBuf = distBuffer.view(); - raft::linalg::matrixVectorOp(minDistBuf.data_handle(), - pwd.data_handle(), - minClusterDistance.data_handle(), - pwd.extent(1), - pwd.extent(0), - true, - true, - raft::min_op{}, - stream); - - // Calculate costPerCandidate[n_trials] where costPerCandidate[i] is the cluster cost when using - // centroid candidate-i - raft::linalg::reduce(costPerCandidate.data_handle(), - minDistBuf.data_handle(), - minDistBuf.extent(1), - minDistBuf.extent(0), - static_cast(0), - true, - true, - stream); - - // Greedy Choice - Choose the candidate that has minimum cluster cost - // ArgMin operation below identifies the index of minimum cost in costPerCandidate - { - // Determine temporary device storage requirements - size_t temp_storage_bytes = 0; - cub::DeviceReduce::ArgMin(nullptr, - temp_storage_bytes, - costPerCandidate.data_handle(), - minClusterIndexAndDistance.data(), - costPerCandidate.extent(0), - stream); - - // Allocate temporary storage - workspace.resize(temp_storage_bytes, stream); - - // Run argmin-reduction - cub::DeviceReduce::ArgMin(workspace.data(), - temp_storage_bytes, - costPerCandidate.data_handle(), - minClusterIndexAndDistance.data(), - costPerCandidate.extent(0), - stream); - - int bestCandidateIdx = -1; - raft::copy(&bestCandidateIdx, &minClusterIndexAndDistance.data()->key, 1, stream); - resource::sync_stream(handle); - /// <<< End of Step-3 >>> - - /// <<< Step-4 >>>: C = C U {x} - // Update minimum cluster distance corresponding to the chosen centroid candidate - raft::copy(minClusterDistance.data_handle(), - minDistBuf.data_handle() + bestCandidateIdx * n_samples, - n_samples, - stream); - - raft::copy(centroidsRawData.data_handle() + n_clusters_picked * n_features, - centroidCandidates.data_handle() + bestCandidateIdx * n_features, - n_features, - stream); - - ++n_clusters_picked; - /// <<< End of Step-4 >>> - } - - RAFT_LOG_DEBUG(" k-means++ - Sampled %d/%d centroids", n_clusters_picked, n_clusters); - } /// <<<< Step-5 >>> -} - -/** - * - * @tparam DataT - * @tparam IndexT - * @param handle - * @param[in] X input matrix (size n_samples, n_features) - * @param[in] weight number of samples currently assigned to each centroid - * @param[in] cur_centroids matrix of current centroids (size n_clusters, n_features) - * @param[in] l2norm_x - * @param[out] min_cluster_and_dist - * @param[out] new_centroids - * @param[out] new_weight - * @param[inout] workspace - */ -template -void update_centroids(raft::resources const& handle, - raft::device_matrix_view X, - raft::device_vector_view sample_weights, - raft::device_matrix_view centroids, - - // TODO: Figure out how to best wrap iterator types in mdspan - LabelsIterator cluster_labels, - raft::device_vector_view weight_per_cluster, - raft::device_matrix_view new_centroids, - rmm::device_uvector& workspace) -{ - auto n_clusters = centroids.extent(0); - auto n_samples = X.extent(0); - - workspace.resize(n_samples, resource::get_cuda_stream(handle)); - - // Calculates weighted sum of all the samples assigned to cluster-i and stores the - // result in new_centroids[i] - raft::linalg::reduce_rows_by_key((DataT*)X.data_handle(), - X.extent(1), - cluster_labels, - sample_weights.data_handle(), - workspace.data(), - X.extent(0), - X.extent(1), - n_clusters, - new_centroids.data_handle(), - resource::get_cuda_stream(handle)); - - // Reduce weights by key to compute weight in each cluster - raft::linalg::reduce_cols_by_key(sample_weights.data_handle(), - cluster_labels, - weight_per_cluster.data_handle(), - (IndexT)1, - (IndexT)sample_weights.extent(0), - (IndexT)n_clusters, - resource::get_cuda_stream(handle)); - - // Computes new_centroids[i] = new_centroids[i]/weight_per_cluster[i] where - // new_centroids[n_clusters x n_features] - 2D array, new_centroids[i] has sum of all the - // samples assigned to cluster-i - // weight_per_cluster[n_clusters] - 1D array, weight_per_cluster[i] contains sum of weights in - // cluster-i. - // Note - when weight_per_cluster[i] is 0, new_centroids[i] is reset to 0 - raft::linalg::matrixVectorOp(new_centroids.data_handle(), - new_centroids.data_handle(), - weight_per_cluster.data_handle(), - new_centroids.extent(1), - new_centroids.extent(0), - true, - false, - raft::div_checkzero_op{}, - resource::get_cuda_stream(handle)); - - // copy centroids[i] to new_centroids[i] when weight_per_cluster[i] is 0 - cub::ArgIndexInputIterator itr_wt(weight_per_cluster.data_handle()); - raft::matrix::gather_if( - const_cast(centroids.data_handle()), - static_cast(centroids.extent(1)), - static_cast(centroids.extent(0)), - itr_wt, - itr_wt, - static_cast(weight_per_cluster.size()), - new_centroids.data_handle(), - [=] __device__(raft::KeyValuePair map) { // predicate - // copy when the sum of weights in the cluster is 0 - return map.value == 0; - }, - raft::key_op{}, - resource::get_cuda_stream(handle)); -} - -// TODO: Resizing is needed to use mdarray instead of rmm::device_uvector -template -void kmeans_fit_main(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_vector_view weight, - raft::device_matrix_view centroidsRawData, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter, - rmm::device_uvector& workspace) -{ - raft::common::nvtx::range fun_scope("kmeans_fit_main"); - logger::get(RAFT_NAME).set_level(params.verbosity); - cudaStream_t stream = resource::get_cuda_stream(handle); - auto n_samples = X.extent(0); - auto n_features = X.extent(1); - auto n_clusters = params.n_clusters; - auto metric = params.metric; - - // stores (key, value) pair corresponding to each sample where - // - key is the index of nearest cluster - // - value is the distance to the nearest cluster - auto minClusterAndDistance = - raft::make_device_vector, IndexT>(handle, n_samples); - - // temporary buffer to store L2 norm of centroids or distance matrix, - // destructor releases the resource - rmm::device_uvector L2NormBuf_OR_DistBuf(0, stream); - - // temporary buffer to store intermediate centroids, destructor releases the - // resource - auto newCentroids = raft::make_device_matrix(handle, n_clusters, n_features); - - // temporary buffer to store weights per cluster, destructor releases the - // resource - auto wtInCluster = raft::make_device_vector(handle, n_clusters); - - rmm::device_scalar clusterCostD(stream); - - // L2 norm of X: ||x||^2 - auto L2NormX = raft::make_device_vector(handle, n_samples); - auto l2normx_view = - raft::make_device_vector_view(L2NormX.data_handle(), n_samples); - - if (metric == cuvs::distance::DistanceType::L2Expanded || - metric == cuvs::distance::DistanceType::L2SqrtExpanded) { - raft::linalg::rowNorm(L2NormX.data_handle(), - X.data_handle(), - X.extent(1), - X.extent(0), - raft::linalg::L2Norm, - true, - stream); - } - - RAFT_LOG_DEBUG( - "Calling KMeans.fit with %d samples of input data and the initialized " - "cluster centers", - n_samples); - - DataT priorClusteringCost = 0; - for (n_iter[0] = 1; n_iter[0] <= params.max_iter; ++n_iter[0]) { - RAFT_LOG_DEBUG( - "KMeans.fit: Iteration-%d: fitting the model using the initialized " - "cluster centers", - n_iter[0]); - - auto centroids = raft::make_device_matrix_view( - centroidsRawData.data_handle(), n_clusters, n_features); - - // computes minClusterAndDistance[0:n_samples) where - // minClusterAndDistance[i] is a pair where - // 'key' is index to a sample in 'centroids' (index of the nearest - // centroid) and 'value' is the distance between the sample 'X[i]' and the - // 'centroid[key]' - detail::minClusterAndDistanceCompute(handle, - X, - centroids, - minClusterAndDistance.view(), - l2normx_view, - L2NormBuf_OR_DistBuf, - params.metric, - params.batch_samples, - params.batch_centroids, - workspace); - - // Using TransformInputIteratorT to dereference an array of - // raft::KeyValuePair and converting them to just return the Key to be used - // in reduce_rows_by_key prims - detail::KeyValueIndexOp conversion_op; - cub::TransformInputIterator, - raft::KeyValuePair*> - itr(minClusterAndDistance.data_handle(), conversion_op); - - update_centroids(handle, - X, - weight, - raft::make_device_matrix_view( - centroidsRawData.data_handle(), n_clusters, n_features), - itr, - wtInCluster.view(), - newCentroids.view(), - workspace); - - // compute the squared norm between the newCentroids and the original - // centroids, destructor releases the resource - auto sqrdNorm = raft::make_device_scalar(handle, DataT(0)); - raft::linalg::mapThenSumReduce(sqrdNorm.data_handle(), - newCentroids.size(), - raft::sqdiff_op{}, - stream, - centroids.data_handle(), - newCentroids.data_handle()); - - DataT sqrdNormError = 0; - raft::copy(&sqrdNormError, sqrdNorm.data_handle(), sqrdNorm.size(), stream); - - raft::copy( - centroidsRawData.data_handle(), newCentroids.data_handle(), newCentroids.size(), stream); - - bool done = false; - if (params.inertia_check) { - // calculate cluster cost phi_x(C) - detail::computeClusterCost(handle, - minClusterAndDistance.view(), - workspace, - raft::make_device_scalar_view(clusterCostD.data()), - raft::value_op{}, - raft::add_op{}); - - DataT curClusteringCost = clusterCostD.value(stream); - - ASSERT(curClusteringCost != (DataT)0.0, - "Too few points and centroids being found is getting 0 cost from " - "centers"); - - if (n_iter[0] > 1) { - DataT delta = curClusteringCost / priorClusteringCost; - if (delta > 1 - params.tol) done = true; - } - priorClusteringCost = curClusteringCost; - } - - resource::sync_stream(handle, stream); - if (sqrdNormError < params.tol) done = true; - - if (done) { - RAFT_LOG_DEBUG("Threshold triggered after %d iterations. Terminating early.", n_iter[0]); - break; - } - } - - auto centroids = raft::make_device_matrix_view( - centroidsRawData.data_handle(), n_clusters, n_features); - - detail::minClusterAndDistanceCompute(handle, - X, - centroids, - minClusterAndDistance.view(), - l2normx_view, - L2NormBuf_OR_DistBuf, - params.metric, - params.batch_samples, - params.batch_centroids, - workspace); - - // TODO: add different templates for InType of binaryOp to avoid thrust transform - thrust::transform(raft::resource::get_thrust_policy(handle), - minClusterAndDistance.data_handle(), - minClusterAndDistance.data_handle() + minClusterAndDistance.size(), - weight.data_handle(), - minClusterAndDistance.data_handle(), - [=] __device__(const raft::KeyValuePair kvp, DataT wt) { - raft::KeyValuePair res; - res.value = kvp.value * wt; - res.key = kvp.key; - return res; - }); - - // calculate cluster cost phi_x(C) - detail::computeClusterCost(handle, - minClusterAndDistance.view(), - workspace, - raft::make_device_scalar_view(clusterCostD.data()), - raft::value_op{}, - raft::add_op{}); - - inertia[0] = clusterCostD.value(stream); - - RAFT_LOG_DEBUG("KMeans.fit: completed after %d iterations with %f inertia[0] ", - n_iter[0] > params.max_iter ? n_iter[0] - 1 : n_iter[0], - inertia[0]); -} - -/* - * @brief Selects 'n_clusters' samples from X using scalable kmeans++ algorithm. - - * @note This is the algorithm described in - * "Scalable K-Means++", 2012, Bahman Bahmani, Benjamin Moseley, - * Andrea Vattani, Ravi Kumar, Sergei Vassilvitskii, - * https://arxiv.org/abs/1203.6402 - - * Scalable kmeans++ pseudocode - * 1: C = sample a point uniformly at random from X - * 2: psi = phi_X (C) - * 3: for O( log(psi) ) times do - * 4: C' = sample each point x in X independently with probability - * p_x = l * (d^2(x, C) / phi_X (C) ) - * 5: C = C U C' - * 6: end for - * 7: For x in C, set w_x to be the number of points in X closer to x than any - * other point in C - * 8: Recluster the weighted points in C into k clusters - - * TODO: Resizing is needed to use mdarray instead of rmm::device_uvector - - */ -template -void initScalableKMeansPlusPlus(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroidsRawData, - rmm::device_uvector& workspace) -{ - raft::common::nvtx::range fun_scope( - "initScalableKMeansPlusPlus"); - cudaStream_t stream = resource::get_cuda_stream(handle); - auto n_samples = X.extent(0); - auto n_features = X.extent(1); - auto n_clusters = params.n_clusters; - auto metric = params.metric; - - raft::random::RngState rng(params.rng_state.seed, params.rng_state.type); - - // <<<< Step-1 >>> : C <- sample a point uniformly at random from X - std::mt19937 gen(params.rng_state.seed); - std::uniform_int_distribution<> dis(0, n_samples - 1); - - auto cIdx = dis(gen); - auto initialCentroid = raft::make_device_matrix_view( - X.data_handle() + cIdx * n_features, 1, n_features); - - // flag the sample that is chosen as initial centroid - std::vector h_isSampleCentroid(n_samples); - std::fill(h_isSampleCentroid.begin(), h_isSampleCentroid.end(), 0); - h_isSampleCentroid[cIdx] = 1; - - // device buffer to flag the sample that is chosen as initial centroid - auto isSampleCentroid = raft::make_device_vector(handle, n_samples); - - raft::copy( - isSampleCentroid.data_handle(), h_isSampleCentroid.data(), isSampleCentroid.size(), stream); - - rmm::device_uvector centroidsBuf(initialCentroid.size(), stream); - - // reset buffer to store the chosen centroid - raft::copy(centroidsBuf.data(), initialCentroid.data_handle(), initialCentroid.size(), stream); - - auto potentialCentroids = raft::make_device_matrix_view( - centroidsBuf.data(), initialCentroid.extent(0), initialCentroid.extent(1)); - // <<< End of Step-1 >>> - - // temporary buffer to store L2 norm of centroids or distance matrix, - // destructor releases the resource - rmm::device_uvector L2NormBuf_OR_DistBuf(0, stream); - - // L2 norm of X: ||x||^2 - auto L2NormX = raft::make_device_vector(handle, n_samples); - if (metric == cuvs::distance::DistanceType::L2Expanded || - metric == cuvs::distance::DistanceType::L2SqrtExpanded) { - raft::linalg::rowNorm(L2NormX.data_handle(), - X.data_handle(), - X.extent(1), - X.extent(0), - raft::linalg::L2Norm, - true, - stream); - } - - auto minClusterDistanceVec = raft::make_device_vector(handle, n_samples); - auto uniformRands = raft::make_device_vector(handle, n_samples); - rmm::device_scalar clusterCost(stream); - - // <<< Step-2 >>>: psi <- phi_X (C) - detail::minClusterDistanceCompute(handle, - X, - potentialCentroids, - minClusterDistanceVec.view(), - L2NormX.view(), - L2NormBuf_OR_DistBuf, - params.metric, - params.batch_samples, - params.batch_centroids, - workspace); - - // compute partial cluster cost from the samples in rank - detail::computeClusterCost(handle, - minClusterDistanceVec.view(), - workspace, - raft::make_device_scalar_view(clusterCost.data()), - raft::identity_op{}, - raft::add_op{}); - - auto psi = clusterCost.value(stream); - - // <<< End of Step-2 >>> - - // Scalable kmeans++ paper claims 8 rounds is sufficient - resource::sync_stream(handle, stream); - int niter = std::min(8, (int)ceil(log(psi))); - RAFT_LOG_DEBUG("KMeans||: psi = %g, log(psi) = %g, niter = %d ", psi, log(psi), niter); - - // <<<< Step-3 >>> : for O( log(psi) ) times do - for (int iter = 0; iter < niter; ++iter) { - RAFT_LOG_DEBUG("KMeans|| - Iteration %d: # potential centroids sampled - %d", - iter, - potentialCentroids.extent(0)); - - detail::minClusterDistanceCompute(handle, - X, - potentialCentroids, - minClusterDistanceVec.view(), - L2NormX.view(), - L2NormBuf_OR_DistBuf, - params.metric, - params.batch_samples, - params.batch_centroids, - workspace); - - detail::computeClusterCost(handle, - minClusterDistanceVec.view(), - workspace, - raft::make_device_scalar_view(clusterCost.data()), - raft::identity_op{}, - raft::add_op{}); - - psi = clusterCost.value(stream); - - // <<<< Step-4 >>> : Sample each point x in X independently and identify new - // potentialCentroids - raft::random::uniform( - handle, rng, uniformRands.data_handle(), uniformRands.extent(0), (DataT)0, (DataT)1); - - detail::SamplingOp select_op(psi, - params.oversampling_factor, - n_clusters, - uniformRands.data_handle(), - isSampleCentroid.data_handle()); - - rmm::device_uvector CpRaw(0, stream); - detail::sampleCentroids(handle, - X, - minClusterDistanceVec.view(), - isSampleCentroid.view(), - select_op, - CpRaw, - workspace); - auto Cp = raft::make_device_matrix_view( - CpRaw.data(), CpRaw.size() / n_features, n_features); - /// <<<< End of Step-4 >>>> - - /// <<<< Step-5 >>> : C = C U C' - // append the data in Cp to the buffer holding the potentialCentroids - centroidsBuf.resize(centroidsBuf.size() + Cp.size(), stream); - raft::copy( - centroidsBuf.data() + centroidsBuf.size() - Cp.size(), Cp.data_handle(), Cp.size(), stream); - - IndexT tot_centroids = potentialCentroids.extent(0) + Cp.extent(0); - potentialCentroids = - raft::make_device_matrix_view(centroidsBuf.data(), tot_centroids, n_features); - /// <<<< End of Step-5 >>> - } /// <<<< Step-6 >>> - - RAFT_LOG_DEBUG("KMeans||: total # potential centroids sampled - %d", - potentialCentroids.extent(0)); - - if ((int)potentialCentroids.extent(0) > n_clusters) { - // <<< Step-7 >>>: For x in C, set w_x to be the number of pts closest to X - // temporary buffer to store the sample count per cluster, destructor - // releases the resource - auto weight = raft::make_device_vector(handle, potentialCentroids.extent(0)); - - detail::countSamplesInCluster( - handle, params, X, L2NormX.view(), potentialCentroids, workspace, weight.view()); - - // <<< end of Step-7 >>> - - // Step-8: Recluster the weighted points in C into k clusters - detail::kmeansPlusPlus( - handle, params, potentialCentroids, centroidsRawData, workspace); - - auto inertia = make_host_scalar(0); - auto n_iter = make_host_scalar(0); - KMeansParams default_params; - default_params.n_clusters = params.n_clusters; - - detail::kmeans_fit_main(handle, - default_params, - potentialCentroids, - weight.view(), - centroidsRawData, - inertia.view(), - n_iter.view(), - workspace); - - } else if ((int)potentialCentroids.extent(0) < n_clusters) { - // supplement with random - auto n_random_clusters = n_clusters - potentialCentroids.extent(0); - - RAFT_LOG_DEBUG( - "[Warning!] KMeans||: found fewer than %d centroids during " - "initialization (found %d centroids, remaining %d centroids will be " - "chosen randomly from input samples)", - n_clusters, - potentialCentroids.extent(0), - n_random_clusters); - - // generate `n_random_clusters` centroids - KMeansParams rand_params; - rand_params.init = KMeansParams::InitMethod::Random; - rand_params.n_clusters = n_random_clusters; - initRandom(handle, rand_params, X, centroidsRawData); - - // copy centroids generated during kmeans|| iteration to the buffer - raft::copy(centroidsRawData.data_handle() + n_random_clusters * n_features, - potentialCentroids.data_handle(), - potentialCentroids.size(), - stream); - } else { - // found the required n_clusters - raft::copy(centroidsRawData.data_handle(), - potentialCentroids.data_handle(), - potentialCentroids.size(), - stream); - } -} - -/** - * @brief Find clusters with k-means algorithm. - * Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. It must be noted - * that the data must be in row-major format and stored in device accessible - * location. - * @param[in] n_samples Number of samples in the input X. - * @param[in] n_features Number of features or the dimensions of each - * sample. - * @param[in] sample_weight Optional weights for each observation in X. - * @param[inout] centroids [in] When init is InitMethod::Array, use - * centroids as the initial cluster centers - * [out] Otherwise, generated centroids from the - * kmeans algorithm is stored at the address pointed by 'centroids'. - * @param[out] inertia Sum of squared distances of samples to their - * closest cluster center. - * @param[out] n_iter Number of iterations run. - */ -template -void kmeans_fit(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - raft::device_matrix_view centroids, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter) -{ - raft::common::nvtx::range fun_scope("kmeans_fit"); - auto n_samples = X.extent(0); - auto n_features = X.extent(1); - auto n_clusters = params.n_clusters; - cudaStream_t stream = resource::get_cuda_stream(handle); - // Check that parameters are valid - if (sample_weight.has_value()) - RAFT_EXPECTS(sample_weight.value().extent(0) == n_samples, - "invalid parameter (sample_weight!=n_samples)"); - RAFT_EXPECTS(n_clusters > 0, "invalid parameter (n_clusters<=0)"); - RAFT_EXPECTS(params.tol > 0, "invalid parameter (tol<=0)"); - RAFT_EXPECTS(params.oversampling_factor >= 0, "invalid parameter (oversampling_factor<0)"); - RAFT_EXPECTS((int)centroids.extent(0) == params.n_clusters, - "invalid parameter (centroids.extent(0) != n_clusters)"); - RAFT_EXPECTS(centroids.extent(1) == n_features, - "invalid parameter (centroids.extent(1) != n_features)"); - - // Display a message if the batch size is smaller than n_samples but will be ignored - if (params.batch_samples < (int)n_samples && - (params.metric == cuvs::distance::DistanceType::L2Expanded || - params.metric == cuvs::distance::DistanceType::L2SqrtExpanded)) { - RAFT_LOG_DEBUG( - "batch_samples=%d was passed, but batch_samples=%d will be used (reason: " - "batch_samples has no impact on the memory footprint when FusedL2NN can be used)", - params.batch_samples, - (int)n_samples); - } - // Display a message if batch_centroids is set and a fusedL2NN-compatible metric is used - if (params.batch_centroids != 0 && params.batch_centroids != params.n_clusters && - (params.metric == cuvs::distance::DistanceType::L2Expanded || - params.metric == cuvs::distance::DistanceType::L2SqrtExpanded)) { - RAFT_LOG_DEBUG( - "batch_centroids=%d was passed, but batch_centroids=%d will be used (reason: " - "batch_centroids has no impact on the memory footprint when FusedL2NN can be used)", - params.batch_centroids, - params.n_clusters); - } - - logger::get(RAFT_NAME).set_level(params.verbosity); - - // Allocate memory - rmm::device_uvector workspace(0, stream); - auto weight = raft::make_device_vector(handle, n_samples); - if (sample_weight.has_value()) - raft::copy(weight.data_handle(), sample_weight.value().data_handle(), n_samples, stream); - else - thrust::fill(raft::resource::get_thrust_policy(handle), - weight.data_handle(), - weight.data_handle() + weight.size(), - 1); - - // check if weights sum up to n_samples - checkWeight(handle, weight.view(), workspace); - - auto centroidsRawData = raft::make_device_matrix(handle, n_clusters, n_features); - - auto n_init = params.n_init; - if (params.init == KMeansParams::InitMethod::Array && n_init != 1) { - RAFT_LOG_DEBUG( - "Explicit initial center position passed: performing only one init in " - "k-means instead of n_init=%d", - n_init); - n_init = 1; - } - - std::mt19937 gen(params.rng_state.seed); - inertia[0] = std::numeric_limits::max(); - - for (auto seed_iter = 0; seed_iter < n_init; ++seed_iter) { - KMeansParams iter_params = params; - iter_params.rng_state.seed = gen(); - - DataT iter_inertia = std::numeric_limits::max(); - IndexT n_current_iter = 0; - if (iter_params.init == KMeansParams::InitMethod::Random) { - // initializing with random samples from input dataset - RAFT_LOG_DEBUG( - "KMeans.fit (Iteration-%d/%d): initialize cluster centers by " - "randomly choosing from the " - "input data.", - seed_iter + 1, - n_init); - initRandom(handle, iter_params, X, centroidsRawData.view()); - } else if (iter_params.init == KMeansParams::InitMethod::KMeansPlusPlus) { - // default method to initialize is kmeans++ - RAFT_LOG_DEBUG( - "KMeans.fit (Iteration-%d/%d): initialize cluster centers using " - "k-means++ algorithm.", - seed_iter + 1, - n_init); - if (iter_params.oversampling_factor == 0) - detail::kmeansPlusPlus( - handle, iter_params, X, centroidsRawData.view(), workspace); - else - detail::initScalableKMeansPlusPlus( - handle, iter_params, X, centroidsRawData.view(), workspace); - } else if (iter_params.init == KMeansParams::InitMethod::Array) { - RAFT_LOG_DEBUG( - "KMeans.fit (Iteration-%d/%d): initialize cluster centers from " - "the ndarray array input " - "passed to init argument.", - seed_iter + 1, - n_init); - raft::copy( - centroidsRawData.data_handle(), centroids.data_handle(), n_clusters * n_features, stream); - } else { - THROW("unknown initialization method to select initial centers"); - } - - detail::kmeans_fit_main(handle, - iter_params, - X, - weight.view(), - centroidsRawData.view(), - raft::make_host_scalar_view(&iter_inertia), - raft::make_host_scalar_view(&n_current_iter), - workspace); - if (iter_inertia < inertia[0]) { - inertia[0] = iter_inertia; - n_iter[0] = n_current_iter; - raft::copy( - centroids.data_handle(), centroidsRawData.data_handle(), n_clusters * n_features, stream); - } - RAFT_LOG_DEBUG("KMeans.fit after iteration-%d/%d: inertia - %f, n_iter[0] - %d", - seed_iter + 1, - n_init, - inertia[0], - n_iter[0]); - } - RAFT_LOG_DEBUG("KMeans.fit: async call returned (fit could still be running on the device)"); -} - -template -void kmeans_fit(raft::resources const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* sample_weight, - DataT* centroids, - IndexT n_samples, - IndexT n_features, - DataT& inertia, - IndexT& n_iter) -{ - auto XView = raft::make_device_matrix_view(X, n_samples, n_features); - auto centroidsView = - raft::make_device_matrix_view(centroids, params.n_clusters, n_features); - std::optional> sample_weightView = std::nullopt; - if (sample_weight) - sample_weightView = - raft::make_device_vector_view(sample_weight, n_samples); - auto inertiaView = raft::make_host_scalar_view(&inertia); - auto n_iterView = raft::make_host_scalar_view(&n_iter); - - detail::kmeans_fit( - handle, params, XView, sample_weightView, centroidsView, inertiaView, n_iterView); -} - -template -void kmeans_predict(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - bool normalize_weight, - raft::host_scalar_view inertia) -{ - raft::common::nvtx::range fun_scope("kmeans_predict"); - auto n_samples = X.extent(0); - auto n_features = X.extent(1); - cudaStream_t stream = resource::get_cuda_stream(handle); - // Check that parameters are valid - if (sample_weight.has_value()) - RAFT_EXPECTS(sample_weight.value().extent(0) == n_samples, - "invalid parameter (sample_weight!=n_samples)"); - RAFT_EXPECTS(params.n_clusters > 0, "invalid parameter (n_clusters<=0)"); - RAFT_EXPECTS(params.tol > 0, "invalid parameter (tol<=0)"); - RAFT_EXPECTS(params.oversampling_factor >= 0, "invalid parameter (oversampling_factor<0)"); - RAFT_EXPECTS((int)centroids.extent(0) == params.n_clusters, - "invalid parameter (centroids.extent(0) != n_clusters)"); - RAFT_EXPECTS(centroids.extent(1) == n_features, - "invalid parameter (centroids.extent(1) != n_features)"); - - logger::get(RAFT_NAME).set_level(params.verbosity); - auto metric = params.metric; - - // Allocate memory - // Device-accessible allocation of expandable storage used as temporary buffers - rmm::device_uvector workspace(0, stream); - auto weight = raft::make_device_vector(handle, n_samples); - if (sample_weight.has_value()) - raft::copy(weight.data_handle(), sample_weight.value().data_handle(), n_samples, stream); - else - thrust::fill(raft::resource::get_thrust_policy(handle), - weight.data_handle(), - weight.data_handle() + weight.size(), - 1); - - // check if weights sum up to n_samples - if (normalize_weight) checkWeight(handle, weight.view(), workspace); - - auto minClusterAndDistance = - raft::make_device_vector, IndexT>(handle, n_samples); - rmm::device_uvector L2NormBuf_OR_DistBuf(0, stream); - - // L2 norm of X: ||x||^2 - auto L2NormX = raft::make_device_vector(handle, n_samples); - if (metric == cuvs::distance::DistanceType::L2Expanded || - metric == cuvs::distance::DistanceType::L2SqrtExpanded) { - raft::linalg::rowNorm(L2NormX.data_handle(), - X.data_handle(), - X.extent(1), - X.extent(0), - raft::linalg::L2Norm, - true, - stream); - } - - // computes minClusterAndDistance[0:n_samples) where minClusterAndDistance[i] - // is a pair where - // 'key' is index to a sample in 'centroids' (index of the nearest - // centroid) and 'value' is the distance between the sample 'X[i]' and the - // 'centroid[key]' - auto l2normx_view = - raft::make_device_vector_view(L2NormX.data_handle(), n_samples); - detail::minClusterAndDistanceCompute(handle, - X, - centroids, - minClusterAndDistance.view(), - l2normx_view, - L2NormBuf_OR_DistBuf, - params.metric, - params.batch_samples, - params.batch_centroids, - workspace); - - // calculate cluster cost phi_x(C) - rmm::device_scalar clusterCostD(stream); - // TODO: add different templates for InType of binaryOp to avoid thrust transform - thrust::transform(raft::resource::get_thrust_policy(handle), - minClusterAndDistance.data_handle(), - minClusterAndDistance.data_handle() + minClusterAndDistance.size(), - weight.data_handle(), - minClusterAndDistance.data_handle(), - [=] __device__(const raft::KeyValuePair kvp, DataT wt) { - raft::KeyValuePair res; - res.value = kvp.value * wt; - res.key = kvp.key; - return res; - }); - - detail::computeClusterCost(handle, - minClusterAndDistance.view(), - workspace, - raft::make_device_scalar_view(clusterCostD.data()), - raft::value_op{}, - raft::add_op{}); - - thrust::transform(raft::resource::get_thrust_policy(handle), - minClusterAndDistance.data_handle(), - minClusterAndDistance.data_handle() + minClusterAndDistance.size(), - labels.data_handle(), - raft::key_op{}); - - inertia[0] = clusterCostD.value(stream); -} - -template -void kmeans_predict(raft::resources const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* sample_weight, - const DataT* centroids, - IndexT n_samples, - IndexT n_features, - IndexT* labels, - bool normalize_weight, - DataT& inertia) -{ - auto XView = raft::make_device_matrix_view(X, n_samples, n_features); - auto centroidsView = - raft::make_device_matrix_view(centroids, params.n_clusters, n_features); - std::optional> sample_weightView{std::nullopt}; - if (sample_weight) - sample_weightView.emplace( - raft::make_device_vector_view(sample_weight, n_samples)); - auto labelsView = raft::make_device_vector_view(labels, n_samples); - auto inertiaView = raft::make_host_scalar_view(&inertia); - - detail::kmeans_predict(handle, - params, - XView, - sample_weightView, - centroidsView, - labelsView, - normalize_weight, - inertiaView); -} - -template -void kmeans_fit_predict(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - std::optional> centroids, - raft::device_vector_view labels, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter) -{ - raft::common::nvtx::range fun_scope("kmeans_fit_predict"); - if (!centroids.has_value()) { - auto n_features = X.extent(1); - auto centroids_matrix = - raft::make_device_matrix(handle, params.n_clusters, n_features); - detail::kmeans_fit( - handle, params, X, sample_weight, centroids_matrix.view(), inertia, n_iter); - detail::kmeans_predict( - handle, params, X, sample_weight, centroids_matrix.view(), labels, true, inertia); - } else { - detail::kmeans_fit( - handle, params, X, sample_weight, centroids.value(), inertia, n_iter); - detail::kmeans_predict( - handle, params, X, sample_weight, centroids.value(), labels, true, inertia); - } -} - -template -void kmeans_fit_predict(raft::resources const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* sample_weight, - DataT* centroids, - IndexT n_samples, - IndexT n_features, - IndexT* labels, - DataT& inertia, - IndexT& n_iter) -{ - auto XView = raft::make_device_matrix_view(X, n_samples, n_features); - std::optional> sample_weightView{std::nullopt}; - if (sample_weight) - sample_weightView.emplace( - raft::make_device_vector_view(sample_weight, n_samples)); - std::optional> centroidsView{std::nullopt}; - if (centroids) - centroidsView.emplace( - raft::make_device_matrix_view(centroids, params.n_clusters, n_features)); - auto labelsView = raft::make_device_vector_view(labels, n_samples); - auto inertiaView = raft::make_host_scalar_view(&inertia); - auto n_iterView = raft::make_host_scalar_view(&n_iter); - - detail::kmeans_fit_predict( - handle, params, XView, sample_weightView, centroidsView, labelsView, inertiaView, n_iterView); -} - -/** - * @brief Transform X to a cluster-distance space. - * - * @param[in] handle The handle to the cuML library context that - * manages the CUDA resources. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must - * be in row-major format - * @param[in] centroids Cluster centroids. The data must be in row-major format. - * @param[out] X_new X transformed in the new space.. - */ -template -void kmeans_transform(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_matrix_view X_new) -{ - raft::common::nvtx::range fun_scope("kmeans_transform"); - logger::get(RAFT_NAME).set_level(params.verbosity); - cudaStream_t stream = resource::get_cuda_stream(handle); - auto n_samples = X.extent(0); - auto n_features = X.extent(1); - auto n_clusters = params.n_clusters; - auto metric = params.metric; - - // Device-accessible allocation of expandable storage used as temporary buffers - rmm::device_uvector workspace(0, stream); - auto dataBatchSize = getDataBatchSize(params.batch_samples, n_samples); - - // tile over the input data and calculate distance matrix [n_samples x - // n_clusters] - for (IndexT dIdx = 0; dIdx < (IndexT)n_samples; dIdx += dataBatchSize) { - // # of samples for the current batch - auto ns = std::min(static_cast(dataBatchSize), static_cast(n_samples - dIdx)); - - // datasetView [ns x n_features] - view representing the current batch of - // input dataset - auto datasetView = raft::make_device_matrix_view( - X.data_handle() + n_features * dIdx, ns, n_features); - - // pairwiseDistanceView [ns x n_clusters] - auto pairwiseDistanceView = raft::make_device_matrix_view( - X_new.data_handle() + n_clusters * dIdx, ns, n_clusters); - - // calculate pairwise distance between cluster centroids and current batch - // of input dataset - pairwise_distance_kmeans( - handle, datasetView, centroids, pairwiseDistanceView, workspace, metric); - } -} - -template -void kmeans_transform(raft::resources const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* centroids, - IndexT n_samples, - IndexT n_features, - DataT* X_new) -{ - auto XView = raft::make_device_matrix_view(X, n_samples, n_features); - auto centroidsView = - raft::make_device_matrix_view(centroids, params.n_clusters, n_features); - auto X_newView = raft::make_device_matrix_view(X_new, n_samples, n_features); - - detail::kmeans_transform(handle, params, XView, centroidsView, X_newView); -} -} // namespace detail -} // namespace cluster -} // namespace cuvs diff --git a/cpp/include/cuvs/cluster/detail/kmeans_auto_find_k.cuh b/cpp/include/cuvs/cluster/detail/kmeans_auto_find_k.cuh deleted file mode 100644 index 78566bb06..000000000 --- a/cpp/include/cuvs/cluster/detail/kmeans_auto_find_k.cuh +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include - -#include - -#include - -#include - -#include -#include - -namespace cuvs::cluster::detail { - -template -void compute_dispersion(raft::resources const& handle, - raft::device_matrix_view X, - KMeansParams& params, - raft::device_matrix_view centroids_view, - raft::device_vector_view labels, - raft::device_vector_view clusterSizes, - rmm::device_uvector& workspace, - raft::host_vector_view clusterDispertionView, - raft::host_vector_view resultsView, - raft::host_scalar_view residual, - raft::host_scalar_view n_iter, - int val, - idx_t n, - idx_t d) -{ - auto centroids_const_view = - raft::make_device_matrix_view(centroids_view.data_handle(), val, d); - - idx_t* clusterSizes_ptr = clusterSizes.data_handle(); - auto cluster_sizes_view = - raft::make_device_vector_view(clusterSizes_ptr, val); - - params.n_clusters = val; - - cuvs::cluster::detail::kmeans_fit_predict( - handle, params, X, std::nullopt, std::make_optional(centroids_view), labels, residual, n_iter); - - detail::countLabels(handle, labels.data_handle(), clusterSizes.data_handle(), n, val, workspace); - - resultsView[val] = residual[0]; - clusterDispertionView[val] = raft::stats::cluster_dispersion( - handle, centroids_const_view, cluster_sizes_view, std::nullopt, n); -} - -template -void find_k(raft::resources const& handle, - raft::device_matrix_view X, - raft::host_scalar_view best_k, - raft::host_scalar_view residual, - raft::host_scalar_view n_iter, - idx_t kmax, - idx_t kmin = 1, - idx_t maxiter = 100, - value_t tol = 1e-2) -{ - idx_t n = X.extent(0); - idx_t d = X.extent(1); - - RAFT_EXPECTS(n >= 1, "n must be >= 1"); - RAFT_EXPECTS(d >= 1, "d must be >= 1"); - RAFT_EXPECTS(kmin >= 1, "kmin must be >= 1"); - RAFT_EXPECTS(kmax <= n, "kmax must be <= number of data samples in X"); - RAFT_EXPECTS(tol >= 0, "tolerance must be >= 0"); - RAFT_EXPECTS(maxiter >= 0, "maxiter must be >= 0"); - // Allocate memory - // Device memory - - auto centroids = raft::make_device_matrix(handle, kmax, X.extent(1)); - auto clusterSizes = raft::make_device_vector(handle, kmax); - auto labels = raft::make_device_vector(handle, n); - - rmm::device_uvector workspace(0, resource::get_cuda_stream(handle)); - - idx_t* clusterSizes_ptr = clusterSizes.data_handle(); - - // Host memory - auto results = raft::make_host_vector(kmax + 1); - auto clusterDispersion = raft::make_host_vector(kmax + 1); - - auto clusterDispertionView = clusterDispersion.view(); - auto resultsView = results.view(); - - // Loop to find *best* k - // Perform k-means in binary search - int left = kmin; // must be at least 2 - int right = kmax; // int(floor(len(data)/2)) #assumption of clusters of size 2 at least - int mid = ((unsigned int)left + (unsigned int)right) >> 1; - int oldmid = mid; - int tests = 0; - double objective[3]; // 0= left of mid, 1= right of mid - if (left == 1) left = 2; // at least do 2 clusters - - KMeansParams params; - params.max_iter = maxiter; - params.tol = tol; - - auto centroids_view = - raft::make_device_matrix_view(centroids.data_handle(), left, d); - compute_dispersion(handle, - X, - params, - centroids_view, - labels.view(), - clusterSizes.view(), - workspace, - clusterDispertionView, - resultsView, - residual, - n_iter, - left, - n, - d); - - // eval right edge0 - resultsView[right] = 1e20; - while (resultsView[right] > resultsView[left] && tests < 3) { - centroids_view = - raft::make_device_matrix_view(centroids.data_handle(), right, d); - compute_dispersion(handle, - X, - params, - centroids_view, - labels.view(), - clusterSizes.view(), - workspace, - clusterDispertionView, - resultsView, - residual, - n_iter, - right, - n, - d); - - tests += 1; - } - - objective[0] = (n - left) / (left - 1) * clusterDispertionView[left] / resultsView[left]; - objective[1] = (n - right) / (right - 1) * clusterDispertionView[right] / resultsView[right]; - while (left < right - 1) { - resultsView[mid] = 1e20; - tests = 0; - while (resultsView[mid] > resultsView[left] && tests < 3) { - centroids_view = - raft::make_device_matrix_view(centroids.data_handle(), mid, d); - compute_dispersion(handle, - X, - params, - centroids_view, - labels.view(), - clusterSizes.view(), - workspace, - clusterDispertionView, - resultsView, - residual, - n_iter, - mid, - n, - d); - - if (resultsView[mid] > resultsView[left] && (mid + 1) < right) { - mid += 1; - resultsView[mid] = 1e20; - } else if (resultsView[mid] > resultsView[left] && (mid - 1) > left) { - mid -= 1; - resultsView[mid] = 1e20; - } - tests += 1; - } - - // maximize Calinski-Harabasz Index, minimize resid/ cluster - objective[0] = (n - left) / (left - 1) * clusterDispertionView[left] / resultsView[left]; - objective[1] = (n - right) / (right - 1) * clusterDispertionView[right] / resultsView[right]; - objective[2] = (n - mid) / (mid - 1) * clusterDispertionView[mid] / resultsView[mid]; - objective[0] = (objective[2] - objective[0]) / (mid - left); - objective[1] = (objective[1] - objective[2]) / (right - mid); - - if (objective[0] > 0 && objective[1] < 0) { - // our point is in the left-of-mid side - right = mid; - } else { - left = mid; - } - oldmid = mid; - mid = ((unsigned int)right + (unsigned int)left) >> 1; - } - - best_k[0] = right; - objective[0] = (n - left) / (left - 1) * clusterDispertionView[left] / resultsView[left]; - objective[1] = (n - oldmid) / (oldmid - 1) * clusterDispertionView[oldmid] / resultsView[oldmid]; - if (objective[1] < objective[0]) { best_k[0] = left; } - - // if best_k isn't what we just ran, re-run to get correct centroids and dist data on return-> - // this saves memory - if (best_k[0] != oldmid) { - auto centroids_view = - raft::make_device_matrix_view(centroids.data_handle(), best_k[0], d); - - params.n_clusters = best_k[0]; - cuvs::cluster::detail::kmeans_fit_predict(handle, - params, - X, - std::nullopt, - std::make_optional(centroids_view), - labels.view(), - residual, - n_iter); - } -} -} // namespace cuvs::cluster::detail \ No newline at end of file diff --git a/cpp/include/cuvs/cluster/detail/kmeans_balanced.cuh b/cpp/include/cuvs/cluster/detail/kmeans_balanced.cuh deleted file mode 100644 index 1b946cc1e..000000000 --- a/cpp/include/cuvs/cluster/detail/kmeans_balanced.cuh +++ /dev/null @@ -1,1097 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -namespace cuvs::cluster::detail { - -constexpr static inline float kAdjustCentersWeight = 7.0f; - -/** - * @brief Predict labels for the dataset; floating-point types only. - * - * NB: no minibatch splitting is done here, it may require large amount of temporary memory (n_rows - * * n_cluster * sizeof(MathT)). - * - * @tparam MathT type of the centroids and mapped data - * @tparam IdxT index type - * @tparam LabelT label type - * - * @param[in] handle The raft handle. - * @param[in] params Structure containing the hyper-parameters - * @param[in] centers Pointer to the row-major matrix of cluster centers [n_clusters, dim] - * @param[in] n_clusters Number of clusters/centers - * @param[in] dim Dimensionality of the data - * @param[in] dataset Pointer to the data [n_rows, dim] - * @param[in] dataset_norm Pointer to the precomputed norm (for L2 metrics only) [n_rows] - * @param[in] n_rows Number samples in the `dataset` - * @param[out] labels Output predictions [n_rows] - * @param[inout] mr (optional) Memory resource to use for temporary allocations - */ -template -inline std::enable_if_t> predict_core( - const raft::resources& handle, - const kmeans_balanced_params& params, - const MathT* centers, - IdxT n_clusters, - IdxT dim, - const MathT* dataset, - const MathT* dataset_norm, - IdxT n_rows, - LabelT* labels, - rmm::mr::device_memory_resource* mr) -{ - auto stream = resource::get_cuda_stream(handle); - switch (params.metric) { - case cuvs::distance::DistanceType::L2Expanded: - case cuvs::distance::DistanceType::L2SqrtExpanded: { - auto workspace = raft::make_device_mdarray( - handle, mr, make_extents((sizeof(int)) * n_rows)); - - auto minClusterAndDistance = raft::make_device_mdarray, IdxT>( - handle, mr, make_extents(n_rows)); - raft::KeyValuePair initial_value(0, std::numeric_limits::max()); - thrust::fill(raft::resource::get_thrust_policy(handle), - minClusterAndDistance.data_handle(), - minClusterAndDistance.data_handle() + minClusterAndDistance.size(), - initial_value); - - auto centroidsNorm = - raft::make_device_mdarray(handle, mr, make_extents(n_clusters)); - raft::linalg::rowNorm( - centroidsNorm.data_handle(), centers, dim, n_clusters, raft::linalg::L2Norm, true, stream); - - cuvs::distance::fusedL2NNMinReduce, IdxT>( - minClusterAndDistance.data_handle(), - dataset, - centers, - dataset_norm, - centroidsNorm.data_handle(), - n_rows, - n_clusters, - dim, - (void*)workspace.data_handle(), - (params.metric == cuvs::distance::DistanceType::L2Expanded) ? false : true, - false, - stream); - - // todo(lsugy): use KVP + iterator in caller. - // Copy keys to output labels - thrust::transform(raft::resource::get_thrust_policy(handle), - minClusterAndDistance.data_handle(), - minClusterAndDistance.data_handle() + n_rows, - labels, - raft::compose_op, raft::key_op>()); - break; - } - case cuvs::distance::DistanceType::InnerProduct: { - // TODO: pass buffer - rmm::device_uvector distances(n_rows * n_clusters, stream, mr); - - MathT alpha = -1.0; - MathT beta = 0.0; - - linalg::gemm(handle, - true, - false, - n_clusters, - n_rows, - dim, - &alpha, - centers, - dim, - dataset, - dim, - &beta, - distances.data(), - n_clusters, - stream); - - auto distances_const_view = raft::make_device_matrix_view( - distances.data(), n_rows, n_clusters); - auto labels_view = raft::make_device_vector_view(labels, n_rows); - raft::matrix::argmin(handle, distances_const_view, labels_view); - break; - } - default: { - RAFT_FAIL("The chosen distance metric is not supported (%d)", int(params.metric)); - } - } -} - -/** - * @brief Suggest a minibatch size for kmeans prediction. - * - * This function is used as a heuristic to split the work over a large dataset - * to reduce the size of temporary memory allocations. - * - * @tparam MathT type of the centroids and mapped data - * @tparam IdxT index type - * - * @param[in] n_clusters number of clusters in kmeans clustering - * @param[in] n_rows Number of samples in the dataset - * @param[in] dim Number of features in the dataset - * @param[in] metric Distance metric - * @param[in] needs_conversion Whether the data needs to be converted to MathT - * @return A suggested minibatch size and the expected memory cost per-row (in bytes) - */ -template -constexpr auto calc_minibatch_size(IdxT n_clusters, - IdxT n_rows, - IdxT dim, - cuvs::distance::DistanceType metric, - bool needs_conversion) -> std::tuple -{ - n_clusters = std::max(1, n_clusters); - - // Estimate memory needs per row (i.e element of the batch). - size_t mem_per_row = 0; - switch (metric) { - // fusedL2NN needs a mutex and a key-value pair for each row. - case distance::DistanceType::L2Expanded: - case distance::DistanceType::L2SqrtExpanded: { - mem_per_row += sizeof(int); - mem_per_row += sizeof(raft::KeyValuePair); - } break; - // Other metrics require storing a distance matrix. - default: { - mem_per_row += sizeof(MathT) * n_clusters; - } - } - - // If we need to convert to MathT, space required for the converted batch. - if (!needs_conversion) { mem_per_row += sizeof(MathT) * dim; } - - // Heuristic: calculate the minibatch size in order to use at most 1GB of memory. - IdxT minibatch_size = (1 << 30) / mem_per_row; - minibatch_size = 64 * div_rounding_up_safe(minibatch_size, IdxT{64}); - minibatch_size = std::min(minibatch_size, n_rows); - return std::make_tuple(minibatch_size, mem_per_row); -} - -/** - * @brief Given the data and labels, calculate cluster centers and sizes in one sweep. - * - * @note all pointers must be accessible on the device. - * - * @tparam T element type - * @tparam MathT type of the centroids and mapped data - * @tparam IdxT index type - * @tparam LabelT label type - * @tparam CounterT counter type supported by CUDA's native atomicAdd - * @tparam MappingOpT type of the mapping operation - * - * @param[in] handle The raft handle. - * @param[inout] centers Pointer to the output [n_clusters, dim] - * @param[inout] cluster_sizes Number of rows in each cluster [n_clusters] - * @param[in] n_clusters Number of clusters/centers - * @param[in] dim Dimensionality of the data - * @param[in] dataset Pointer to the data [n_rows, dim] - * @param[in] n_rows Number of samples in the `dataset` - * @param[in] labels Output predictions [n_rows] - * @param[in] reset_counters Whether to clear the output arrays before calculating. - * When set to `false`, this function may be used to update existing centers and sizes using - * the weighted average principle. - * @param[in] mapping_op Mapping operation from T to MathT - * @param[inout] mr (optional) Memory resource to use for temporary allocations on the device - */ -template -void calc_centers_and_sizes(const raft::resources& handle, - MathT* centers, - CounterT* cluster_sizes, - IdxT n_clusters, - IdxT dim, - const T* dataset, - IdxT n_rows, - const LabelT* labels, - bool reset_counters, - MappingOpT mapping_op, - rmm::mr::device_memory_resource* mr = nullptr) -{ - auto stream = resource::get_cuda_stream(handle); - if (mr == nullptr) { mr = resource::get_workspace_resource(handle); } - - if (!reset_counters) { - raft::linalg::matrixVectorOp( - centers, centers, cluster_sizes, dim, n_clusters, true, false, raft::mul_op(), stream); - } - - rmm::device_uvector workspace(0, stream, mr); - - // If we reset the counters, we can compute directly the new sizes in cluster_sizes. - // If we don't reset, we compute in a temporary buffer and add in a separate step. - rmm::device_uvector temp_cluster_sizes(0, stream, mr); - CounterT* temp_sizes = cluster_sizes; - if (!reset_counters) { - temp_cluster_sizes.resize(n_clusters, stream); - temp_sizes = temp_cluster_sizes.data(); - } - - // Apply mapping only when the data and math types are different. - if constexpr (std::is_same_v) { - raft::linalg::reduce_rows_by_key( - dataset, dim, labels, nullptr, n_rows, dim, n_clusters, centers, stream, reset_counters); - } else { - // todo(lsugy): use iterator from KV output of fusedL2NN - cub::TransformInputIterator mapping_itr(dataset, mapping_op); - raft::linalg::reduce_rows_by_key( - mapping_itr, dim, labels, nullptr, n_rows, dim, n_clusters, centers, stream, reset_counters); - } - - // Compute weight of each cluster - cuvs::cluster::detail::countLabels(handle, labels, temp_sizes, n_rows, n_clusters, workspace); - - // Add previous sizes if necessary - if (!reset_counters) { - raft::linalg::add(cluster_sizes, cluster_sizes, temp_sizes, n_clusters, stream); - } - - raft::linalg::matrixVectorOp(centers, - centers, - cluster_sizes, - dim, - n_clusters, - true, - false, - raft::div_checkzero_op(), - stream); -} - -/** Computes the L2 norm of the dataset, converting to MathT if necessary */ -template -void compute_norm(const raft::resources& handle, - MathT* dataset_norm, - const T* dataset, - IdxT dim, - IdxT n_rows, - MappingOpT mapping_op, - rmm::mr::device_memory_resource* mr = nullptr) -{ - raft::common::nvtx::range fun_scope("compute_norm"); - auto stream = resource::get_cuda_stream(handle); - if (mr == nullptr) { mr = resource::get_workspace_resource(handle); } - rmm::device_uvector mapped_dataset(0, stream, mr); - - const MathT* dataset_ptr = nullptr; - - if (std::is_same_v) { - dataset_ptr = reinterpret_cast(dataset); - } else { - mapped_dataset.resize(n_rows * dim, stream); - - linalg::unaryOp(mapped_dataset.data(), dataset, n_rows * dim, mapping_op, stream); - - dataset_ptr = (const MathT*)mapped_dataset.data(); - } - - raft::linalg::rowNorm( - dataset_norm, dataset_ptr, dim, n_rows, raft::linalg::L2Norm, true, stream); -} - -/** - * @brief Predict labels for the dataset. - * - * @tparam T element type - * @tparam MathT type of the centroids and mapped data - * @tparam IdxT index type - * @tparam LabelT label type - * @tparam MappingOpT type of the mapping operation - * - * @param[in] handle The raft handle - * @param[in] params Structure containing the hyper-parameters - * @param[in] centers Pointer to the row-major matrix of cluster centers [n_clusters, dim] - * @param[in] n_clusters Number of clusters/centers - * @param[in] dim Dimensionality of the data - * @param[in] dataset Pointer to the data [n_rows, dim] - * @param[in] n_rows Number samples in the `dataset` - * @param[out] labels Output predictions [n_rows] - * @param[in] mapping_op Mapping operation from T to MathT - * @param[inout] mr (optional) memory resource to use for temporary allocations - * @param[in] dataset_norm (optional) Pre-computed norms of each row in the dataset [n_rows] - */ -template -void predict(const raft::resources& handle, - const kmeans_balanced_params& params, - const MathT* centers, - IdxT n_clusters, - IdxT dim, - const T* dataset, - IdxT n_rows, - LabelT* labels, - MappingOpT mapping_op, - rmm::mr::device_memory_resource* mr = nullptr, - const MathT* dataset_norm = nullptr) -{ - auto stream = resource::get_cuda_stream(handle); - raft::common::nvtx::range fun_scope( - "predict(%zu, %u)", static_cast(n_rows), n_clusters); - if (mr == nullptr) { mr = resource::get_workspace_resource(handle); } - auto [max_minibatch_size, _mem_per_row] = - calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); - rmm::device_uvector cur_dataset( - std::is_same_v ? 0 : max_minibatch_size * dim, stream, mr); - bool need_compute_norm = - dataset_norm == nullptr && (params.metric == cuvs::distance::DistanceType::L2Expanded || - params.metric == cuvs::distance::DistanceType::L2SqrtExpanded); - rmm::device_uvector cur_dataset_norm( - need_compute_norm ? max_minibatch_size : 0, stream, mr); - const MathT* dataset_norm_ptr = nullptr; - auto cur_dataset_ptr = cur_dataset.data(); - for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) { - IdxT minibatch_size = std::min(max_minibatch_size, n_rows - offset); - - if constexpr (std::is_same_v) { - cur_dataset_ptr = const_cast(dataset + offset * dim); - } else { - linalg::unaryOp( - cur_dataset_ptr, dataset + offset * dim, minibatch_size * dim, mapping_op, stream); - } - - // Compute the norm now if it hasn't been pre-computed. - if (need_compute_norm) { - compute_norm( - handle, cur_dataset_norm.data(), cur_dataset_ptr, dim, minibatch_size, mapping_op, mr); - dataset_norm_ptr = cur_dataset_norm.data(); - } else if (dataset_norm != nullptr) { - dataset_norm_ptr = dataset_norm + offset; - } - - predict_core(handle, - params, - centers, - n_clusters, - dim, - cur_dataset_ptr, - dataset_norm_ptr, - minibatch_size, - labels + offset, - mr); - } -} - -template -__launch_bounds__((WarpSize * BlockDimY)) RAFT_KERNEL - adjust_centers_kernel(MathT* centers, // [n_clusters, dim] - IdxT n_clusters, - IdxT dim, - const T* dataset, // [n_rows, dim] - IdxT n_rows, - const LabelT* labels, // [n_rows] - const CounterT* cluster_sizes, // [n_clusters] - MathT threshold, - IdxT average, - IdxT seed, - IdxT* count, - MappingOpT mapping_op) -{ - IdxT l = threadIdx.y + BlockDimY * static_cast(blockIdx.y); - if (l >= n_clusters) return; - auto csize = static_cast(cluster_sizes[l]); - // skip big clusters - if (csize > static_cast(average * threshold)) return; - - // choose a "random" i that belongs to a rather large cluster - IdxT i; - IdxT j = laneId(); - if (j == 0) { - do { - auto old = atomicAdd(count, IdxT{1}); - i = (seed * (old + 1)) % n_rows; - } while (static_cast(cluster_sizes[labels[i]]) < average); - } - i = raft::shfl(i, 0); - - // Adjust the center of the selected smaller cluster to gravitate towards - // a sample from the selected larger cluster. - const IdxT li = static_cast(labels[i]); - // Weight of the current center for the weighted average. - // We dump it for anomalously small clusters, but keep constant otherwise. - const MathT wc = min(static_cast(csize), static_cast(kAdjustCentersWeight)); - // Weight for the datapoint used to shift the center. - const MathT wd = 1.0; - for (; j < dim; j += raft::WarpSize) { - MathT val = 0; - val += wc * centers[j + dim * li]; - val += wd * mapping_op(dataset[j + dim * i]); - val /= wc + wd; - centers[j + dim * l] = val; - } -} - -/** - * @brief Adjust centers for clusters that have small number of entries. - * - * For each cluster, where the cluster size is not bigger than a threshold, the center is moved - * towards a data point that belongs to a large cluster. - * - * NB: if this function returns `true`, you should update the labels. - * - * NB: all pointers must be on the device side. - * - * @tparam T element type - * @tparam MathT type of the centroids and mapped data - * @tparam IdxT index type - * @tparam LabelT label type - * @tparam CounterT counter type supported by CUDA's native atomicAdd - * @tparam MappingOpT type of the mapping operation - * - * @param[inout] centers cluster centers [n_clusters, dim] - * @param[in] n_clusters number of rows in `centers` - * @param[in] dim number of columns in `centers` and `dataset` - * @param[in] dataset a host pointer to the row-major data matrix [n_rows, dim] - * @param[in] n_rows number of rows in `dataset` - * @param[in] labels a host pointer to the cluster indices [n_rows] - * @param[in] cluster_sizes number of rows in each cluster [n_clusters] - * @param[in] threshold defines a criterion for adjusting a cluster - * (cluster_sizes <= average_size * threshold) - * 0 <= threshold < 1 - * @param[in] mapping_op Mapping operation from T to MathT - * @param[in] stream CUDA stream - * @param[inout] device_memory memory resource to use for temporary allocations - * - * @return whether any of the centers has been updated (and thus, `labels` need to be recalculated). - */ -template -auto adjust_centers(MathT* centers, - IdxT n_clusters, - IdxT dim, - const T* dataset, - IdxT n_rows, - const LabelT* labels, - const CounterT* cluster_sizes, - MathT threshold, - MappingOpT mapping_op, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* device_memory) -> bool -{ - raft::common::nvtx::range fun_scope( - "adjust_centers(%zu, %u)", static_cast(n_rows), n_clusters); - if (n_clusters == 0) { return false; } - constexpr static std::array kPrimes{29, 71, 113, 173, 229, 281, 349, 409, 463, 541, - 601, 659, 733, 809, 863, 941, 1013, 1069, 1151, 1223, - 1291, 1373, 1451, 1511, 1583, 1657, 1733, 1811, 1889, 1987, - 2053, 2129, 2213, 2287, 2357, 2423, 2531, 2617, 2687, 2741}; - static IdxT i = 0; - static IdxT i_primes = 0; - - bool adjusted = false; - IdxT average = n_rows / n_clusters; - IdxT ofst; - do { - i_primes = (i_primes + 1) % kPrimes.size(); - ofst = kPrimes[i_primes]; - } while (n_rows % ofst == 0); - - constexpr uint32_t kBlockDimY = 4; - const dim3 block_dim(WarpSize, kBlockDimY, 1); - const dim3 grid_dim(1, raft::ceildiv(n_clusters, static_cast(kBlockDimY)), 1); - rmm::device_scalar update_count(0, stream, device_memory); - adjust_centers_kernel<<>>(centers, - n_clusters, - dim, - dataset, - n_rows, - labels, - cluster_sizes, - threshold, - average, - ofst, - update_count.data(), - mapping_op); - adjusted = update_count.value(stream) > 0; // NB: rmm scalar performs the sync - - return adjusted; -} - -/** - * @brief Expectation-maximization-balancing combined in an iterative process. - * - * Note, the `cluster_centers` is assumed to be already initialized here. - * Thus, this function can be used for fine-tuning existing clusters; - * to train from scratch, use `build_clusters` function below. - * - * @tparam T element type - * @tparam MathT type of the centroids and mapped data - * @tparam IdxT index type - * @tparam LabelT label type - * @tparam CounterT counter type supported by CUDA's native atomicAdd - * @tparam MappingOpT type of the mapping operation - * - * @param[in] handle The raft handle - * @param[in] params Structure containing the hyper-parameters - * @param[in] n_iters Requested number of iterations (can differ from params.n_iter!) - * @param[in] dim Dimensionality of the dataset - * @param[in] dataset Pointer to a managed row-major array [n_rows, dim] - * @param[in] dataset_norm Pointer to the precomputed norm (for L2 metrics only) [n_rows] - * @param[in] n_rows Number of rows in the dataset - * @param[in] n_cluster Requested number of clusters - * @param[inout] cluster_centers Pointer to a managed row-major array [n_clusters, dim] - * @param[out] cluster_labels Pointer to a managed row-major array [n_rows] - * @param[out] cluster_sizes Pointer to a managed row-major array [n_clusters] - * @param[in] balancing_pullback - * if the cluster centers are rebalanced on this number of iterations, - * one extra iteration is performed (this could happen several times) (default should be `2`). - * In other words, the first and then every `ballancing_pullback`-th rebalancing operation adds - * one more iteration to the main cycle. - * @param[in] balancing_threshold - * the rebalancing takes place if any cluster is smaller than `avg_size * balancing_threshold` - * on a given iteration (default should be `~ 0.25`). - * @param[in] mapping_op Mapping operation from T to MathT - * @param[inout] device_memory - * A memory resource for device allocations (makes sense to provide a memory pool here) - */ -template -void balancing_em_iters(const raft::resources& handle, - const kmeans_balanced_params& params, - uint32_t n_iters, - IdxT dim, - const T* dataset, - const MathT* dataset_norm, - IdxT n_rows, - IdxT n_clusters, - MathT* cluster_centers, - LabelT* cluster_labels, - CounterT* cluster_sizes, - uint32_t balancing_pullback, - MathT balancing_threshold, - MappingOpT mapping_op, - rmm::mr::device_memory_resource* device_memory) -{ - auto stream = resource::get_cuda_stream(handle); - uint32_t balancing_counter = balancing_pullback; - for (uint32_t iter = 0; iter < n_iters; iter++) { - // Balancing step - move the centers around to equalize cluster sizes - // (but not on the first iteration) - if (iter > 0 && adjust_centers(cluster_centers, - n_clusters, - dim, - dataset, - n_rows, - cluster_labels, - cluster_sizes, - balancing_threshold, - mapping_op, - stream, - device_memory)) { - if (balancing_counter++ >= balancing_pullback) { - balancing_counter -= balancing_pullback; - n_iters++; - } - } - switch (params.metric) { - // For some metrics, cluster calculation and adjustment tends to favor zero center vectors. - // To avoid converging to zero, we normalize the center vectors on every iteration. - case cuvs::distance::DistanceType::InnerProduct: - case cuvs::distance::DistanceType::CosineExpanded: - case cuvs::distance::DistanceType::CorrelationExpanded: { - auto clusters_in_view = raft::make_device_matrix_view( - cluster_centers, n_clusters, dim); - auto clusters_out_view = raft::make_device_matrix_view( - cluster_centers, n_clusters, dim); - raft::linalg::row_normalize( - handle, clusters_in_view, clusters_out_view, raft::linalg::L2Norm); - break; - } - default: break; - } - // E: Expectation step - predict labels - predict(handle, - params, - cluster_centers, - n_clusters, - dim, - dataset, - n_rows, - cluster_labels, - mapping_op, - device_memory, - dataset_norm); - // M: Maximization step - calculate optimal cluster centers - calc_centers_and_sizes(handle, - cluster_centers, - cluster_sizes, - n_clusters, - dim, - dataset, - n_rows, - cluster_labels, - true, - mapping_op, - device_memory); - } -} - -/** Randomly initialize cluster centers and then call `balancing_em_iters`. */ -template -void build_clusters(const raft::resources& handle, - const kmeans_balanced_params& params, - IdxT dim, - const T* dataset, - IdxT n_rows, - IdxT n_clusters, - MathT* cluster_centers, - LabelT* cluster_labels, - CounterT* cluster_sizes, - MappingOpT mapping_op, - rmm::mr::device_memory_resource* device_memory, - const MathT* dataset_norm = nullptr) -{ - auto stream = resource::get_cuda_stream(handle); - - // "randomly" initialize labels - auto labels_view = raft::make_device_vector_view(cluster_labels, n_rows); - linalg::map_offset( - handle, - labels_view, - raft::compose_op(raft::cast_op(), raft::mod_const_op(n_clusters))); - - // update centers to match the initialized labels. - calc_centers_and_sizes(handle, - cluster_centers, - cluster_sizes, - n_clusters, - dim, - dataset, - n_rows, - cluster_labels, - true, - mapping_op, - device_memory); - - // run EM - balancing_em_iters(handle, - params, - params.n_iters, - dim, - dataset, - dataset_norm, - n_rows, - n_clusters, - cluster_centers, - cluster_labels, - cluster_sizes, - 2, - MathT{0.25}, - mapping_op, - device_memory); -} - -/** Calculate how many fine clusters should belong to each mesocluster. */ -template -inline auto arrange_fine_clusters(IdxT n_clusters, - IdxT n_mesoclusters, - IdxT n_rows, - const CounterT* mesocluster_sizes) -{ - std::vector fine_clusters_nums(n_mesoclusters); - std::vector fine_clusters_csum(n_mesoclusters + 1); - fine_clusters_csum[0] = 0; - - IdxT n_lists_rem = n_clusters; - IdxT n_nonempty_ms_rem = 0; - for (IdxT i = 0; i < n_mesoclusters; i++) { - n_nonempty_ms_rem += mesocluster_sizes[i] > CounterT{0} ? 1 : 0; - } - IdxT n_rows_rem = n_rows; - CounterT mesocluster_size_sum = 0; - CounterT mesocluster_size_max = 0; - IdxT fine_clusters_nums_max = 0; - for (IdxT i = 0; i < n_mesoclusters; i++) { - if (i < n_mesoclusters - 1) { - // Although the algorithm is meant to produce balanced clusters, when something - // goes wrong, we may get empty clusters (e.g. during development/debugging). - // The code below ensures a proportional arrangement of fine cluster numbers - // per mesocluster, even if some clusters are empty. - if (mesocluster_sizes[i] == 0) { - fine_clusters_nums[i] = 0; - } else { - n_nonempty_ms_rem--; - auto s = static_cast( - static_cast(n_lists_rem * mesocluster_sizes[i]) / n_rows_rem + .5); - s = std::min(s, n_lists_rem - n_nonempty_ms_rem); - fine_clusters_nums[i] = std::max(s, IdxT{1}); - } - } else { - fine_clusters_nums[i] = n_lists_rem; - } - n_lists_rem -= fine_clusters_nums[i]; - n_rows_rem -= mesocluster_sizes[i]; - mesocluster_size_max = max(mesocluster_size_max, mesocluster_sizes[i]); - mesocluster_size_sum += mesocluster_sizes[i]; - fine_clusters_nums_max = max(fine_clusters_nums_max, fine_clusters_nums[i]); - fine_clusters_csum[i + 1] = fine_clusters_csum[i] + fine_clusters_nums[i]; - } - - RAFT_EXPECTS(static_cast(mesocluster_size_sum) == n_rows, - "mesocluster sizes do not add up (%zu) to the total trainset size (%zu)", - static_cast(mesocluster_size_sum), - static_cast(n_rows)); - RAFT_EXPECTS(fine_clusters_csum[n_mesoclusters] == n_clusters, - "fine cluster numbers do not add up (%zu) to the total number of clusters (%zu)", - static_cast(fine_clusters_csum[n_mesoclusters]), - static_cast(n_clusters)); - - return std::make_tuple(static_cast(mesocluster_size_max), - fine_clusters_nums_max, - std::move(fine_clusters_nums), - std::move(fine_clusters_csum)); -} - -/** - * Given the (coarse) mesoclusters and the distribution of fine clusters within them, - * build the fine clusters. - * - * Processing one mesocluster at a time: - * 1. Copy mesocluster data into a separate buffer - * 2. Predict fine cluster - * 3. Refince the fine cluster centers - * - * As a result, the fine clusters are what is returned by `build_hierarchical`; - * this function returns the total number of fine clusters, which can be checked to be - * the same as the requested number of clusters. - * - * Note: this function uses at most `fine_clusters_nums_max` points per mesocluster for training; - * if one of the clusters is larger than that (as given by `mesocluster_sizes`), the extra data - * is ignored and a warning is reported. - */ -template -auto build_fine_clusters(const raft::resources& handle, - const kmeans_balanced_params& params, - IdxT dim, - const T* dataset_mptr, - const MathT* dataset_norm_mptr, - const LabelT* labels_mptr, - IdxT n_rows, - const IdxT* fine_clusters_nums, - const IdxT* fine_clusters_csum, - const CounterT* mesocluster_sizes, - IdxT n_mesoclusters, - IdxT mesocluster_size_max, - IdxT fine_clusters_nums_max, - MathT* cluster_centers, - MappingOpT mapping_op, - rmm::mr::device_memory_resource* managed_memory, - rmm::mr::device_memory_resource* device_memory) -> IdxT -{ - auto stream = resource::get_cuda_stream(handle); - rmm::device_uvector mc_trainset_ids_buf(mesocluster_size_max, stream, managed_memory); - rmm::device_uvector mc_trainset_buf(mesocluster_size_max * dim, stream, device_memory); - rmm::device_uvector mc_trainset_norm_buf(mesocluster_size_max, stream, device_memory); - auto mc_trainset_ids = mc_trainset_ids_buf.data(); - auto mc_trainset = mc_trainset_buf.data(); - auto mc_trainset_norm = mc_trainset_norm_buf.data(); - - // label (cluster ID) of each vector - rmm::device_uvector mc_trainset_labels(mesocluster_size_max, stream, device_memory); - - rmm::device_uvector mc_trainset_ccenters( - fine_clusters_nums_max * dim, stream, device_memory); - // number of vectors in each cluster - rmm::device_uvector mc_trainset_csizes_tmp( - fine_clusters_nums_max, stream, device_memory); - - // Training clusters in each meso-cluster - IdxT n_clusters_done = 0; - for (IdxT i = 0; i < n_mesoclusters; i++) { - IdxT k = 0; - for (IdxT j = 0; j < n_rows && k < mesocluster_size_max; j++) { - if (labels_mptr[j] == LabelT(i)) { mc_trainset_ids[k++] = j; } - } - if (k != static_cast(mesocluster_sizes[i])) - RAFT_LOG_WARN("Incorrect mesocluster size at %d. %zu vs %zu", - static_cast(i), - static_cast(k), - static_cast(mesocluster_sizes[i])); - if (k == 0) { - RAFT_LOG_DEBUG("Empty cluster %d", i); - RAFT_EXPECTS(fine_clusters_nums[i] == 0, - "Number of fine clusters must be zero for the empty mesocluster (got %d)", - static_cast(fine_clusters_nums[i])); - continue; - } else { - RAFT_EXPECTS(fine_clusters_nums[i] > 0, - "Number of fine clusters must be non-zero for a non-empty mesocluster"); - } - - cub::TransformInputIterator mapping_itr(dataset_mptr, mapping_op); - raft::matrix::gather(mapping_itr, dim, n_rows, mc_trainset_ids, k, mc_trainset, stream); - if (params.metric == cuvs::distance::DistanceType::L2Expanded || - params.metric == cuvs::distance::DistanceType::L2SqrtExpanded) { - thrust::gather(raft::resource::get_thrust_policy(handle), - mc_trainset_ids, - mc_trainset_ids + k, - dataset_norm_mptr, - mc_trainset_norm); - } - - build_clusters(handle, - params, - dim, - mc_trainset, - k, - fine_clusters_nums[i], - mc_trainset_ccenters.data(), - mc_trainset_labels.data(), - mc_trainset_csizes_tmp.data(), - mapping_op, - device_memory, - mc_trainset_norm); - - raft::copy(cluster_centers + (dim * fine_clusters_csum[i]), - mc_trainset_ccenters.data(), - fine_clusters_nums[i] * dim, - stream); - resource::sync_stream(handle, stream); - n_clusters_done += fine_clusters_nums[i]; - } - return n_clusters_done; -} - -/** - * @brief Hierarchical balanced k-means - * - * @tparam T element type - * @tparam MathT type of the centroids and mapped data - * @tparam IdxT index type - * @tparam LabelT label type - * @tparam MappingOpT type of the mapping operation - * - * @param[in] handle The raft handle. - * @param[in] params Structure containing the hyper-parameters - * @param dim number of columns in `centers` and `dataset` - * @param[in] dataset a device pointer to the source dataset [n_rows, dim] - * @param n_rows number of rows in the input - * @param[out] cluster_centers a device pointer to the found cluster centers [n_cluster, dim] - * @param n_cluster - * @param metric the distance type - * @param mapping_op Mapping operation from T to MathT - * @param stream - */ -template -void build_hierarchical(const raft::resources& handle, - const kmeans_balanced_params& params, - IdxT dim, - const T* dataset, - IdxT n_rows, - MathT* cluster_centers, - IdxT n_clusters, - MappingOpT mapping_op) -{ - auto stream = resource::get_cuda_stream(handle); - using LabelT = uint32_t; - - raft::common::nvtx::range fun_scope( - "build_hierarchical(%zu, %u)", static_cast(n_rows), n_clusters); - - IdxT n_mesoclusters = std::min(n_clusters, static_cast(std::sqrt(n_clusters) + 0.5)); - RAFT_LOG_DEBUG("build_hierarchical: n_mesoclusters: %u", n_mesoclusters); - - rmm::mr::managed_memory_resource managed_memory; - rmm::mr::device_memory_resource* device_memory = resource::get_workspace_resource(handle); - auto [max_minibatch_size, mem_per_row] = - calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); - auto pool_guard = - raft::get_pool_memory_resource(device_memory, mem_per_row * size_t(max_minibatch_size)); - if (pool_guard) { - RAFT_LOG_DEBUG("build_hierarchical: using pool memory resource with initial size %zu bytes", - mem_per_row * size_t(max_minibatch_size)); - } - - // Precompute the L2 norm of the dataset if relevant. - const MathT* dataset_norm = nullptr; - rmm::device_uvector dataset_norm_buf(0, stream, device_memory); - if (params.metric == cuvs::distance::DistanceType::L2Expanded || - params.metric == cuvs::distance::DistanceType::L2SqrtExpanded) { - dataset_norm_buf.resize(n_rows, stream); - for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) { - IdxT minibatch_size = std::min(max_minibatch_size, n_rows - offset); - compute_norm(handle, - dataset_norm_buf.data() + offset, - dataset + dim * offset, - dim, - minibatch_size, - mapping_op, - device_memory); - } - dataset_norm = (const MathT*)dataset_norm_buf.data(); - } - - /* Temporary workaround to cub::DeviceHistogram not supporting any type that isn't natively - * supported by atomicAdd: find a supported CounterT based on the IdxT. */ - typedef typename std::conditional_t - CounterT; - - // build coarse clusters (mesoclusters) - rmm::device_uvector mesocluster_labels_buf(n_rows, stream, &managed_memory); - rmm::device_uvector mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory); - { - rmm::device_uvector mesocluster_centers_buf(n_mesoclusters * dim, stream, device_memory); - build_clusters(handle, - params, - dim, - dataset, - n_rows, - n_mesoclusters, - mesocluster_centers_buf.data(), - mesocluster_labels_buf.data(), - mesocluster_sizes_buf.data(), - mapping_op, - device_memory, - dataset_norm); - } - - auto mesocluster_sizes = mesocluster_sizes_buf.data(); - auto mesocluster_labels = mesocluster_labels_buf.data(); - - resource::sync_stream(handle, stream); - - // build fine clusters - auto [mesocluster_size_max, fine_clusters_nums_max, fine_clusters_nums, fine_clusters_csum] = - arrange_fine_clusters(n_clusters, n_mesoclusters, n_rows, mesocluster_sizes); - - const IdxT mesocluster_size_max_balanced = div_rounding_up_safe( - 2lu * size_t(n_rows), std::max(size_t(n_mesoclusters), 1lu)); - if (mesocluster_size_max > mesocluster_size_max_balanced) { - RAFT_LOG_WARN( - "build_hierarchical: built unbalanced mesoclusters (max_mesocluster_size == %u > %u). " - "At most %u points will be used for training within each mesocluster. " - "Consider increasing the number of training iterations `n_iters`.", - mesocluster_size_max, - mesocluster_size_max_balanced, - mesocluster_size_max_balanced); - RAFT_LOG_TRACE_VEC(mesocluster_sizes, n_mesoclusters); - RAFT_LOG_TRACE_VEC(fine_clusters_nums.data(), n_mesoclusters); - mesocluster_size_max = mesocluster_size_max_balanced; - } - - auto n_clusters_done = build_fine_clusters(handle, - params, - dim, - dataset, - dataset_norm, - mesocluster_labels, - n_rows, - fine_clusters_nums.data(), - fine_clusters_csum.data(), - mesocluster_sizes, - n_mesoclusters, - mesocluster_size_max, - fine_clusters_nums_max, - cluster_centers, - mapping_op, - &managed_memory, - device_memory); - RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters."); - - rmm::device_uvector cluster_sizes(n_clusters, stream, device_memory); - rmm::device_uvector labels(n_rows, stream, device_memory); - - // Fine-tuning k-means for all clusters - // - // (*) Since the likely cluster centroids have been calculated hierarchically already, the number - // of iterations for fine-tuning kmeans for whole clusters should be reduced. However, there is a - // possibility that the clusters could be unbalanced here, in which case the actual number of - // iterations would be increased. - // - balancing_em_iters(handle, - params, - std::max(params.n_iters / 10, 2), - dim, - dataset, - dataset_norm, - n_rows, - n_clusters, - cluster_centers, - labels.data(), - cluster_sizes.data(), - 5, - MathT{0.2}, - mapping_op, - device_memory); -} - -} // namespace cuvs::cluster::detail diff --git a/cpp/include/cuvs/cluster/detail/kmeans_common.cuh b/cpp/include/cuvs/cluster/detail/kmeans_common.cuh deleted file mode 100644 index d4f6a43a2..000000000 --- a/cpp/include/cuvs/cluster/detail/kmeans_common.cuh +++ /dev/null @@ -1,663 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace cuvs { -namespace cluster { -namespace detail { - -template -struct SamplingOp { - DataT* rnd; - uint8_t* flag; - DataT cluster_cost; - double oversampling_factor; - IndexT n_clusters; - - CUB_RUNTIME_FUNCTION __forceinline__ - SamplingOp(DataT c, double l, IndexT k, DataT* rand, uint8_t* ptr) - : cluster_cost(c), oversampling_factor(l), n_clusters(k), rnd(rand), flag(ptr) - { - } - - __host__ __device__ __forceinline__ bool operator()( - const raft::KeyValuePair& a) const - { - DataT prob_threshold = (DataT)rnd[a.key]; - - DataT prob_x = ((oversampling_factor * n_clusters * a.value) / cluster_cost); - - return !flag[a.key] && (prob_x > prob_threshold); - } -}; - -template -struct KeyValueIndexOp { - __host__ __device__ __forceinline__ IndexT - operator()(const raft::KeyValuePair& a) const - { - return a.key; - } -}; - -// Computes the intensity histogram from a sequence of labels -template -void countLabels(raft::resources const& handle, - SampleIteratorT labels, - CounterT* count, - IndexT n_samples, - IndexT n_clusters, - rmm::device_uvector& workspace) -{ - cudaStream_t stream = resource::get_cuda_stream(handle); - - // CUB::DeviceHistogram requires a signed index type - typedef typename std::make_signed_t CubIndexT; - - CubIndexT num_levels = n_clusters + 1; - CubIndexT lower_level = 0; - CubIndexT upper_level = n_clusters; - - size_t temp_storage_bytes = 0; - RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(nullptr, - temp_storage_bytes, - labels, - count, - num_levels, - lower_level, - upper_level, - static_cast(n_samples), - stream)); - - workspace.resize(temp_storage_bytes, stream); - - RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace.data(), - temp_storage_bytes, - labels, - count, - num_levels, - lower_level, - upper_level, - static_cast(n_samples), - stream)); -} - -template -void checkWeight(raft::resources const& handle, - raft::device_vector_view weight, - rmm::device_uvector& workspace) -{ - cudaStream_t stream = resource::get_cuda_stream(handle); - auto wt_aggr = raft::make_device_scalar(handle, 0); - auto n_samples = weight.extent(0); - - size_t temp_storage_bytes = 0; - RAFT_CUDA_TRY(cub::DeviceReduce::Sum( - nullptr, temp_storage_bytes, weight.data_handle(), wt_aggr.data_handle(), n_samples, stream)); - - workspace.resize(temp_storage_bytes, stream); - - RAFT_CUDA_TRY(cub::DeviceReduce::Sum(workspace.data(), - temp_storage_bytes, - weight.data_handle(), - wt_aggr.data_handle(), - n_samples, - stream)); - DataT wt_sum = 0; - raft::copy(&wt_sum, wt_aggr.data_handle(), 1, stream); - resource::sync_stream(handle, stream); - - if (wt_sum != n_samples) { - RAFT_LOG_DEBUG( - "[Warning!] KMeans: normalizing the user provided sample weight to " - "sum up to %d samples", - n_samples); - - auto scale = static_cast(n_samples) / wt_sum; - raft::linalg::unaryOp(weight.data_handle(), - weight.data_handle(), - n_samples, - raft::mul_const_op{scale}, - stream); - } -} - -template -IndexT getDataBatchSize(int batch_samples, IndexT n_samples) -{ - auto minVal = std::min(static_cast(batch_samples), n_samples); - return (minVal == 0) ? n_samples : minVal; -} - -template -IndexT getCentroidsBatchSize(int batch_centroids, IndexT n_local_clusters) -{ - auto minVal = std::min(static_cast(batch_centroids), n_local_clusters); - return (minVal == 0) ? n_local_clusters : minVal; -} - -template -void computeClusterCost(raft::resources const& handle, - raft::device_vector_view minClusterDistance, - rmm::device_uvector& workspace, - raft::device_scalar_view clusterCost, - MainOpT main_op, - ReductionOpT reduction_op) -{ - cudaStream_t stream = resource::get_cuda_stream(handle); - - cub::TransformInputIterator itr(minClusterDistance.data_handle(), - main_op); - - size_t temp_storage_bytes = 0; - RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(nullptr, - temp_storage_bytes, - itr, - clusterCost.data_handle(), - minClusterDistance.size(), - reduction_op, - OutputT(), - stream)); - - workspace.resize(temp_storage_bytes, stream); - - RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(workspace.data(), - temp_storage_bytes, - itr, - clusterCost.data_handle(), - minClusterDistance.size(), - reduction_op, - OutputT(), - stream)); -} - -template -void sampleCentroids(raft::resources const& handle, - raft::device_matrix_view X, - raft::device_vector_view minClusterDistance, - raft::device_vector_view isSampleCentroid, - SamplingOp& select_op, - rmm::device_uvector& inRankCp, - rmm::device_uvector& workspace) -{ - cudaStream_t stream = resource::get_cuda_stream(handle); - auto n_local_samples = X.extent(0); - auto n_features = X.extent(1); - - auto nSelected = raft::make_device_scalar(handle, 0); - cub::ArgIndexInputIterator ip_itr(minClusterDistance.data_handle()); - auto sampledMinClusterDistance = - raft::make_device_vector, IndexT>(handle, n_local_samples); - size_t temp_storage_bytes = 0; - RAFT_CUDA_TRY(cub::DeviceSelect::If(nullptr, - temp_storage_bytes, - ip_itr, - sampledMinClusterDistance.data_handle(), - nSelected.data_handle(), - n_local_samples, - select_op, - stream)); - - workspace.resize(temp_storage_bytes, stream); - - RAFT_CUDA_TRY(cub::DeviceSelect::If(workspace.data(), - temp_storage_bytes, - ip_itr, - sampledMinClusterDistance.data_handle(), - nSelected.data_handle(), - n_local_samples, - select_op, - stream)); - - IndexT nPtsSampledInRank = 0; - raft::copy(&nPtsSampledInRank, nSelected.data_handle(), 1, stream); - resource::sync_stream(handle, stream); - - uint8_t* rawPtr_isSampleCentroid = isSampleCentroid.data_handle(); - thrust::for_each_n(raft::resource::get_thrust_policy(handle), - sampledMinClusterDistance.data_handle(), - nPtsSampledInRank, - [=] __device__(raft::KeyValuePair val) { - rawPtr_isSampleCentroid[val.key] = 1; - }); - - inRankCp.resize(nPtsSampledInRank * n_features, stream); - - raft::matrix::gather((DataT*)X.data_handle(), - X.extent(1), - X.extent(0), - sampledMinClusterDistance.data_handle(), - nPtsSampledInRank, - inRankCp.data(), - raft::key_op{}, - stream); -} - -// calculate pairwise distance between 'dataset[n x d]' and 'centroids[k x d]', -// result will be stored in 'pairwiseDistance[n x k]' -template -void pairwise_distance_kmeans(raft::resources const& handle, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_matrix_view pairwiseDistance, - rmm::device_uvector& workspace, - cuvs::distance::DistanceType metric) -{ - auto n_samples = X.extent(0); - auto n_features = X.extent(1); - auto n_clusters = centroids.extent(0); - - ASSERT(X.extent(1) == centroids.extent(1), - "# features in dataset and centroids are different (must be same)"); - - cuvs::distance::pairwise_distance(handle, - X.data_handle(), - centroids.data_handle(), - pairwiseDistance.data_handle(), - n_samples, - n_clusters, - n_features, - workspace, - metric); -} - -// shuffle and randomly select 'n_samples_to_gather' from input 'in' and stores -// in 'out' does not modify the input -template -void shuffleAndGather(raft::resources const& handle, - raft::device_matrix_view in, - raft::device_matrix_view out, - uint32_t n_samples_to_gather, - uint64_t seed) -{ - cudaStream_t stream = resource::get_cuda_stream(handle); - auto n_samples = in.extent(0); - auto n_features = in.extent(1); - - auto indices = raft::make_device_vector(handle, n_samples); - - // shuffle indices on device - raft::random::permute(indices.data_handle(), - nullptr, - nullptr, - (IndexT)in.extent(1), - (IndexT)in.extent(0), - true, - stream); - - raft::matrix::gather((DataT*)in.data_handle(), - in.extent(1), - in.extent(0), - indices.data_handle(), - static_cast(n_samples_to_gather), - out.data_handle(), - stream); -} - -// Calculates a pair for every sample in input 'X' where key is an -// index to an sample in 'centroids' (index of the nearest centroid) and 'value' -// is the distance between the sample and the 'centroid[key]' -template -void minClusterAndDistanceCompute( - raft::resources const& handle, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view, IndexT> minClusterAndDistance, - raft::device_vector_view L2NormX, - rmm::device_uvector& L2NormBuf_OR_DistBuf, - cuvs::distance::DistanceType metric, - int batch_samples, - int batch_centroids, - rmm::device_uvector& workspace) -{ - cudaStream_t stream = resource::get_cuda_stream(handle); - auto n_samples = X.extent(0); - auto n_features = X.extent(1); - auto n_clusters = centroids.extent(0); - // todo(lsugy): change batch size computation when using fusedL2NN! - bool is_fused = metric == cuvs::distance::DistanceType::L2Expanded || - metric == cuvs::distance::DistanceType::L2SqrtExpanded; - auto dataBatchSize = is_fused ? (IndexT)n_samples : getDataBatchSize(batch_samples, n_samples); - auto centroidsBatchSize = getCentroidsBatchSize(batch_centroids, n_clusters); - - if (is_fused) { - L2NormBuf_OR_DistBuf.resize(n_clusters, stream); - raft::linalg::rowNorm(L2NormBuf_OR_DistBuf.data(), - centroids.data_handle(), - centroids.extent(1), - centroids.extent(0), - raft::linalg::L2Norm, - true, - stream); - } else { - // TODO: Unless pool allocator is used, passing in a workspace for this - // isn't really increasing performance because this needs to do a re-allocation - // anyways. ref https://github.com/rapidsai/raft/issues/930 - L2NormBuf_OR_DistBuf.resize(dataBatchSize * centroidsBatchSize, stream); - } - - // Note - pairwiseDistance and centroidsNorm share the same buffer - // centroidsNorm [n_clusters] - tensor wrapper around centroids L2 Norm - auto centroidsNorm = - raft::make_device_vector_view(L2NormBuf_OR_DistBuf.data(), n_clusters); - // pairwiseDistance[ns x nc] - tensor wrapper around the distance buffer - auto pairwiseDistance = raft::make_device_matrix_view( - L2NormBuf_OR_DistBuf.data(), dataBatchSize, centroidsBatchSize); - - raft::KeyValuePair initial_value(0, std::numeric_limits::max()); - - thrust::fill(raft::resource::get_thrust_policy(handle), - minClusterAndDistance.data_handle(), - minClusterAndDistance.data_handle() + minClusterAndDistance.size(), - initial_value); - - // tile over the input dataset - for (IndexT dIdx = 0; dIdx < n_samples; dIdx += dataBatchSize) { - // # of samples for the current batch - auto ns = std::min((IndexT)dataBatchSize, n_samples - dIdx); - - // datasetView [ns x n_features] - view representing the current batch of - // input dataset - auto datasetView = raft::make_device_matrix_view( - X.data_handle() + (dIdx * n_features), ns, n_features); - - // minClusterAndDistanceView [ns x n_clusters] - auto minClusterAndDistanceView = - raft::make_device_vector_view, IndexT>( - minClusterAndDistance.data_handle() + dIdx, ns); - - auto L2NormXView = - raft::make_device_vector_view(L2NormX.data_handle() + dIdx, ns); - - if (is_fused) { - workspace.resize((sizeof(int)) * ns, stream); - - // todo(lsugy): remove cIdx - cuvs::distance::fusedL2NNMinReduce, IndexT>( - minClusterAndDistanceView.data_handle(), - datasetView.data_handle(), - centroids.data_handle(), - L2NormXView.data_handle(), - centroidsNorm.data_handle(), - ns, - n_clusters, - n_features, - (void*)workspace.data(), - metric != cuvs::distance::DistanceType::L2Expanded, - false, - stream); - } else { - // tile over the centroids - for (IndexT cIdx = 0; cIdx < n_clusters; cIdx += centroidsBatchSize) { - // # of centroids for the current batch - auto nc = std::min((IndexT)centroidsBatchSize, n_clusters - cIdx); - - // centroidsView [nc x n_features] - view representing the current batch - // of centroids - auto centroidsView = raft::make_device_matrix_view( - centroids.data_handle() + (cIdx * n_features), nc, n_features); - - // pairwiseDistanceView [ns x nc] - view representing the pairwise - // distance for current batch - auto pairwiseDistanceView = - raft::make_device_matrix_view(pairwiseDistance.data_handle(), ns, nc); - - // calculate pairwise distance between current tile of cluster centroids - // and input dataset - pairwise_distance_kmeans( - handle, datasetView, centroidsView, pairwiseDistanceView, workspace, metric); - - // argmin reduction returning pair - // calculates the closest centroid and the distance to the closest - // centroid - raft::linalg::coalescedReduction( - minClusterAndDistanceView.data_handle(), - pairwiseDistanceView.data_handle(), - pairwiseDistanceView.extent(1), - pairwiseDistanceView.extent(0), - initial_value, - stream, - true, - [=] __device__(const DataT val, const IndexT i) { - raft::KeyValuePair pair; - pair.key = cIdx + i; - pair.value = val; - return pair; - }, - raft::argmin_op{}, - raft::identity_op{}); - } - } - } -} - -template -void minClusterDistanceCompute(raft::resources const& handle, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view minClusterDistance, - raft::device_vector_view L2NormX, - rmm::device_uvector& L2NormBuf_OR_DistBuf, - cuvs::distance::DistanceType metric, - int batch_samples, - int batch_centroids, - rmm::device_uvector& workspace) -{ - cudaStream_t stream = resource::get_cuda_stream(handle); - auto n_samples = X.extent(0); - auto n_features = X.extent(1); - auto n_clusters = centroids.extent(0); - - bool is_fused = metric == cuvs::distance::DistanceType::L2Expanded || - metric == cuvs::distance::DistanceType::L2SqrtExpanded; - auto dataBatchSize = is_fused ? (IndexT)n_samples : getDataBatchSize(batch_samples, n_samples); - auto centroidsBatchSize = getCentroidsBatchSize(batch_centroids, n_clusters); - - if (is_fused) { - L2NormBuf_OR_DistBuf.resize(n_clusters, stream); - raft::linalg::rowNorm(L2NormBuf_OR_DistBuf.data(), - centroids.data_handle(), - centroids.extent(1), - centroids.extent(0), - raft::linalg::L2Norm, - true, - stream); - } else { - L2NormBuf_OR_DistBuf.resize(dataBatchSize * centroidsBatchSize, stream); - } - - // Note - pairwiseDistance and centroidsNorm share the same buffer - // centroidsNorm [n_clusters] - tensor wrapper around centroids L2 Norm - auto centroidsNorm = - raft::make_device_vector_view(L2NormBuf_OR_DistBuf.data(), n_clusters); - // pairwiseDistance[ns x nc] - tensor wrapper around the distance buffer - auto pairwiseDistance = raft::make_device_matrix_view( - L2NormBuf_OR_DistBuf.data(), dataBatchSize, centroidsBatchSize); - - thrust::fill(raft::resource::get_thrust_policy(handle), - minClusterDistance.data_handle(), - minClusterDistance.data_handle() + minClusterDistance.size(), - std::numeric_limits::max()); - - // tile over the input data and calculate distance matrix [n_samples x - // n_clusters] - for (IndexT dIdx = 0; dIdx < n_samples; dIdx += dataBatchSize) { - // # of samples for the current batch - auto ns = std::min((IndexT)dataBatchSize, n_samples - dIdx); - - // datasetView [ns x n_features] - view representing the current batch of - // input dataset - auto datasetView = raft::make_device_matrix_view( - X.data_handle() + dIdx * n_features, ns, n_features); - - // minClusterDistanceView [ns x n_clusters] - auto minClusterDistanceView = - raft::make_device_vector_view(minClusterDistance.data_handle() + dIdx, ns); - - auto L2NormXView = - raft::make_device_vector_view(L2NormX.data_handle() + dIdx, ns); - - if (is_fused) { - workspace.resize((sizeof(IndexT)) * ns, stream); - - cuvs::distance::fusedL2NNMinReduce( - minClusterDistanceView.data_handle(), - datasetView.data_handle(), - centroids.data_handle(), - L2NormXView.data_handle(), - centroidsNorm.data_handle(), - ns, - n_clusters, - n_features, - (void*)workspace.data(), - metric != cuvs::distance::DistanceType::L2Expanded, - false, - stream); - } else { - // tile over the centroids - for (IndexT cIdx = 0; cIdx < n_clusters; cIdx += centroidsBatchSize) { - // # of centroids for the current batch - auto nc = std::min((IndexT)centroidsBatchSize, n_clusters - cIdx); - - // centroidsView [nc x n_features] - view representing the current batch - // of centroids - auto centroidsView = raft::make_device_matrix_view( - centroids.data_handle() + cIdx * n_features, nc, n_features); - - // pairwiseDistanceView [ns x nc] - view representing the pairwise - // distance for current batch - auto pairwiseDistanceView = - raft::make_device_matrix_view(pairwiseDistance.data_handle(), ns, nc); - - // calculate pairwise distance between current tile of cluster centroids - // and input dataset - pairwise_distance_kmeans( - handle, datasetView, centroidsView, pairwiseDistanceView, workspace, metric); - - raft::linalg::coalescedReduction(minClusterDistanceView.data_handle(), - pairwiseDistanceView.data_handle(), - pairwiseDistanceView.extent(1), - pairwiseDistanceView.extent(0), - std::numeric_limits::max(), - stream, - true, - raft::identity_op{}, - raft::min_op{}, - raft::identity_op{}); - } - } - } -} - -template -void countSamplesInCluster(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_vector_view L2NormX, - raft::device_matrix_view centroids, - rmm::device_uvector& workspace, - raft::device_vector_view sampleCountInCluster) -{ - cudaStream_t stream = resource::get_cuda_stream(handle); - auto n_samples = X.extent(0); - auto n_features = X.extent(1); - auto n_clusters = centroids.extent(0); - - // stores (key, value) pair corresponding to each sample where - // - key is the index of nearest cluster - // - value is the distance to the nearest cluster - auto minClusterAndDistance = - raft::make_device_vector, IndexT>(handle, n_samples); - - // temporary buffer to store distance matrix, destructor releases the resource - rmm::device_uvector L2NormBuf_OR_DistBuf(0, stream); - - // computes minClusterAndDistance[0:n_samples) where minClusterAndDistance[i] - // is a pair where - // 'key' is index to an sample in 'centroids' (index of the nearest - // centroid) and 'value' is the distance between the sample 'X[i]' and the - // 'centroid[key]' - detail::minClusterAndDistanceCompute(handle, - X, - (raft::device_matrix_view)centroids, - minClusterAndDistance.view(), - L2NormX, - L2NormBuf_OR_DistBuf, - params.metric, - params.batch_samples, - params.batch_centroids, - workspace); - - // Using TransformInputIteratorT to dereference an array of raft::KeyValuePair - // and converting them to just return the Key to be used in reduce_rows_by_key - // prims - detail::KeyValueIndexOp conversion_op; - cub::TransformInputIterator, - raft::KeyValuePair*> - itr(minClusterAndDistance.data_handle(), conversion_op); - - // count # of samples in each cluster - countLabels(handle, - itr, - sampleCountInCluster.data_handle(), - (IndexT)n_samples, - (IndexT)n_clusters, - workspace); -} -} // namespace detail -} // namespace cluster -} // namespace cuvs diff --git a/cpp/include/cuvs/cluster/detail/mst.cuh b/cpp/include/cuvs/cluster/detail/mst.cuh deleted file mode 100644 index 6d304d64c..000000000 --- a/cpp/include/cuvs/cluster/detail/mst.cuh +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include - -namespace cuvs::cluster::detail { - -template -void merge_msts(sparse::solver::Graph_COO& coo1, - sparse::solver::Graph_COO& coo2, - cudaStream_t stream) -{ - /** Add edges to existing mst **/ - int final_nnz = coo2.n_edges + coo1.n_edges; - - coo1.src.resize(final_nnz, stream); - coo1.dst.resize(final_nnz, stream); - coo1.weights.resize(final_nnz, stream); - - /** - * Construct final edge list - */ - raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), coo2.n_edges, stream); - raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), coo2.n_edges, stream); - raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), coo2.n_edges, stream); - - coo1.n_edges = final_nnz; -} - -/** - * Connect an unconnected knn graph (one in which mst returns an msf). The - * device buffers underlying the Graph_COO object are modified in-place. - * @tparam value_idx index type - * @tparam value_t floating-point value type - * @param[in] handle raft handle - * @param[in] X original dense data from which knn grpah was constructed - * @param[inout] msf edge list containing the mst result - * @param[in] m number of rows in X - * @param[in] n number of columns in X - * @param[inout] color the color labels array returned from the mst invocation - * @return updated MST edge list - */ -template -void connect_knn_graph( - raft::resources const& handle, - const value_t* X, - sparse::solver::Graph_COO& msf, - size_t m, - size_t n, - value_idx* color, - red_op reduction_op, - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2SqrtExpanded) -{ - auto stream = resource::get_cuda_stream(handle); - - raft::sparse::COO connected_edges(stream); - - // default row and column batch sizes are chosen for computing cross component nearest neighbors. - // Reference: PR #1445 - static constexpr size_t default_row_batch_size = 4096; - static constexpr size_t default_col_batch_size = 16; - - raft::sparse::neighbors::cross_component_nn(handle, - connected_edges, - X, - color, - m, - n, - reduction_op, - min(m, default_row_batch_size), - min(n, default_col_batch_size)); - - rmm::device_uvector indptr2(m + 1, stream); - raft::sparse::convert::sorted_coo_to_csr( - connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, stream); - - // On the second call, we hand the MST the original colors - // and the new set of edges and let it restart the optimization process - auto new_mst = - raft::sparse::solver::mst(handle, - indptr2.data(), - connected_edges.cols(), - connected_edges.vals(), - m, - connected_edges.nnz, - color, - stream, - false, - false); - - merge_msts(msf, new_mst, stream); -} - -/** - * Constructs an MST and sorts the resulting edges in ascending - * order by their weight. - * - * Hierarchical clustering heavily relies upon the ordering - * and vertices returned in the MST. If the result of the - * MST was actually a minimum-spanning forest, the CSR - * being passed into the MST is not connected. In such a - * case, this graph will be connected by performing a - * KNN across the components. - * @tparam value_idx - * @tparam value_t - * @param[in] handle raft handle - * @param[in] indptr CSR indptr of connectivities graph - * @param[in] indices CSR indices array of connectivities graph - * @param[in] pw_dists CSR weights array of connectivities graph - * @param[in] m number of rows in X / src vertices in connectivities graph - * @param[in] n number of columns in X - * @param[out] mst_src output src edges - * @param[out] mst_dst output dst edges - * @param[out] mst_weight output weights (distances) - * @param[in] max_iter maximum iterations to run knn graph connection. This - * argument is really just a safeguard against the potential for infinite loops. - */ -template -void build_sorted_mst( - raft::resources const& handle, - const value_t* X, - const value_idx* indptr, - const value_idx* indices, - const value_t* pw_dists, - size_t m, - size_t n, - value_idx* mst_src, - value_idx* mst_dst, - value_t* mst_weight, - value_idx* color, - size_t nnz, - red_op reduction_op, - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2SqrtExpanded, - int max_iter = 10) -{ - auto stream = resource::get_cuda_stream(handle); - - // We want to have MST initialize colors on first call. - auto mst_coo = raft::sparse::solver::mst( - handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, true); - - int iters = 1; - int n_components = raft::sparse::neighbors::get_n_components(color, m, stream); - - while (n_components > 1 && iters < max_iter) { - connect_knn_graph(handle, X, mst_coo, m, n, color, reduction_op); - - iters++; - - n_components = raft::sparse::neighbors::get_n_components(color, m, stream); - } - - /** - * The `max_iter` argument was introduced only to prevent the potential for an infinite loop. - * Ideally the log2(n) guarantees of the MST should be enough to connect KNN graphs with a - * massive number of data samples in very few iterations. If it does not, there are 3 likely - * reasons why (in order of their likelihood): - * 1. There is a bug in this code somewhere - * 2. Either the given KNN graph wasn't generated from X or the same metric is not being used - * to generate the 1-nn (currently only L2SqrtExpanded is supported). - * 3. max_iter was not large enough to connect the graph (less likely). - * - * Note that a KNN graph generated from 50 random isotropic balls (with significant overlap) - * was able to be connected in a single iteration. - */ - RAFT_EXPECTS(n_components == 1, - "KNN graph could not be connected in %d iterations. " - "Please verify that the input knn graph is generated from X " - "(and the same distance metric used)," - " or increase 'max_iter'", - max_iter); - - raft::sparse::op::coo_sort_by_weight( - mst_coo.src.data(), mst_coo.dst.data(), mst_coo.weights.data(), mst_coo.n_edges, stream); - - raft::copy_async(mst_src, mst_coo.src.data(), mst_coo.n_edges, stream); - raft::copy_async(mst_dst, mst_coo.dst.data(), mst_coo.n_edges, stream); - raft::copy_async(mst_weight, mst_coo.weights.data(), mst_coo.n_edges, stream); -} - -}; // namespace cuvs::cluster::detail \ No newline at end of file diff --git a/cpp/include/cuvs/cluster/detail/single_linkage.cuh b/cpp/include/cuvs/cluster/detail/single_linkage.cuh deleted file mode 100644 index 5eb5ffb61..000000000 --- a/cpp/include/cuvs/cluster/detail/single_linkage.cuh +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include -#include -#include -#include - -namespace cuvs::cluster::detail { - -static const size_t EMPTY = 0; - -/** - * Single-linkage clustering, capable of constructing a KNN graph to - * scale the algorithm beyond the n^2 memory consumption of implementations - * that use the fully-connected graph of pairwise distances by connecting - * a knn graph when k is not large enough to connect it. - - * @tparam value_idx - * @tparam value_t - * @tparam dist_type method to use for constructing connectivities graph - * @param[in] handle raft handle - * @param[in] X dense input matrix in row-major layout - * @param[in] m number of rows in X - * @param[in] n number of columns in X - * @param[in] metric distance metrix to use when constructing connectivities graph - * @param[out] out struct containing output dendrogram and cluster assignments - * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect - control - * of k. The algorithm will set `k = log(n) + c` - * @param[in] n_clusters number of clusters to assign data samples - */ -template -void single_linkage(raft::resources const& handle, - const value_t* X, - size_t m, - size_t n, - cuvs::distance::DistanceType metric, - linkage_output* out, - int c, - size_t n_clusters) -{ - ASSERT(n_clusters <= m, "n_clusters must be less than or equal to the number of data points"); - - auto stream = resource::get_cuda_stream(handle); - - rmm::device_uvector indptr(EMPTY, stream); - rmm::device_uvector indices(EMPTY, stream); - rmm::device_uvector pw_dists(EMPTY, stream); - - /** - * 1. Construct distance graph - */ - detail::get_distance_graph( - handle, X, m, n, metric, indptr, indices, pw_dists, c); - - rmm::device_uvector mst_rows(m - 1, stream); - rmm::device_uvector mst_cols(m - 1, stream); - rmm::device_uvector mst_data(m - 1, stream); - - /** - * 2. Construct MST, sorted by weights - */ - rmm::device_uvector color(m, stream); - raft::sparse::neighbors::FixConnectivitiesRedOp op(m); - detail::build_sorted_mst(handle, - X, - indptr.data(), - indices.data(), - pw_dists.data(), - m, - n, - mst_rows.data(), - mst_cols.data(), - mst_data.data(), - color.data(), - indices.size(), - op, - metric); - - pw_dists.release(); - - /** - * Perform hierarchical labeling - */ - size_t n_edges = mst_rows.size(); - - rmm::device_uvector out_delta(n_edges, stream); - rmm::device_uvector out_size(n_edges, stream); - // Create dendrogram - detail::build_dendrogram_host(handle, - mst_rows.data(), - mst_cols.data(), - mst_data.data(), - n_edges, - out->children, - out_delta.data(), - out_size.data()); - detail::extract_flattened_clusters(handle, out->labels, out->children, n_clusters, m); - - out->m = m; - out->n_clusters = n_clusters; - out->n_leaves = m; - out->n_connected_components = 1; -} -}; // namespace cuvs::cluster::detail \ No newline at end of file diff --git a/cpp/include/cuvs/cluster/kmeans.cuh b/cpp/include/cuvs/cluster/kmeans.cuh deleted file mode 100644 index e773a09ea..000000000 --- a/cpp/include/cuvs/cluster/kmeans.cuh +++ /dev/null @@ -1,1116 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cuvs::cluster::kmeans { - -/** - * Functor used for sampling centroids - */ -template -using SamplingOp = detail::SamplingOp; - -/** - * Functor used to extract the index from a KeyValue pair - * storing both index and a distance. - */ -template -using KeyValueIndexOp = detail::KeyValueIndexOp; - -/** - * @brief Find clusters with k-means algorithm. - * Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * - * @code{.cpp} - * #include - * #include - * #include - * using namespace cuvs::cluster; - * ... - * raft::raft::resources handle; - * cuvs::cluster::KMeansParams params; - * int n_features = 15, inertia, n_iter; - * auto centroids = raft::make_device_matrix(handle, params.n_clusters, n_features); - * - * kmeans::fit(handle, - * params, - * X, - * std::nullopt, - * centroids, - * raft::make_scalar_view(&inertia), - * raft::make_scalar_view(&n_iter)); - * @endcode - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must - * be in row-major format. - * [dim = n_samples x n_features] - * @param[in] sample_weight Optional weights for each observation in X. - * [len = n_samples] - * @param[inout] centroids [in] When init is InitMethod::Array, use - * centroids as the initial cluster centers. - * [out] The generated centroids from the - * kmeans algorithm are stored at the address - * pointed by 'centroids'. - * [dim = n_clusters x n_features] - * @param[out] inertia Sum of squared distances of samples to their - * closest cluster center. - * @param[out] n_iter Number of iterations run. - */ -template -void fit(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - raft::device_matrix_view centroids, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter) -{ - detail::kmeans_fit(handle, params, X, sample_weight, centroids, inertia, n_iter); -} - -/** - * @brief Predict the closest cluster each sample in X belongs to. - * - * @code{.cpp} - * #include - * #include - * #include - * using namespace cuvs::cluster; - * ... - * raft::raft::resources handle; - * cuvs::cluster::KMeansParams params; - * int n_features = 15, inertia, n_iter; - * auto centroids = raft::make_device_matrix(handle, params.n_clusters, n_features); - * - * kmeans::fit(handle, - * params, - * X, - * std::nullopt, - * centroids.view(), - * raft::make_scalar_view(&inertia), - * raft::make_scalar_view(&n_iter)); - * ... - * auto labels = raft::make_device_vector(handle, X.extent(0)); - * - * kmeans::predict(handle, - * params, - * X, - * std::nullopt, - * centroids.view(), - * false, - * labels.view(), - * raft::make_scalar_view(&ineratia)); - * @endcode - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X New data to predict. - * [dim = n_samples x n_features] - * @param[in] sample_weight Optional weights for each observation in X. - * [len = n_samples] - * @param[in] centroids Cluster centroids. The data must be in - * row-major format. - * [dim = n_clusters x n_features] - * @param[in] normalize_weight True if the weights should be normalized - * @param[out] labels Index of the cluster each sample in X - * belongs to. - * [len = n_samples] - * @param[out] inertia Sum of squared distances of samples to - * their closest cluster center. - */ -template -void predict(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - bool normalize_weight, - raft::host_scalar_view inertia) -{ - detail::kmeans_predict( - handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia); -} - -/** - * @brief Compute k-means clustering and predicts cluster index for each sample - * in the input. - * - * @code{.cpp} - * #include - * #include - * #include - * using namespace cuvs::cluster; - * ... - * raft::raft::resources handle; - * cuvs::cluster::KMeansParams params; - * int n_features = 15, inertia, n_iter; - * auto centroids = raft::make_device_matrix(handle, params.n_clusters, n_features); - * auto labels = raft::make_device_vector(handle, X.extent(0)); - * - * kmeans::fit_predict(handle, - * params, - * X, - * std::nullopt, - * centroids.view(), - * labels.view(), - * raft::make_scalar_view(&inertia), - * raft::make_scalar_view(&n_iter)); - * @endcode - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must be - * in row-major format. - * [dim = n_samples x n_features] - * @param[in] sample_weight Optional weights for each observation in X. - * [len = n_samples] - * @param[inout] centroids Optional - * [in] When init is InitMethod::Array, use - * centroids as the initial cluster centers - * [out] The generated centroids from the - * kmeans algorithm are stored at the address - * pointed by 'centroids'. - * [dim = n_clusters x n_features] - * @param[out] labels Index of the cluster each sample in X belongs - * to. - * [len = n_samples] - * @param[out] inertia Sum of squared distances of samples to their - * closest cluster center. - * @param[out] n_iter Number of iterations run. - */ -template -void fit_predict(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - std::optional> centroids, - raft::device_vector_view labels, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter) -{ - detail::kmeans_fit_predict( - handle, params, X, sample_weight, centroids, labels, inertia, n_iter); -} - -/** - * @brief Transform X to a cluster-distance space. - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must - * be in row-major format - * [dim = n_samples x n_features] - * @param[in] centroids Cluster centroids. The data must be in row-major format. - * [dim = n_clusters x n_features] - * @param[out] X_new X transformed in the new space. - * [dim = n_samples x n_features] - */ -template -void transform(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_matrix_view X_new) -{ - detail::kmeans_transform(handle, params, X, centroids, X_new); -} - -template -void transform(raft::resources const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* centroids, - IndexT n_samples, - IndexT n_features, - DataT* X_new) -{ - detail::kmeans_transform( - handle, params, X, centroids, n_samples, n_features, X_new); -} - -/** - * Automatically find the optimal value of k using a binary search. - * This method maximizes the Calinski-Harabasz Index while minimizing the per-cluster inertia. - * - * @code{.cpp} - * #include - * #include - * #include - * - * #include - * - * using namespace cuvs::cluster; - * - * raft::handle_t handle; - * int n_samples = 100, n_features = 15, n_clusters = 10; - * auto X = raft::make_device_matrix(handle, n_samples, n_features); - * auto labels = raft::make_device_vector(handle, n_samples); - * - * raft::random::make_blobs(handle, X, labels, n_clusters); - * - * auto best_k = raft::make_host_scalar(0); - * auto n_iter = raft::make_host_scalar(0); - * auto inertia = raft::make_host_scalar(0); - * - * kmeans::find_k(handle, X, best_k.view(), inertia.view(), n_iter.view(), n_clusters+1); - * - * @endcode - * - * @tparam idx_t indexing type (should be integral) - * @tparam value_t value type (should be floating point) - * @param handle raft handle - * @param X input observations (shape n_samples, n_dims) - * @param best_k best k found from binary search - * @param inertia inertia of best k found - * @param n_iter number of iterations used to find best k - * @param kmax maximum k to try in search - * @param kmin minimum k to try in search (should be >= 1) - * @param maxiter maximum number of iterations to run - * @param tol tolerance for early stopping convergence - */ -template -void find_k(raft::resources const& handle, - raft::device_matrix_view X, - raft::host_scalar_view best_k, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter, - idx_t kmax, - idx_t kmin = 1, - idx_t maxiter = 100, - value_t tol = 1e-3) -{ - detail::find_k(handle, X, best_k, inertia, n_iter, kmax, kmin, maxiter, tol); -} - -/** - * @brief Select centroids according to a sampling operation - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[in] minClusterDistance Distance for every sample to it's nearest centroid - * [dim = n_samples] - * @param[in] isSampleCentroid Flag the sample chosen as initial centroid - * [dim = n_samples] - * @param[in] select_op The sampling operation used to select the centroids - * @param[out] inRankCp The sampled centroids - * [dim = n_selected_centroids x n_features] - * @param[in] workspace Temporary workspace buffer which can get resized - * - */ -template -void sample_centroids(raft::resources const& handle, - raft::device_matrix_view X, - raft::device_vector_view minClusterDistance, - raft::device_vector_view isSampleCentroid, - SamplingOp& select_op, - rmm::device_uvector& inRankCp, - rmm::device_uvector& workspace) -{ - detail::sampleCentroids( - handle, X, minClusterDistance, isSampleCentroid, select_op, inRankCp, workspace); -} - -/** - * @brief Compute cluster cost - * - * @tparam DataT the type of data used for weights, distances. - * @tparam ReductionOpT the type of data used for the reduction operation. - * - * @param[in] handle The raft handle - * @param[in] minClusterDistance Distance for every sample to it's nearest centroid - * [dim = n_samples] - * @param[in] workspace Temporary workspace buffer which can get resized - * @param[out] clusterCost Resulting cluster cost - * @param[in] reduction_op The reduction operation used for the cost - * - */ -template -void cluster_cost(raft::resources const& handle, - raft::device_vector_view minClusterDistance, - rmm::device_uvector& workspace, - raft::device_scalar_view clusterCost, - ReductionOpT reduction_op) -{ - detail::computeClusterCost( - handle, minClusterDistance, workspace, clusterCost, raft::identity_op{}, reduction_op); -} - -/** - * @brief Update centroids given current centroids and number of points assigned to each centroid. - * This function also produces a vector of RAFT key/value pairs containing the cluster assignment - * for each point and its distance. - * - * @tparam DataT - * @tparam IndexT - * @param[in] handle: Raft handle to use for managing library resources - * @param[in] X: input matrix (size n_samples, n_features) - * @param[in] sample_weights: number of samples currently assigned to each centroid (size n_samples) - * @param[in] centroids: matrix of current centroids (size n_clusters, n_features) - * @param[in] labels: Iterator of labels (can also be a raw pointer) - * @param[out] weight_per_cluster: sum of sample weights per cluster (size n_clusters) - * @param[out] new_centroids: output matrix of updated centroids (size n_clusters, n_features) - */ -template -void update_centroids(raft::resources const& handle, - raft::device_matrix_view X, - raft::device_vector_view sample_weights, - raft::device_matrix_view centroids, - LabelsIterator labels, - raft::device_vector_view weight_per_cluster, - raft::device_matrix_view new_centroids) -{ - // TODO: Passing these into the algorithm doesn't really present much of a benefit - // because they are being resized anyways. - // ref https://github.com/rapidsai/raft/issues/930 - rmm::device_uvector workspace(0, resource::get_cuda_stream(handle)); - - detail::update_centroids( - handle, X, sample_weights, centroids, labels, weight_per_cluster, new_centroids, workspace); -} - -/** - * @brief Compute distance for every sample to it's nearest centroid - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[in] centroids Centroids data - * [dim = n_cluster x n_features] - * @param[out] minClusterDistance Distance for every sample to it's nearest centroid - * [dim = n_samples] - * @param[in] L2NormX L2 norm of X : ||x||^2 - * [dim = n_samples] - * @param[out] L2NormBuf_OR_DistBuf Resizable buffer to store L2 norm of centroids or distance - * matrix - * @param[in] metric Distance metric to use - * @param[in] batch_samples batch size for input data samples - * @param[in] batch_centroids batch size for input centroids - * @param[in] workspace Temporary workspace buffer which can get resized - * - */ -template -void min_cluster_distance(raft::resources const& handle, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view minClusterDistance, - raft::device_vector_view L2NormX, - rmm::device_uvector& L2NormBuf_OR_DistBuf, - cuvs::distance::DistanceType metric, - int batch_samples, - int batch_centroids, - rmm::device_uvector& workspace) -{ - detail::minClusterDistanceCompute(handle, - X, - centroids, - minClusterDistance, - L2NormX, - L2NormBuf_OR_DistBuf, - metric, - batch_samples, - batch_centroids, - workspace); -} - -/** - * @brief Calculates a pair for every sample in input 'X' where key is an - * index of one of the 'centroids' (index of the nearest centroid) and 'value' - * is the distance between the sample and the 'centroid[key]' - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[in] centroids Centroids data - * [dim = n_cluster x n_features] - * @param[out] minClusterAndDistance Distance vector that contains for every sample, the nearest - * centroid and it's distance - * [dim = n_samples] - * @param[in] L2NormX L2 norm of X : ||x||^2 - * [dim = n_samples] - * @param[out] L2NormBuf_OR_DistBuf Resizable buffer to store L2 norm of centroids or distance - * matrix - * @param[in] metric distance metric - * @param[in] batch_samples batch size of data samples - * @param[in] batch_centroids batch size of centroids - * @param[in] workspace Temporary workspace buffer which can get resized - * - */ -template -void min_cluster_and_distance( - raft::resources const& handle, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view, IndexT> minClusterAndDistance, - raft::device_vector_view L2NormX, - rmm::device_uvector& L2NormBuf_OR_DistBuf, - cuvs::distance::DistanceType metric, - int batch_samples, - int batch_centroids, - rmm::device_uvector& workspace) -{ - detail::minClusterAndDistanceCompute(handle, - X, - centroids, - minClusterAndDistance, - L2NormX, - L2NormBuf_OR_DistBuf, - metric, - batch_samples, - batch_centroids, - workspace); -} - -/** - * @brief Shuffle and randomly select 'n_samples_to_gather' from input 'in' and stores - * in 'out' does not modify the input - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] in The data to shuffle and gather - * [dim = n_samples x n_features] - * @param[out] out The sampled data - * [dim = n_samples_to_gather x n_features] - * @param[in] n_samples_to_gather Number of sample to gather - * @param[in] seed Seed for the shuffle - * - */ -template -void shuffle_and_gather(raft::resources const& handle, - raft::device_matrix_view in, - raft::device_matrix_view out, - uint32_t n_samples_to_gather, - uint64_t seed) -{ - detail::shuffleAndGather(handle, in, out, n_samples_to_gather, seed); -} - -/** - * @brief Count the number of samples in each cluster - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] params The parameters for KMeans - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[in] L2NormX L2 norm of X : ||x||^2 - * [dim = n_samples] - * @param[in] centroids Centroids data - * [dim = n_cluster x n_features] - * @param[in] workspace Temporary workspace buffer which can get resized - * @param[out] sampleCountInCluster The count for each centroid - * [dim = n_cluster] - * - */ -template -void count_samples_in_cluster(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_vector_view L2NormX, - raft::device_matrix_view centroids, - rmm::device_uvector& workspace, - raft::device_vector_view sampleCountInCluster) -{ - detail::countSamplesInCluster( - handle, params, X, L2NormX, centroids, workspace, sampleCountInCluster); -} - -/** - * @brief Selects 'n_clusters' samples from the input X using kmeans++ algorithm. - * - * @see "k-means++: the advantages of careful seeding". 2007, Arthur, D. and Vassilvitskii, S. - * ACM-SIAM symposium on Discrete algorithms. - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] params The parameters for KMeans - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[out] centroids Centroids data - * [dim = n_cluster x n_features] - * @param[in] workspace Temporary workspace buffer which can get resized - */ -template -void init_plus_plus(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - rmm::device_uvector& workspace) -{ - detail::kmeansPlusPlus(handle, params, X, centroids, workspace); -} - -/* - * @brief Main function used to fit KMeans (after cluster initialization) - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must - * be in row-major format. - * [dim = n_samples x n_features] - * @param[in] sample_weight Weights for each observation in X. - * [len = n_samples] - * @param[inout] centroids [in] Initial cluster centers. - * [out] The generated centroids from the - * kmeans algorithm are stored at the address - * pointed by 'centroids'. - * [dim = n_clusters x n_features] - * @param[out] inertia Sum of squared distances of samples to their - * closest cluster center. - * @param[out] n_iter Number of iterations run. - * @param[in] workspace Temporary workspace buffer which can get resized - */ -template -void fit_main(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_vector_view sample_weights, - raft::device_matrix_view centroids, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter, - rmm::device_uvector& workspace) -{ - detail::kmeans_fit_main( - handle, params, X, sample_weights, centroids, inertia, n_iter, workspace); -} - -}; // namespace cuvs::cluster::kmeans - -namespace cuvs::cluster { - -/** - * Note: All of the functions below in cuvs::cluster are deprecated and will - * be removed in a future release. Please use cuvs::cluster::kmeans instead. - */ - -/** - * @brief Find clusters with k-means algorithm. - * Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must - * be in row-major format. - * [dim = n_samples x n_features] - * @param[in] sample_weight Optional weights for each observation in X. - * [len = n_samples] - * @param[inout] centroids [in] When init is InitMethod::Array, use - * centroids as the initial cluster centers. - * [out] The generated centroids from the - * kmeans algorithm are stored at the address - * pointed by 'centroids'. - * [dim = n_clusters x n_features] - * @param[out] inertia Sum of squared distances of samples to their - * closest cluster center. - * @param[out] n_iter Number of iterations run. - */ -template -void kmeans_fit(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - raft::device_matrix_view centroids, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter) -{ - kmeans::fit(handle, params, X, sample_weight, centroids, inertia, n_iter); -} - -template -void kmeans_fit(raft::resources const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* sample_weight, - DataT* centroids, - IndexT n_samples, - IndexT n_features, - DataT& inertia, - IndexT& n_iter) -{ - kmeans::fit( - handle, params, X, sample_weight, centroids, n_samples, n_features, inertia, n_iter); -} - -/** - * @brief Predict the closest cluster each sample in X belongs to. - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X New data to predict. - * [dim = n_samples x n_features] - * @param[in] sample_weight Optional weights for each observation in X. - * [len = n_samples] - * @param[in] centroids Cluster centroids. The data must be in - * row-major format. - * [dim = n_clusters x n_features] - * @param[in] normalize_weight True if the weights should be normalized - * @param[out] labels Index of the cluster each sample in X - * belongs to. - * [len = n_samples] - * @param[out] inertia Sum of squared distances of samples to - * their closest cluster center. - */ -template -void kmeans_predict(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - bool normalize_weight, - raft::host_scalar_view inertia) -{ - kmeans::predict( - handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia); -} - -template -void kmeans_predict(raft::resources const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* sample_weight, - const DataT* centroids, - IndexT n_samples, - IndexT n_features, - IndexT* labels, - bool normalize_weight, - DataT& inertia) -{ - kmeans::predict(handle, - params, - X, - sample_weight, - centroids, - n_samples, - n_features, - labels, - normalize_weight, - inertia); -} - -/** - * @brief Compute k-means clustering and predicts cluster index for each sample - * in the input. - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must be - * in row-major format. - * [dim = n_samples x n_features] - * @param[in] sample_weight Optional weights for each observation in X. - * [len = n_samples] - * @param[inout] centroids Optional - * [in] When init is InitMethod::Array, use - * centroids as the initial cluster centers - * [out] The generated centroids from the - * kmeans algorithm are stored at the address - * pointed by 'centroids'. - * [dim = n_clusters x n_features] - * @param[out] labels Index of the cluster each sample in X belongs - * to. - * [len = n_samples] - * @param[out] inertia Sum of squared distances of samples to their - * closest cluster center. - * @param[out] n_iter Number of iterations run. - */ -template -void kmeans_fit_predict(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - std::optional> centroids, - raft::device_vector_view labels, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter) -{ - kmeans::fit_predict( - handle, params, X, sample_weight, centroids, labels, inertia, n_iter); -} - -template -void kmeans_fit_predict(raft::resources const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* sample_weight, - DataT* centroids, - IndexT n_samples, - IndexT n_features, - IndexT* labels, - DataT& inertia, - IndexT& n_iter) -{ - kmeans::fit_predict( - handle, params, X, sample_weight, centroids, n_samples, n_features, labels, inertia, n_iter); -} - -/** - * @brief Transform X to a cluster-distance space. - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must - * be in row-major format - * [dim = n_samples x n_features] - * @param[in] centroids Cluster centroids. The data must be in row-major format. - * [dim = n_clusters x n_features] - * @param[out] X_new X transformed in the new space. - * [dim = n_samples x n_features] - */ -template -void kmeans_transform(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_matrix_view X_new) -{ - kmeans::transform(handle, params, X, centroids, X_new); -} - -template -void kmeans_transform(raft::resources const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* centroids, - IndexT n_samples, - IndexT n_features, - DataT* X_new) -{ - kmeans::transform(handle, params, X, centroids, n_samples, n_features, X_new); -} - -template -using SamplingOp = kmeans::SamplingOp; - -template -using KeyValueIndexOp = kmeans::KeyValueIndexOp; - -/** - * @brief Select centroids according to a sampling operation - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[in] minClusterDistance Distance for every sample to it's nearest centroid - * [dim = n_samples] - * @param[in] isSampleCentroid Flag the sample chosen as initial centroid - * [dim = n_samples] - * @param[in] select_op The sampling operation used to select the centroids - * @param[out] inRankCp The sampled centroids - * [dim = n_selected_centroids x n_features] - * @param[in] workspace Temporary workspace buffer which can get resized - * - */ -template -void sampleCentroids(raft::resources const& handle, - raft::device_matrix_view X, - raft::device_vector_view minClusterDistance, - raft::device_vector_view isSampleCentroid, - SamplingOp& select_op, - rmm::device_uvector& inRankCp, - rmm::device_uvector& workspace) -{ - kmeans::sample_centroids( - handle, X, minClusterDistance, isSampleCentroid, select_op, inRankCp, workspace); -} - -/** - * @brief Compute cluster cost - * - * @tparam DataT the type of data used for weights, distances. - * @tparam ReductionOpT the type of data used for the reduction operation. - * - * @param[in] handle The raft handle - * @param[in] minClusterDistance Distance for every sample to it's nearest centroid - * [dim = n_samples] - * @param[in] workspace Temporary workspace buffer which can get resized - * @param[out] clusterCost Resulting cluster cost - * @param[in] reduction_op The reduction operation used for the cost - * - */ -template -void computeClusterCost(raft::resources const& handle, - raft::device_vector_view minClusterDistance, - rmm::device_uvector& workspace, - raft::device_scalar_view clusterCost, - ReductionOpT reduction_op) -{ - kmeans::cluster_cost(handle, minClusterDistance, workspace, clusterCost, reduction_op); -} - -/** - * @brief Compute distance for every sample to it's nearest centroid - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] params The parameters for KMeans - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[in] centroids Centroids data - * [dim = n_cluster x n_features] - * @param[out] minClusterDistance Distance for every sample to it's nearest centroid - * [dim = n_samples] - * @param[in] L2NormX L2 norm of X : ||x||^2 - * [dim = n_samples] - * @param[out] L2NormBuf_OR_DistBuf Resizable buffer to store L2 norm of centroids or distance - * matrix - * @param[in] workspace Temporary workspace buffer which can get resized - * - */ -template -void minClusterDistanceCompute(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view minClusterDistance, - raft::device_vector_view L2NormX, - rmm::device_uvector& L2NormBuf_OR_DistBuf, - rmm::device_uvector& workspace) -{ - kmeans::min_cluster_distance(handle, - X, - centroids, - minClusterDistance, - L2NormX, - L2NormBuf_OR_DistBuf, - params.metric, - params.batch_samples, - params.batch_centroids, - workspace); -} - -/** - * @brief Calculates a pair for every sample in input 'X' where key is an - * index of one of the 'centroids' (index of the nearest centroid) and 'value' - * is the distance between the sample and the 'centroid[key]' - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] params The parameters for KMeans - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[in] centroids Centroids data - * [dim = n_cluster x n_features] - * @param[out] minClusterAndDistance Distance vector that contains for every sample, the nearest - * centroid and it's distance - * [dim = n_samples] - * @param[in] L2NormX L2 norm of X : ||x||^2 - * [dim = n_samples] - * @param[out] L2NormBuf_OR_DistBuf Resizable buffer to store L2 norm of centroids or distance - * matrix - * @param[in] workspace Temporary workspace buffer which can get resized - * - */ -template -void minClusterAndDistanceCompute( - raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view, IndexT> minClusterAndDistance, - raft::device_vector_view L2NormX, - rmm::device_uvector& L2NormBuf_OR_DistBuf, - rmm::device_uvector& workspace) -{ - kmeans::min_cluster_and_distance(handle, - X, - centroids, - minClusterAndDistance, - L2NormX, - L2NormBuf_OR_DistBuf, - params.metric, - params.batch_samples, - params.batch_centroids, - workspace); -} - -/** - * @brief Shuffle and randomly select 'n_samples_to_gather' from input 'in' and stores - * in 'out' does not modify the input - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] in The data to shuffle and gather - * [dim = n_samples x n_features] - * @param[out] out The sampled data - * [dim = n_samples_to_gather x n_features] - * @param[in] n_samples_to_gather Number of sample to gather - * @param[in] seed Seed for the shuffle - * - */ -template -void shuffleAndGather(raft::resources const& handle, - raft::device_matrix_view in, - raft::device_matrix_view out, - uint32_t n_samples_to_gather, - uint64_t seed) -{ - kmeans::shuffle_and_gather(handle, in, out, n_samples_to_gather, seed); -} - -/** - * @brief Count the number of samples in each cluster - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] params The parameters for KMeans - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[in] L2NormX L2 norm of X : ||x||^2 - * [dim = n_samples] - * @param[in] centroids Centroids data - * [dim = n_cluster x n_features] - * @param[in] workspace Temporary workspace buffer which can get resized - * @param[out] sampleCountInCluster The count for each centroid - * [dim = n_cluster] - * - */ -template -void countSamplesInCluster(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_vector_view L2NormX, - raft::device_matrix_view centroids, - rmm::device_uvector& workspace, - raft::device_vector_view sampleCountInCluster) -{ - kmeans::count_samples_in_cluster( - handle, params, X, L2NormX, centroids, workspace, sampleCountInCluster); -} - -/* - * @brief Selects 'n_clusters' samples from the input X using kmeans++ algorithm. - - * @note This is the algorithm described in - * "k-means++: the advantages of careful seeding". 2007, Arthur, D. and Vassilvitskii, S. - * ACM-SIAM symposium on Discrete algorithms. - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] params The parameters for KMeans - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[out] centroids Centroids data - * [dim = n_cluster x n_features] - * @param[in] workspace Temporary workspace buffer which can get resized - */ -template -void kmeansPlusPlus(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroidsRawData, - rmm::device_uvector& workspace) -{ - kmeans::init_plus_plus(handle, params, X, centroidsRawData, workspace); -} - -/* - * @brief Main function used to fit KMeans (after cluster initialization) - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must - * be in row-major format. - * [dim = n_samples x n_features] - * @param[in] sample_weight Weights for each observation in X. - * [len = n_samples] - * @param[inout] centroids [in] Initial cluster centers. - * [out] The generated centroids from the - * kmeans algorithm are stored at the address - * pointed by 'centroids'. - * [dim = n_clusters x n_features] - * @param[out] inertia Sum of squared distances of samples to their - * closest cluster center. - * @param[out] n_iter Number of iterations run. - * @param[in] workspace Temporary workspace buffer which can get resized - */ -template -void kmeans_fit_main(raft::resources const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_vector_view weight, - raft::device_matrix_view centroidsRawData, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter, - rmm::device_uvector& workspace) -{ - kmeans::fit_main( - handle, params, X, weight, centroidsRawData, inertia, n_iter, workspace); -} -}; // namespace cuvs::cluster diff --git a/cpp/include/cuvs/cluster/kmeans_balanced.cuh b/cpp/include/cuvs/cluster/kmeans_balanced.cuh deleted file mode 100644 index 7735587e7..000000000 --- a/cpp/include/cuvs/cluster/kmeans_balanced.cuh +++ /dev/null @@ -1,366 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include -#include -#include - -namespace cuvs::cluster::kmeans_balanced { - -/** - * @brief Find clusters of balanced sizes with a hierarchical k-means algorithm. - * - * This variant of the k-means algorithm first clusters the dataset in mesoclusters, then clusters - * the subsets associated to each mesocluster into fine clusters, and finally runs a few k-means - * iterations over the whole dataset and with all the centroids to obtain the final clusters. - * - * Each k-means iteration applies expectation-maximization-balancing: - * - Balancing: adjust centers for clusters that have a small number of entries. If the size of a - * cluster is below a threshold, the center is moved towards a bigger cluster. - * - Expectation: predict the labels (i.e find closest cluster centroid to each point) - * - Maximization: calculate optimal centroids (i.e find the center of gravity of each cluster) - * - * The number of mesoclusters is chosen by rounding the square root of the number of clusters. E.g - * for 512 clusters, we would have 23 mesoclusters. The number of fine clusters per mesocluster is - * chosen proportionally to the number of points in each mesocluster. - * - * This variant of k-means uses random initialization and a fixed number of iterations, though - * iterations can be repeated if the balancing step moved the centroids. - * - * Additionally, this algorithm supports quantized datasets in arbitrary types but the core part of - * the algorithm will work with a floating-point type, hence a conversion function can be provided - * to map the data type to the math type. - * - * @code{.cpp} - * #include - * #include - * #include - * ... - * raft::handle_t handle; - * cuvs::cluster::kmeans_balanced_params params; - * auto centroids = raft::make_device_matrix(handle, n_clusters, n_features); - * cuvs::cluster::kmeans_balanced::fit(handle, params, X, centroids.view()); - * @endcode - * - * @tparam DataT Type of the input data. - * @tparam MathT Type of the centroids and mapped data. - * @tparam IndexT Type used for indexing. - * @tparam MappingOpT Type of the mapping function. - * @param[in] handle The raft resources - * @param[in] params Structure containing the hyper-parameters - * @param[in] X Training instances to cluster. The data must be in row-major format. - * [dim = n_samples x n_features] - * @param[out] centroids The generated centroids [dim = n_clusters x n_features] - * @param[in] mapping_op (optional) Functor to convert from the input datatype to the arithmetic - * datatype. If DataT == MathT, this must be the identity. - */ -template -void fit(const raft::resources& handle, - kmeans_balanced_params const& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - MappingOpT mapping_op = raft::identity_op()) -{ - RAFT_EXPECTS(X.extent(1) == centroids.extent(1), - "Number of features in dataset and centroids are different"); - RAFT_EXPECTS(static_cast(X.extent(0)) * static_cast(X.extent(1)) <= - static_cast(std::numeric_limits::max()), - "The chosen index type cannot represent all indices for the given dataset"); - RAFT_EXPECTS(centroids.extent(0) > IndexT{0} && centroids.extent(0) <= X.extent(0), - "The number of centroids must be strictly positive and cannot exceed the number of " - "points in the training dataset."); - - detail::build_hierarchical(handle, - params, - X.extent(1), - X.data_handle(), - X.extent(0), - centroids.data_handle(), - centroids.extent(0), - mapping_op); -} - -/** - * @brief Predict the closest cluster each sample in X belongs to. - * - * @code{.cpp} - * #include - * #include - * #include - * ... - * raft::handle_t handle; - * cuvs::cluster::kmeans_balanced_params params; - * auto labels = raft::make_device_vector(handle, n_rows); - * cuvs::cluster::kmeans_balanced::predict(handle, params, X, centroids, labels); - * @endcode - * - * @tparam DataT Type of the input data. - * @tparam MathT Type of the centroids and mapped data. - * @tparam IndexT Type used for indexing. - * @tparam LabelT Type of the output labels. - * @tparam MappingOpT Type of the mapping function. - * @param[in] handle The raft resources - * @param[in] params Structure containing the hyper-parameters - * @param[in] X Dataset for which to infer the closest clusters. - * [dim = n_samples x n_features] - * @param[in] centroids The input centroids [dim = n_clusters x n_features] - * @param[out] labels The output labels [dim = n_samples] - * @param[in] mapping_op (optional) Functor to convert from the input datatype to the arithmetic - * datatype. If DataT == MathT, this must be the identity. - */ -template -void predict(const raft::resources& handle, - kmeans_balanced_params const& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - MappingOpT mapping_op = raft::identity_op()) -{ - RAFT_EXPECTS(X.extent(0) == labels.extent(0), - "Number of rows in dataset and labels are different"); - RAFT_EXPECTS(X.extent(1) == centroids.extent(1), - "Number of features in dataset and centroids are different"); - RAFT_EXPECTS(static_cast(X.extent(0)) * static_cast(X.extent(1)) <= - static_cast(std::numeric_limits::max()), - "The chosen index type cannot represent all indices for the given dataset"); - RAFT_EXPECTS(static_cast(centroids.extent(0)) <= - static_cast(std::numeric_limits::max()), - "The chosen label type cannot represent all cluster labels"); - - detail::predict(handle, - params, - centroids.data_handle(), - centroids.extent(0), - X.extent(1), - X.data_handle(), - X.extent(0), - labels.data_handle(), - mapping_op); -} - -/** - * @brief Compute hierarchical balanced k-means clustering and predict cluster index for each sample - * in the input. - * - * @code{.cpp} - * #include - * #include - * #include - * ... - * raft::handle_t handle; - * cuvs::cluster::kmeans_balanced_params params; - * auto centroids = raft::make_device_matrix(handle, n_clusters, n_features); - * auto labels = raft::make_device_vector(handle, n_rows); - * cuvs::cluster::kmeans_balanced::fit_predict( - * handle, params, X, centroids.view(), labels.view()); - * @endcode - * - * @tparam DataT Type of the input data. - * @tparam MathT Type of the centroids and mapped data. - * @tparam IndexT Type used for indexing. - * @tparam LabelT Type of the output labels. - * @tparam MappingOpT Type of the mapping function. - * @param[in] handle The raft resources - * @param[in] params Structure containing the hyper-parameters - * @param[in] X Training instances to cluster. The data must be in row-major format. - * [dim = n_samples x n_features] - * @param[out] centroids The output centroids [dim = n_clusters x n_features] - * @param[out] labels The output labels [dim = n_samples] - * @param[in] mapping_op (optional) Functor to convert from the input datatype to the arithmetic - * datatype. If DataT and MathT are the same, this must be the identity. - */ -template -void fit_predict(const raft::resources& handle, - kmeans_balanced_params const& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - MappingOpT mapping_op = raft::identity_op()) -{ - auto centroids_const = raft::make_device_matrix_view( - centroids.data_handle(), centroids.extent(0), centroids.extent(1)); - cuvs::cluster::kmeans_balanced::fit(handle, params, X, centroids, mapping_op); - cuvs::cluster::kmeans_balanced::predict(handle, params, X, centroids_const, labels, mapping_op); -} - -namespace helpers { - -/** - * @brief Randomly initialize centers and apply expectation-maximization-balancing iterations - * - * This is essentially the non-hierarchical balanced k-means algorithm which is used by the - * hierarchical algorithm once to build the mesoclusters and once per mesocluster to build the fine - * clusters. - * - * @code{.cpp} - * #include - * #include - * #include - * ... - * raft::handle_t handle; - * cuvs::cluster::kmeans_balanced_params params; - * auto centroids = raft::make_device_matrix(handle, n_clusters, n_features); - * auto labels = raft::make_device_vector(handle, n_samples); - * auto sizes = raft::make_device_vector(handle, n_clusters); - * cuvs::cluster::kmeans_balanced::build_clusters( - * handle, params, X, centroids.view(), labels.view(), sizes.view()); - * @endcode - * - * @tparam DataT Type of the input data. - * @tparam MathT Type of the centroids and mapped data. - * @tparam IndexT Type used for indexing. - * @tparam LabelT Type of the output labels. - * @tparam CounterT Counter type supported by CUDA's native atomicAdd. - * @tparam MappingOpT Type of the mapping function. - * @param[in] handle The raft resources - * @param[in] params Structure containing the hyper-parameters - * @param[in] X Training instances to cluster. The data must be in row-major format. - * [dim = n_samples x n_features] - * @param[out] centroids The output centroids [dim = n_clusters x n_features] - * @param[out] labels The output labels [dim = n_samples] - * @param[out] cluster_sizes Size of each cluster [dim = n_clusters] - * @param[in] mapping_op (optional) Functor to convert from the input datatype to the - * arithmetic datatype. If DataT == MathT, this must be the identity. - * @param[in] X_norm (optional) Dataset's row norms [dim = n_samples] - */ -template -void build_clusters(const raft::resources& handle, - const kmeans_balanced_params& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - raft::device_vector_view cluster_sizes, - MappingOpT mapping_op = raft::identity_op(), - std::optional> X_norm = std::nullopt) -{ - RAFT_EXPECTS(X.extent(0) == labels.extent(0), - "Number of rows in dataset and labels are different"); - RAFT_EXPECTS(X.extent(1) == centroids.extent(1), - "Number of features in dataset and centroids are different"); - RAFT_EXPECTS(centroids.extent(0) == cluster_sizes.extent(0), - "Number of rows in centroids and clusyer_sizes are different"); - - detail::build_clusters(handle, - params, - X.extent(1), - X.data_handle(), - X.extent(0), - centroids.extent(0), - centroids.data_handle(), - labels.data_handle(), - cluster_sizes.data_handle(), - mapping_op, - resource::get_workspace_resource(handle), - X_norm.has_value() ? X_norm.value().data_handle() : nullptr); -} - -/** - * @brief Given the data and labels, calculate cluster centers and sizes in one sweep. - * - * Let `S_i = {x_k | x_k \in X & labels[k] == i}` be the vectors in the dataset with label i. - * - * On exit, - * `centers_i = (\sum_{x \in S_i} x + w_i * center_i) / (|S_i| + w_i)`, - * where `w_i = reset_counters ? 0 : cluster_size[i]`. - * - * In other words, the updated cluster centers are a weighted average of the existing cluster - * center, and the coordinates of the points labeled with i. _This allows calling this function - * multiple times with different datasets with the same effect as if calling this function once - * on the combined dataset_. - * - * @code{.cpp} - * #include - * #include - * ... - * raft::handle_t handle; - * auto centroids = raft::make_device_matrix(handle, n_clusters, n_features); - * auto sizes = raft::make_device_vector(handle, n_clusters); - * cuvs::cluster::kmeans_balanced::calc_centers_and_sizes( - * handle, X, labels, centroids.view(), sizes.view(), true); - * @endcode - * - * @tparam DataT Type of the input data. - * @tparam MathT Type of the centroids and mapped data. - * @tparam IndexT Type used for indexing. - * @tparam LabelT Type of the output labels. - * @tparam CounterT Counter type supported by CUDA's native atomicAdd. - * @tparam MappingOpT Type of the mapping function. - * @param[in] handle The raft resources - * @param[in] X Dataset for which to calculate cluster centers. The data must be in - * row-major format. [dim = n_samples x n_features] - * @param[in] labels The input labels [dim = n_samples] - * @param[out] centroids The output centroids [dim = n_clusters x n_features] - * @param[out] cluster_sizes Size of each cluster [dim = n_clusters] - * @param[in] reset_counters Whether to clear the output arrays before calculating. - * When set to `false`, this function may be used to update existing - * centers and sizes using the weighted average principle. - * @param[in] mapping_op (optional) Functor to convert from the input datatype to the - * arithmetic datatype. If DataT == MathT, this must be the identity. - */ -template -void calc_centers_and_sizes(const raft::resources& handle, - raft::device_matrix_view X, - raft::device_vector_view labels, - raft::device_matrix_view centroids, - raft::device_vector_view cluster_sizes, - bool reset_counters = true, - MappingOpT mapping_op = raft::identity_op()) -{ - RAFT_EXPECTS(X.extent(0) == labels.extent(0), - "Number of rows in dataset and labels are different"); - RAFT_EXPECTS(X.extent(1) == centroids.extent(1), - "Number of features in dataset and centroids are different"); - RAFT_EXPECTS(centroids.extent(0) == cluster_sizes.extent(0), - "Number of rows in centroids and clusyer_sizes are different"); - - detail::calc_centers_and_sizes(handle, - centroids.data_handle(), - cluster_sizes.data_handle(), - centroids.extent(0), - X.extent(1), - X.data_handle(), - X.extent(0), - labels.data_handle(), - reset_counters, - mapping_op); -} - -} // namespace helpers - -} // namespace cuvs::cluster::kmeans_balanced diff --git a/cpp/include/cuvs/cluster/kmeans_balanced_types.hpp b/cpp/include/cuvs/cluster/kmeans_balanced_types.hpp deleted file mode 100644 index 5a4793fbe..000000000 --- a/cpp/include/cuvs/cluster/kmeans_balanced_types.hpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace cuvs::cluster::kmeans_balanced { - -/** - * Simple object to specify hyper-parameters to the balanced k-means algorithm. - * - * The following metrics are currently supported in k-means balanced: - * - InnerProduct - * - L2Expanded - * - L2SqrtExpanded - */ -struct kmeans_balanced_params : kmeans_base_params { - /** - * Number of training iterations - */ - uint32_t n_iters = 20; -}; - -} // namespace cuvs::cluster::kmeans_balanced - -namespace cuvs::cluster { - -using kmeans_balanced::kmeans_balanced_params; - -} // namespace cuvs::cluster diff --git a/cpp/include/cuvs/cluster/kmeans_deprecated.cuh b/cpp/include/cuvs/cluster/kmeans_deprecated.cuh deleted file mode 100644 index c31f7e686..000000000 --- a/cpp/include/cuvs/cluster/kmeans_deprecated.cuh +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -namespace cuvs::cluster { - -/** - * @brief Find clusters with k-means algorithm. - * Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * @tparam index_type_t the type of data used for indexing. - * @tparam value_type_t the type of data used for weights, distances. - * @param handle the raft handle. - * @param n Number of observation vectors. - * @param d Dimension of observation vectors. - * @param k Number of clusters. - * @param tol Tolerance for convergence. k-means stops when the - * change in residual divided by n is less than tol. - * @param maxiter Maximum number of k-means iterations. - * @param obs (Input, device memory, d*n entries) Observation - * matrix. Matrix is stored column-major and each column is an - * observation vector. Matrix dimensions are d x n. - * @param codes (Output, device memory, n entries) Cluster - * assignments. - * @param residual On exit, residual sum of squares (sum of squares - * of distances between observation vectors and centroids). - * @param iters on exit, number of k-means iterations. - * @param seed random seed to be used. - * @return error flag - */ -template -int kmeans(raft::resources const& handle, - index_type_t n, - index_type_t d, - index_type_t k, - value_type_t tol, - index_type_t maxiter, - const value_type_t* __restrict__ obs, - index_type_t* __restrict__ codes, - value_type_t& residual, - index_type_t& iters, - unsigned long long seed = 123456) -{ - return detail::kmeans( - handle, n, d, k, tol, maxiter, obs, codes, residual, iters, seed); -} -} // namespace cuvs::cluster diff --git a/cpp/include/cuvs/cluster/kmeans_types.hpp b/cpp/include/cuvs/cluster/kmeans_types.hpp deleted file mode 100644 index c9090166d..000000000 --- a/cpp/include/cuvs/cluster/kmeans_types.hpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include -#include -#include - -namespace cuvs::cluster { - -/** Base structure for parameters that are common to all k-means algorithms */ -struct kmeans_base_params { - /** - * Metric to use for distance computation. The supported metrics can vary per algorithm. - */ - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded; -}; - -} // namespace cuvs::cluster - -namespace cuvs::cluster::kmeans { - -/** - * Simple object to specify hyper-parameters to the kmeans algorithm. - */ -struct KMeansParams : kmeans_base_params { - enum InitMethod { - - /** - * Sample the centroids using the kmeans++ strategy - */ - KMeansPlusPlus, - - /** - * Sample the centroids uniformly at random - */ - Random, - - /** - * User provides the array of initial centroids - */ - Array - }; - - /** - * The number of clusters to form as well as the number of centroids to generate (default:8). - */ - int n_clusters = 8; - - /** - * Method for initialization, defaults to k-means++: - * - InitMethod::KMeansPlusPlus (k-means++): Use scalable k-means++ algorithm - * to select the initial cluster centers. - * - InitMethod::Random (random): Choose 'n_clusters' observations (rows) at - * random from the input data for the initial centroids. - * - InitMethod::Array (ndarray): Use 'centroids' as initial cluster centers. - */ - InitMethod init = KMeansPlusPlus; - - /** - * Maximum number of iterations of the k-means algorithm for a single run. - */ - int max_iter = 300; - - /** - * Relative tolerance with regards to inertia to declare convergence. - */ - double tol = 1e-4; - - /** - * verbosity level. - */ - int verbosity = RAFT_LEVEL_INFO; - - /** - * Seed to the random number generator. - */ - raft::random::RngState rng_state{0}; - - /** - * Number of instance k-means algorithm will be run with different seeds. - */ - int n_init = 1; - - /** - * Oversampling factor for use in the k-means|| algorithm - */ - double oversampling_factor = 2.0; - - // batch_samples and batch_centroids are used to tile 1NN computation which is - // useful to optimize/control the memory footprint - // Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0 - // then don't tile the centroids - int batch_samples = 1 << 15; - - /** - * if 0 then batch_centroids = n_clusters - */ - int batch_centroids = 0; // - - bool inertia_check = false; -}; - -} // namespace cuvs::cluster::kmeans - -namespace cuvs::cluster { - -using kmeans::KMeansParams; - -} // namespace cuvs::cluster diff --git a/cpp/include/cuvs/cluster/single_linkage.cuh b/cpp/include/cuvs/cluster/single_linkage.cuh deleted file mode 100644 index 88c964678..000000000 --- a/cpp/include/cuvs/cluster/single_linkage.cuh +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -namespace cuvs::cluster { - -/** - * Note: All of the functions below in the cuvs::cluster namespace are deprecated - * and will be removed in a future release. Please use cuvs::cluster::hierarchy - * instead. - */ - -/** - * Single-linkage clustering, capable of constructing a KNN graph to - * scale the algorithm beyond the n^2 memory consumption of implementations - * that use the fully-connected graph of pairwise distances by connecting - * a knn graph when k is not large enough to connect it. - - * @tparam value_idx - * @tparam value_t - * @tparam dist_type method to use for constructing connectivities graph - * @param[in] handle raft handle - * @param[in] X dense input matrix in row-major layout - * @param[in] m number of rows in X - * @param[in] n number of columns in X - * @param[in] metric distance metrix to use when constructing connectivities graph - * @param[out] out struct containing output dendrogram and cluster assignments - * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect - control - * of k. The algorithm will set `k = log(n) + c` - * @param[in] n_clusters number of clusters to assign data samples - */ -template -void single_linkage(raft::resources const& handle, - const value_t* X, - size_t m, - size_t n, - cuvs::distance::DistanceType metric, - linkage_output* out, - int c, - size_t n_clusters) -{ - detail::single_linkage( - handle, X, m, n, metric, out, c, n_clusters); -} -}; // namespace cuvs::cluster - -namespace cuvs::cluster::hierarchy { - -constexpr int DEFAULT_CONST_C = 15; - -/** - * Single-linkage clustering, capable of constructing a KNN graph to - * scale the algorithm beyond the n^2 memory consumption of implementations - * that use the fully-connected graph of pairwise distances by connecting - * a knn graph when k is not large enough to connect it. - - * @tparam value_idx - * @tparam value_t - * @tparam dist_type method to use for constructing connectivities graph - * @param[in] handle raft handle - * @param[in] X dense input matrix in row-major layout - * @param[out] dendrogram output dendrogram (size [n_rows - 1] * 2) - * @param[out] labels output labels vector (size n_rows) - * @param[in] metric distance metrix to use when constructing connectivities graph - * @param[in] n_clusters number of clusters to assign data samples - * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect - control of k. The algorithm will set `k = log(n) + c` - */ -template -void single_linkage(raft::resources const& handle, - raft::device_matrix_view X, - raft::device_matrix_view dendrogram, - raft::device_vector_view labels, - cuvs::distance::DistanceType metric, - size_t n_clusters, - std::optional c = std::make_optional(DEFAULT_CONST_C)) -{ - linkage_output out_arrs; - out_arrs.children = dendrogram.data_handle(); - out_arrs.labels = labels.data_handle(); - - cuvs::cluster::single_linkage( - handle, - X.data_handle(), - static_cast(X.extent(0)), - static_cast(X.extent(1)), - metric, - &out_arrs, - c.has_value() ? c.value() : DEFAULT_CONST_C, - n_clusters); -} -}; // namespace cuvs::cluster::hierarchy diff --git a/cpp/include/cuvs/cluster/single_linkage_types.hpp b/cpp/include/cuvs/cluster/single_linkage_types.hpp deleted file mode 100644 index 8da65a01f..000000000 --- a/cpp/include/cuvs/cluster/single_linkage_types.hpp +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace cuvs::cluster::hierarchy { - -/** - * Determines the method for computing the minimum spanning tree (MST) - */ -enum LinkageDistance { - - /** - * Use a pairwise distance matrix as input to the mst. This - * is very fast and the best option for fairly small datasets (~50k data points) - */ - PAIRWISE = 0, - - /** - * Construct a KNN graph as input to the mst and provide additional - * edges if the mst does not converge. This is slower but scales - * to very large datasets. - */ - KNN_GRAPH = 1 -}; - -}; // namespace cuvs::cluster::hierarchy - -// The code below is now considered legacy -namespace cuvs::cluster { - -using hierarchy::LinkageDistance; - -/** - * Simple container object for consolidating linkage results. This closely - * mirrors the trained instance variables populated in - * Scikit-learn's AgglomerativeClustering estimator. - * @tparam value_idx - * @tparam value_t - */ -template -class linkage_output { - public: - idx_t m; - idx_t n_clusters; - - idx_t n_leaves; - idx_t n_connected_components; - - // TODO: These will be made private in a future release - idx_t* labels; // size: m - idx_t* children; // size: (m-1, 2) - - raft::device_vector_view get_labels() - { - return raft::make_device_vector_view(labels, m); - } - - raft::device_matrix_view get_children() - { - return raft::make_device_matrix_view(children, m - 1, 2); - } -}; - -class linkage_output_int : public linkage_output {}; -class linkage_output_int64 : public linkage_output {}; - -}; // namespace cuvs::cluster diff --git a/cpp/include/cuvs/distance/detail/compress_to_bits.cuh b/cpp/include/cuvs/distance/detail/compress_to_bits.cuh deleted file mode 100644 index 9ce47774a..000000000 --- a/cpp/include/cuvs/distance/detail/compress_to_bits.cuh +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -namespace cuvs::distance::detail { - -/** - * @brief Compress 2D boolean matrix to bitfield - * - * Utility kernel for masked_l2_nn. - * - * @tparam T - * - * @parameter[in] in An `m x n` boolean matrix. Row major. - * @parameter[out] out An `(m / bits_per_elem) x n` matrix with elements of - * type T, where T is of size `bits_per_elem` bits. - * Note: the division (`/`) is a ceilDiv. - */ -template ::value>> -RAFT_KERNEL compress_to_bits_kernel( - raft::device_matrix_view in, - raft::device_matrix_view out) -{ - constexpr int bits_per_element = 8 * sizeof(T); - constexpr int tile_dim_m = bits_per_element; - constexpr int nthreads = 128; - constexpr int tile_dim_n = nthreads; // read 128 bools at once = 1 sector - - // Tile in shared memory is transposed - __shared__ bool smem[tile_dim_n][tile_dim_m]; - - const int num_tiles_per_m = raft::ceildiv(in.extent(0), tile_dim_m); - const int num_tiles_per_n = raft::ceildiv(in.extent(1), tile_dim_n); - - for (int lin_tile_idx = blockIdx.x; true; lin_tile_idx += gridDim.x) { - const int tile_idx_n = tile_dim_n * (lin_tile_idx % num_tiles_per_n); - const int tile_idx_m = tile_dim_m * (lin_tile_idx / num_tiles_per_n); - - if (in.extent(0) <= tile_idx_m) { break; } - // Fill shared memory tile - bool reg_buf[tile_dim_m]; -#pragma unroll - for (int i = 0; i < tile_dim_m; ++i) { - const int in_m = tile_idx_m + i; - const int in_n = tile_idx_n + threadIdx.x; - bool in_bounds = in_m < in.extent(0) && in_n < in.extent(1); - reg_buf[i] = in_bounds ? in(in_m, in_n) : false; - smem[threadIdx.x][i] = reg_buf[i]; - } - __syncthreads(); - - // Drain memory tile into single output element out_elem. - T out_elem{0}; -#pragma unroll - for (int j = 0; j < tile_dim_n; ++j) { - if (smem[threadIdx.x][j]) { out_elem |= T(1) << j; } - } - __syncthreads(); - - // Write output. - int out_m = tile_idx_m / bits_per_element; - int out_n = tile_idx_n + threadIdx.x; - - if (out_m < out.extent(0) && out_n < out.extent(1)) { out(out_m, out_n) = out_elem; } - } -} - -/** - * @brief Compress 2D boolean matrix to bitfield - * - * Utility kernel for masked_l2_nn. - * - * @tparam T - * - * @parameter[in] in An `m x n` boolean matrix. Row major. - * @parameter[out] out An `(m / bits_per_elem) x n` matrix with elements of - * type T, where T is of size `bits_per_elem` bits. - * Note: the division (`/`) is a ceilDiv. - */ -template ::value>> -void compress_to_bits(raft::resources const& handle, - raft::device_matrix_view in, - raft::device_matrix_view out) -{ - auto stream = resource::get_cuda_stream(handle); - constexpr int bits_per_element = 8 * sizeof(T); - - RAFT_EXPECTS(raft::ceildiv(in.extent(0), bits_per_element) == out.extent(0), - "Number of output rows must be ceildiv(input rows, bits_per_elem)"); - RAFT_EXPECTS(in.extent(1) == out.extent(1), "Number of output columns must equal input columns."); - - const int num_SMs = raft::getMultiProcessorCount(); - int blocks_per_sm = 0; - constexpr int num_threads = 128; - constexpr int dyn_smem_size = 0; - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &blocks_per_sm, compress_to_bits_kernel, num_threads, dyn_smem_size)); - - dim3 grid(num_SMs * blocks_per_sm); - dim3 block(128); - compress_to_bits_kernel<<>>(in, out); - RAFT_CUDA_TRY(cudaGetLastError()); -} - -}; // namespace cuvs::distance::detail diff --git a/cpp/include/cuvs/distance/detail/distance.cuh b/cpp/include/cuvs/distance/detail/distance.cuh deleted file mode 100644 index ea935bdcb..000000000 --- a/cpp/include/cuvs/distance/detail/distance.cuh +++ /dev/null @@ -1,814 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cuvs { -namespace distance { -namespace detail { - -/** - * @brief: A tag type for overload resolution based on DistanceType - * - * It is not possible to partially specialize function templates on a single - * parameter. Instead, it is often easier to use a combination of conventional - * method overloading and a parameter with a specific tag type. The following - * type is used to help method overloading based on the DistanceType enum. - */ -template -using distance_tag = std::integral_constant; - -/** - * @brief Implement pairwise_matrix for specific distance - * - * There are multiple overloads for this function, one for each distance type. - * They are implemented below. The documentation of this function serves as - * documentation for all functions. The following overloads are defined: - * - * - DistanceType::Canberra: - * - DistanceType::CorrelationExpanded: - * - DistanceType::CosineExpanded: - * - DistanceType::HammingUnexpanded: - * - DistanceType::HellingerExpanded: - * - DistanceType::JensenShannon: - * - DistanceType::KLDivergence: - * - DistanceType::L1: - * - DistanceType::L2Expanded: - * - DistanceType::L2SqrtExpanded: - * - DistanceType::L2Unexpanded: - * - DistanceType::L2SqrtUnexpanded: - * - DistanceType::Linf: - * - DistanceType::LpUnexpanded: - * - DistanceType::RusselRaoExpanded: - * - * @tparam DataT Input data type - * @tparam AccT Accumulation data type - * @tparam OutT Output data type - * @tparam FinOpT Type of final operation - * @tparam IdxT Index type - * - * @param handle RAFT resources handle - * @param distance_type A tag type to indicate which distance is calculated. - * @param x First set of points - * @param y Second set of points - * @param out Output distance matrix - * @param m Number of points in x - * @param n Number of points in y - * @param k Dimensionality of points in x, y - * @param workspace Temporary workspace needed for computations - * @param worksize Number of bytes of the workspace - * @param is_row_major Whether the matrices are row-major or col-major - * @param metric_arg The `p` argument for Lp. - */ -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT* workspace, // unused - size_t worksize, // unused - FinOpT fin_op, - bool is_row_major, - DataT metric_arg) // unused -{ - ops::canberra_distance_op distance_op{}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT* workspace, - size_t worksize, - FinOpT fin_op, - bool is_row_major, - DataT) // unused -{ - ASSERT(!(worksize < 2 * (m + n) * sizeof(AccT)), "workspace size error"); - ASSERT(workspace != nullptr, "workspace is null"); - - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - AccT* x_norm = workspace; - AccT* y_norm = workspace; - AccT* sq_x_norm = workspace; - AccT* sq_y_norm = workspace; - // TODO: Column major case looks to have lower accuracy for X == Y, - // perhaps the use of stridedSummationKernel could be causing this, - // need to investigate and fix. - if (x == y && is_row_major) { - raft::linalg::reduce(x_norm, - x, - k, - std::max(m, n), - (AccT)0, - is_row_major, - true, - stream, - false, - raft::identity_op(), - raft::add_op()); - sq_x_norm += std::max(m, n); - sq_y_norm = sq_x_norm; - raft::linalg::rowNorm( - sq_x_norm, x, k, std::max(m, n), raft::linalg::L2Norm, is_row_major, stream); - } else { - y_norm += m; - raft::linalg::reduce(x_norm, - x, - k, - m, - (AccT)0, - is_row_major, - true, - stream, - false, - raft::identity_op(), - raft::add_op()); - raft::linalg::reduce(y_norm, - y, - k, - n, - (AccT)0, - is_row_major, - true, - stream, - false, - raft::identity_op(), - raft::add_op()); - - sq_x_norm += (m + n); - sq_y_norm = sq_x_norm + m; - raft::linalg::rowNorm(sq_x_norm, x, k, m, raft::linalg::L2Norm, is_row_major, stream); - raft::linalg::rowNorm(sq_y_norm, y, k, n, raft::linalg::L2Norm, is_row_major, stream); - } - - using OpT = ops::correlation_distance_op; - OpT corr_op(is_row_major, sq_x_norm, sq_y_norm, m, n, k); - pairwise_matrix_dispatch( - corr_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT* workspace, - size_t worksize, - FinOpT fin_op, - bool is_row_major, - DataT) // unused -{ - // raft distance support inputs as float/double and output as uint8_t/float/double. - static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), - "OutT can be uint8_t, float, double," - "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); - - ASSERT(!(worksize < (m + n) * sizeof(AccT)), "workspace size error"); - ASSERT(workspace != nullptr, "workspace is null"); - - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - DataT* x_norm = workspace; - DataT* y_norm = workspace; - // TODO: Column major case looks to have lower accuracy for X == Y, - // perhaps the use of stridedSummationKernel could be causing this, - // need to investigate and fix. - if (x == y && is_row_major) { - raft::linalg::rowNorm( - x_norm, x, k, std::max(m, n), raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); - } else { - y_norm += m; - raft::linalg::rowNorm( - x_norm, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); - raft::linalg::rowNorm( - y_norm, y, k, n, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); - } - - ops::cosine_distance_op distance_op{}; - pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - bool is_row_major, - DataT) // metric_arg unused -{ - ops::hamming_distance_op distance_op{k}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - bool is_row_major, - DataT) // metric_arg unused -{ - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - raft::linalg::gemm(handle, - out, - const_cast(x), - const_cast(y), - m, - n, - k, - !is_row_major, - !is_row_major, - is_row_major, - stream); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - bool is_row_major, - DataT) // metric_arg unused -{ - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - // First sqrt x and y - const auto raft_sqrt = raft::linalg::unaryOp; - - raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream); - if (x != y) { raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); } - - // Then calculate Hellinger distance - ops::hellinger_distance_op distance_op{}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); - - // Finally revert sqrt of x and y - raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream); - if (x != y) { raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - bool is_row_major, - DataT) // metric_arg unused -{ - ops::jensen_shannon_distance_op distance_op{}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - bool is_row_major, - DataT) // metric_arg unused -{ - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - auto unaryOp_lambda = [] __device__(DataT input) { - const bool x_zero = (input == 0); - return (!x_zero) * raft::log(input + x_zero); - }; - - auto unaryOp_lambda_reverse = [] __device__(DataT input) { - // reverse previous log (x) back to x using (e ^ log(x)) - const bool x_zero = (input == 0); - return (!x_zero) * raft::exp(input); - }; - - if (x != y) { - raft::linalg::unaryOp( - (DataT*)y, y, n * k, unaryOp_lambda, stream); - } - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - // This op takes some shortcuts when x equals y. So its behavior changes based - // on this. - ops::kl_divergence_op distance_op{is_row_major, x == y}; - - pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); - - if (x != y) { - // Now reverse previous log (x) back to x using (e ^ log(x)) - raft::linalg::unaryOp( - (DataT*)y, y, n * k, unaryOp_lambda_reverse, stream); - } -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - bool is_row_major, - DataT) // metric_arg unused -{ - ops::l1_distance_op distance_op{}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -template -void distance_impl_l2_expanded( // NOTE: different name - bool perform_sqrt, // dispatch on sqrt - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT* workspace, - size_t worksize, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - // raft distance support inputs as float/double and output as uint8_t/float/double. - static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), - "OutT can be uint8_t, float, double," - "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); - - ASSERT(!(worksize < (m + n) * sizeof(AccT)), "workspace size error"); - ASSERT(workspace != nullptr, "workspace is null"); - - DataT* x_norm = workspace; - DataT* y_norm = workspace; - // TODO: Column major case looks to have lower accuracy for X == Y, - // perhaps the use of stridedSummationKernel could be causing this, - // need to investigate and fix. - if ((x == y) && is_row_major) { - raft::linalg::rowNorm(x_norm, - x, - k, - std::max(m, n), - raft::linalg::L2Norm, - is_row_major, - stream, - raft::identity_op{}); - } else { - y_norm += m; - raft::linalg::rowNorm( - x_norm, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); - raft::linalg::rowNorm( - y_norm, y, k, n, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); - } - - ops::l2_exp_distance_op distance_op{perform_sqrt}; - pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT* workspace, - size_t worksize, - FinOpT fin_op, - bool is_row_major, - DataT) // metric_arg unused -{ - bool perform_sqrt = false; - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - distance_impl_l2_expanded( - perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT* workspace, - size_t worksize, - FinOpT fin_op, - bool is_row_major, - DataT) // metric_arg unused -{ - bool perform_sqrt = true; - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - distance_impl_l2_expanded( - perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - bool is_row_major, - DataT) // metric_arg unused -{ - bool perform_sqrt = false; - ops::l2_unexp_distance_op l2_op(perform_sqrt); - - // The unexpanded L2 does not require the norms of a and b to be calculated. - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - pairwise_matrix_dispatch( - l2_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - bool is_row_major, - DataT) // metric_arg unused -{ - bool perform_sqrt = true; - ops::l2_unexp_distance_op l2_op(perform_sqrt); - - // The unexpanded L2 does not require the norms of a and b to be calculated. - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - pairwise_matrix_dispatch( - l2_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - bool is_row_major, - DataT) // metric_arg unused -{ - ops::l_inf_distance_op distance_op{}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - bool is_row_major, - DataT metric_arg) -{ - ops::lp_unexp_distance_op distance_op{metric_arg}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -template -void distance_impl(raft::resources const& handle, - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - bool is_row_major, - DataT) // metric_arg unused -{ - ops::russel_rao_distance_op distance_op{k}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - pairwise_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -/** - * @brief Evaluate pairwise distances with the user epilogue lamba allowed - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * - * @param x first set of points - * @param y second set of points - * @param out output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param fin_op the final gemm epilogue lambda - * @param stream cuda stream - * @param isRowMajor whether the matrices are row-major or col-major - * - * @note fin_op: This is a device lambda which is supposed to operate upon the - * input which is AccType and returns the output in OutType. It's signature is - * as follows:
OutType fin_op(AccType in, int g_idx);
. If one needs - * any other parameters, feel free to pass them via closure. - */ -template -void distance(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* out, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - bool isRowMajor = true, - InType metric_arg = 2.0f) -{ - // raft distance support inputs as float/double and output as uint8_t/float/double. - static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))), - "OutType can be uint8_t, float, double," - "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType)."); - - distance_impl( - handle, - distance_tag{}, - x, - y, - out, - m, - n, - k, - reinterpret_cast(workspace), - worksize, - fin_op, - isRowMajor, - metric_arg); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -/** - * @brief Evaluate pairwise distances for the simple use case - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param stream cuda stream - * @param isRowMajor whether the matrices are row-major or col-major - */ -template -void distance(raft::resources const& handle, - const InType* x, - const InType* y, - OutType* out, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - bool isRowMajor = true, - InType metric_arg = 2.0f) -{ - auto fin_op = raft::identity_op(); - - distance( - handle, x, y, out, m, n, k, workspace, worksize, fin_op, isRowMajor, metric_arg); -} - -/** - * @brief Return the exact workspace size to compute the distance - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam Index_ Index type - * @param x first set of points - * @param y second set of points - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * - * @note If the specified distanceType doesn't need the workspace at all, it - * returns 0. - */ -template -size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k) -{ - size_t worksize = 0; - constexpr bool is_allocated = (distanceType <= cuvs::distance::DistanceType::CosineExpanded) || - (distanceType == cuvs::distance::DistanceType::CorrelationExpanded); - constexpr int numOfBuffers = - (distanceType == cuvs::distance::DistanceType::CorrelationExpanded) ? 2 : 1; - - if (is_allocated) { - // TODO : when X == Y allocate std::max(m, n) instead of m + n when column major input - // accuracy issue is resolved until then we allocate as m + n. - worksize += numOfBuffers * m * sizeof(AccType); - worksize += numOfBuffers * n * sizeof(AccType); - } - - return worksize; -} - -}; // namespace detail -}; // namespace distance -}; // namespace cuvs diff --git a/cpp/include/cuvs/distance/detail/distance_ops/all_ops.cuh b/cpp/include/cuvs/distance/detail/distance_ops/all_ops.cuh deleted file mode 100644 index ecbede398..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/all_ops.cuh +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -// Defines a named requirement "has_cutlass_op" -#include - -// The distance operations: -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include diff --git a/cpp/include/cuvs/distance/detail/distance_ops/canberra.cuh b/cpp/include/cuvs/distance/detail/distance_ops/canberra.cuh deleted file mode 100644 index 8bbdc9945..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/canberra.cuh +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // raft::abs -#include // DI - -namespace cuvs::distance::detail::ops { - -/** - * @brief The canberra distance matrix calculation - * - * It computes the following equation: - * - * c_ij = sum_k |x_ik - y_kj| / ( |x_ik| + |y_kj| ) - */ -template -struct canberra_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - // Load norms of input data - static constexpr bool use_norms = false; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = true; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize; - } - - DI void core(AccT& acc, DataT& x, DataT& y) const - { - const auto diff = raft::abs(x - y); - const auto add = raft::abs(x) + raft::abs(y); - // deal with potential for 0 in denominator by - // forcing 0/1 instead - acc += ((add != 0) * diff / (add + (add == 0))); - }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { - return; - } -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/correlation.cuh b/cpp/include/cuvs/distance/detail/distance_ops/correlation.cuh deleted file mode 100644 index f033f3dfa..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/correlation.cuh +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // DI - -namespace cuvs::distance::detail::ops { - -/** @brief The correlation distance - * - * It computes the following equation: - * - * d(x, y) = ((x - mean(x)) â‹… (y - mean(y))) - * / - * (|| x - mean(x) ||_2 || y - mean(y) ||_2) - */ -template -struct correlation_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - const DataT* x2n; - const DataT* y2n; - IdxT m; - IdxT n; - IdxT k; - - correlation_distance_op( - bool is_row_major, const DataT* x2n_, const DataT* y2n_, IdxT m_, IdxT n_, IdxT k_) noexcept - : x2n(x2n_), y2n(y2n_), m(m_), n(n_), k(k_) - { - // The distance op is typically created before the row-major/col-major - // swapping has been done. So we do it here. - if (!is_row_major) { - std::swap(x2n, y2n); - std::swap(m, n); - } - } - - // Load norms of input data - static constexpr bool use_norms = true; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = false; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize + (2 * (Policy::Mblk + Policy::Nblk) * sizeof(DataT)); - } - - DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { - // Note how we can sneakily get a pointer to shared memory here, to store - // more data. If the implementation of PairwiseDistanceMatKernel ever - // changes, this will be where we find the bugs. - extern __shared__ char smem[]; - - DataT regx2n[Policy::AccRowsPerTh], regy2n[Policy::AccColsPerTh]; - - DataT* sx2Norm = - (DataT*)(&smem[Policy::SmemSize + (Policy::Mblk + Policy::Nblk) * sizeof(DataT)]); - DataT* sy2Norm = (&sx2Norm[Policy::Mblk]); - - // Load x & y norms required by this threadblock in shmem buffer - if (gridStrideX == blockIdx.x * Policy::Nblk) { - for (int i = threadIdx.x; i < Policy::Mblk; i += Policy::Nthreads) { - auto idx = gridStrideY + i; - sx2Norm[i] = idx < m ? x2n[idx] : 0; - } - } - - for (int i = threadIdx.x; i < Policy::Nblk; i += Policy::Nthreads) { - auto idx = gridStrideX + i; - sy2Norm[i] = idx < n ? y2n[idx] : 0; - } - __syncthreads(); - -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - regx2n[i] = sx2Norm[i * Policy::AccThRows + (threadIdx.x / Policy::AccThCols)]; - } -#pragma unroll - for (int i = 0; i < Policy::AccColsPerTh; ++i) { - regy2n[i] = sy2Norm[i * Policy::AccThCols + (threadIdx.x % Policy::AccThCols)]; - } - -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - auto numer = k * acc[i][j] - (regxn[i] * regyn[j]); - auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]); - auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]); - - acc[i][j] = 1 - (numer / raft::sqrt(Q_denom * R_denom)); - } - } - } -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/cosine.cuh b/cpp/include/cuvs/distance/detail/distance_ops/cosine.cuh deleted file mode 100644 index d48731651..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/cosine.cuh +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // DI - -namespace cuvs::distance::detail::ops { - -// Epilogue operator for CUTLASS based kernel -template -struct cosine_cutlass_op { - __device__ cosine_cutlass_op() noexcept {} - __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept - { - return static_cast(1.0) - static_cast(accVal / (aNorm * bNorm)); - } - __device__ AccT operator()(DataT aData) const noexcept { return aData; } -}; - -/** - * @brief the expanded cosine distance matrix calculation - * - * It computes the following equation: - * - * d(x, y) = 1 - (x â‹… y) / ( ||x||_2 ||y||_2) - */ -template -struct cosine_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - // Load norms of input data - static constexpr bool use_norms = true; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = false; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); - } - - DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - acc[i][j] = 1.0 - (acc[i][j] / (regxn[i] * regyn[j])); - } - } - } - - constexpr cosine_cutlass_op get_cutlass_op() const - { - return cosine_cutlass_op(); - } -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/cutlass.cuh b/cpp/include/cuvs/distance/detail/distance_ops/cutlass.cuh deleted file mode 100644 index 6d928314d..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/cutlass.cuh +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // std::false_type -#include // std::declval - -namespace cuvs::distance::detail::ops { - -// This file defines the named requirement "has_cutlass_op" that can be used to -// determine if a distance operation has a CUTLASS op that can be used to pass -// to CUTLASS. Examples of distance operations that satisfy this requirement are -// cosine_distance_op and l2_exp_distance_op. - -// Primary template handles types that do not support CUTLASS. -// This pattern is described in: -// https://en.cppreference.com/w/cpp/types/void_t -template -struct has_cutlass_op : std::false_type {}; - -// Specialization recognizes types that do support CUTLASS -template -struct has_cutlass_op().get_cutlass_op())>> - : std::true_type {}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/hamming.cuh b/cpp/include/cuvs/distance/detail/distance_ops/hamming.cuh deleted file mode 100644 index 7c6553f38..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/hamming.cuh +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // DI - -namespace cuvs::distance::detail::ops { - -/** - * @brief the Hamming Unexpanded distance matrix calculation - * It computes the following equation: - * - * c_ij = sum_k (x_ik != y_kj) / k - */ -template -struct hamming_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - IdxT k; - - hamming_distance_op(IdxT k_) noexcept : k(k_) {} - - // Load norms of input data - static constexpr bool use_norms = false; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = false; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize; - } - - DI void core(AccT& acc, DataT& x, DataT& y) const { acc += (x != y); }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { - const DataT one_over_k = DataT(1.0) / k; -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - acc[i][j] *= one_over_k; - } - } - } -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/hellinger.cuh b/cpp/include/cuvs/distance/detail/distance_ops/hellinger.cuh deleted file mode 100644 index ad5ca3156..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/hellinger.cuh +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include // DI - -namespace cuvs::distance::detail::ops { - -/** - * @brief the Hellinger distance matrix calculation - * - * It computes the following equation: - * - * c_ij = sqrt(1 - sum_k sqrt(x_ik * y_kj)) - * - */ -template -struct hellinger_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - // Load norms of input data - static constexpr bool use_norms = false; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = false; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize; - } - - DI void core(AccT& acc, DataT& x, DataT& y) const - { - // This is sqrt(x) * sqrt(y). - const auto product = x * y; - acc += product; - }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - const auto finalVal = (1 - acc[i][j]); - const auto rectifier = (!signbit(finalVal)); - acc[i][j] = raft::sqrt(rectifier * finalVal); - } - } - } -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/jensen_shannon.cuh b/cpp/include/cuvs/distance/detail/distance_ops/jensen_shannon.cuh deleted file mode 100644 index 216639494..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/jensen_shannon.cuh +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include // raft::log -#include // DI - -namespace cuvs::distance::detail::ops { - -// Describes the computation the jensen_shannon distance - -/** - * @brief the Jensen Shannon distance matrix calculation - * - * It computes the following equation: - * - * c_ij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) - * + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) - */ -template -struct jensen_shannon_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - // Load norms of input data - static constexpr bool use_norms = false; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = true; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize; - } - - DI void core(AccT& acc, DataT& x, DataT& y) const - { - const DataT m = 0.5f * (x + y); - const bool m_zero = (m == 0); - const auto logM = (!m_zero) * raft::log(m + m_zero); - - const bool x_zero = (x == 0); - const bool y_zero = (y == 0); - acc += (-x * (logM - raft::log(x + x_zero))) + (-y * (logM - raft::log(y + y_zero))); - }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - acc[i][j] = raft::sqrt(0.5 * acc[i][j]); - } - } - } -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/kl_divergence.cuh b/cpp/include/cuvs/distance/detail/distance_ops/kl_divergence.cuh deleted file mode 100644 index 929c3a559..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/kl_divergence.cuh +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include // raft::log -#include // DI - -namespace cuvs::distance::detail::ops { - -/** - * @brief the KL Divergence distance matrix calculation - * - * It computes the following equation: - * - * c_ij = 0.5 * sum(x * log (x / y)); - */ -template -struct kl_divergence_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - const bool is_row_major; - const bool x_equal_y; - - kl_divergence_op(bool row_major_, bool x_equal_y_ = false) noexcept - : is_row_major(row_major_), x_equal_y(x_equal_y_) - { - } - - // Load norms of input data - static constexpr bool use_norms = false; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = true; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize; - } - - DI void core(AccT& acc, DataT& x, DataT& y) const - { - // TODO: make sure that these branches get hoisted out of main loop.. Could - // be quite expensive otherwise. - if (x_equal_y) { - if (is_row_major) { - const bool x_zero = (x == 0); - const bool y_zero = (y == 0); - acc += x * (raft::log(x + x_zero) - (!y_zero) * raft::log(y + y_zero)); - } else { - const bool y_zero = (y == 0); - const bool x_zero = (x == 0); - acc += y * (raft::log(y + y_zero) - (!x_zero) * raft::log(x + x_zero)); - } - } else { - if (is_row_major) { - const bool x_zero = (x == 0); - acc += x * (raft::log(x + x_zero) - y); - } else { - const bool y_zero = (y == 0); - acc += y * (raft::log(y + y_zero) - x); - } - } - }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - acc[i][j] = (0.5f * acc[i][j]); - } - } - } -}; -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/l1.cuh b/cpp/include/cuvs/distance/detail/distance_ops/l1.cuh deleted file mode 100644 index 76eaffaf3..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/l1.cuh +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include // DI - -namespace cuvs::distance::detail::ops { - -/** - * @brief the L1 distance matrix calculation - * - * It computes the following equation: - * - * c_ij = sum_k abs(x_ik - y_kj) - */ -template -struct l1_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - // Do not load norms of data, the computation of L1 distance does not use them. - static constexpr bool use_norms = false; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = false; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize; - } - - DI void core(AccT& acc, DataT& x, DataT& y) const { acc += raft::abs(x - y); }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { - return; - }; -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/cuvs/distance/detail/distance_ops/l2_exp.cuh deleted file mode 100644 index f45c41206..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/l2_exp.cuh +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include // DI - -namespace cuvs::distance::detail::ops { - -/** - * Reserve 1 digit of precision from each floating-point type - * for round-off error tolerance. - * @tparam DataT - */ -template -__device__ constexpr DataT get_clamp_precision() -{ - switch (sizeof(DataT)) { - case 2: return 1e-3; - case 4: return 1e-6; - case 8: return 1e-15; - default: return 0; - } -} - -// Epilogue operator for CUTLASS based kernel -template -struct l2_exp_cutlass_op { - bool sqrt; - - __device__ l2_exp_cutlass_op() noexcept : sqrt(false) {} - __device__ l2_exp_cutlass_op(bool isSqrt) noexcept : sqrt(isSqrt) {} - inline __device__ AccT operator()(DataT aNorm, DataT bNorm, DataT accVal) const noexcept - { - AccT outVal = aNorm + bNorm - DataT(2.0) * accVal; - - /** - * Self-neighboring points should have (aNorm == bNorm) == accVal and the dot product (accVal) - * can sometimes have round-off errors, which will cause (aNorm == bNorm) ~ accVal instead. - */ - outVal = outVal * !((outVal * outVal < get_clamp_precision()) * (aNorm == bNorm)); - return sqrt ? raft::sqrt(outVal * (outVal > 0)) : outVal; - } - - __device__ AccT operator()(DataT aData) const noexcept { return aData; } -}; - -/** - * @brief the expanded euclidean distance matrix calculation - * - * It computes the following equation: - * - * c_ij = - 2 sum_k x_ik * y_kj + ||x_i.||_2 + ||y_.j||_2 - * - */ -template -struct l2_exp_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - const bool sqrt; - - l2_exp_distance_op(bool sqrt_) noexcept : sqrt(sqrt_) {} - - // Load norms of input data - static constexpr bool use_norms = true; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = false; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); - } - - DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - DataT accVal = acc[i][j]; - DataT val = regxn[i] + regyn[j] - (DataT)2.0 * accVal; - - /** - * Self-neighboring points should have (aNorm == bNorm) == accVal and the dot product - * (accVal) can sometimes have round-off errors, which will cause (aNorm == bNorm) ~ accVal - * instead. - */ - acc[i][j] = - val * (val > 0) * !((val * val < get_clamp_precision()) * (regxn[i] == regyn[j])); - } - } - if (sqrt) { -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - acc[i][j] = raft::sqrt(acc[i][j]); - } - } - } - } - - constexpr l2_exp_cutlass_op get_cutlass_op() const - { - return l2_exp_cutlass_op(sqrt); - } -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/l2_unexp.cuh b/cpp/include/cuvs/distance/detail/distance_ops/l2_unexp.cuh deleted file mode 100644 index aa6cc27f3..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/l2_unexp.cuh +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // DI - -namespace cuvs::distance::detail::ops { - -/** - * @brief the unexpanded euclidean distance matrix calculation - * - * It computes the following equation: - * - * c_ij = optional_sqrt ( sum_k (x_ik - y_kj)^2 ) - */ -template -struct l2_unexp_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - bool sqrt; - - l2_unexp_distance_op(bool sqrt_) noexcept : sqrt(sqrt_) {} - - // Do not load norms of data, the computation of L1 distance does not use them. - static constexpr bool use_norms = false; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = false; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize; - } - - DI void core(AccT& acc, DataT& x, DataT& y) const - { - const auto diff = x - y; - acc += diff * diff; - }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { - if (sqrt) { -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - acc[i][j] = raft::sqrt(acc[i][j]); - } - } - } - }; -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/l_inf.cuh b/cpp/include/cuvs/distance/detail/distance_ops/l_inf.cuh deleted file mode 100644 index d8f9384d7..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/l_inf.cuh +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // DI - -namespace cuvs::distance::detail::ops { - -/** - * @brief the L_inf (Chebyshev) distance matrix calculation - * - * It computes the following equation: - * - * c_ij = max_k | x_ik - y_kj | - */ -template -struct l_inf_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - // Load norms of input data - static constexpr bool use_norms = false; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = false; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize; - } - - DI void core(AccT& acc, DataT& x, DataT& y) const - { - const auto diff = raft::abs(x - y); - acc = raft::max(acc, diff); - }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { - return; - } -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/lp_unexp.cuh b/cpp/include/cuvs/distance/detail/distance_ops/lp_unexp.cuh deleted file mode 100644 index 6136f9f3e..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/lp_unexp.cuh +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include // raft::pow, raft::abs -#include // DI - -namespace cuvs::distance::detail::ops { - -/** - * @brief the unexpanded Lp (Minkowski) distance matrix calculation - * - * It computes the following equation: - * - * c_ij = (sum_k |x_ik - y_jk|^p)^(1/p) - */ -template -struct lp_unexp_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - DataT p; - - lp_unexp_distance_op(DataT p_) noexcept : p(p_) {} - - // Load norms of input data - static constexpr bool use_norms = false; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = true; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize; - } - - DI void core(AccT& acc, DataT& x, DataT& y) const - { - const auto diff = raft::abs(x - y); - acc += raft::pow(diff, p); - }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { - const auto one_over_p = 1.0f / p; -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - acc[i][j] = raft::pow(acc[i][j], one_over_p); - } - } - } -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/russel_rao.cuh b/cpp/include/cuvs/distance/detail/distance_ops/russel_rao.cuh deleted file mode 100644 index 5dffdcdb8..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/russel_rao.cuh +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // DI - -namespace cuvs::distance::detail::ops { - -/** - * @brief the Russell Rao distance matrix calculation - * - * It computes the following equation: - * - * c_ij = (k - (sum_k x_ik * y_kj)) / k - */ -template -struct russel_rao_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - IdxT k; - const float one_over_k; - - russel_rao_distance_op(IdxT k_) noexcept : k(k_), one_over_k(1.0f / k_) {} - - // Load norms of input data - static constexpr bool use_norms = false; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = false; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize; - } - - DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { -#pragma unroll - for (int i = 0; i < Policy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < Policy::AccColsPerTh; ++j) { - acc[i][j] = (k - acc[i][j]) * one_over_k; - } - } - } -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/distance_ops/template.cuh b/cpp/include/cuvs/distance/detail/distance_ops/template.cuh deleted file mode 100644 index bdb933237..000000000 --- a/cpp/include/cuvs/distance/detail/distance_ops/template.cuh +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // DI - -namespace cuvs::distance::detail::ops { - -// Describes the computation the template distance -// -// Fill in the TODO items. - -template -struct template_distance_op { - using DataT = DataType; - using AccT = AccType; - using IdxT = IdxType; - - TODO member; - - template_distance_op(TODO member_) noexcept : member(member_) {} - - // Load norms of input data - static constexpr bool use_norms = TODO; - // Whether the core function requires so many instructions that it makes sense - // to reduce loop unrolling, etc. We do this to keep compile times in check. - static constexpr bool expensive_inner_loop = false; - - // Size of shared memory. This is normally decided by the kernel policy, but - // some ops such as correlation_distance_op use more. - template - static constexpr size_t shared_mem_size() - { - return Policy::SmemSize + TODO; - } - - DI void core(AccT& acc, DataT& x, DataT& y) const { TODO; }; - - template - DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT* regxn, - DataT* regyn, - IdxT gridStrideX, - IdxT gridStrideY) const - { - TODO; - } - - // If exist, returns a cutlass op that performs the same operation. - // See cosine and l2_exp distance ops for an example. - constexpr l2_exp_cutlass_op get_cutlass_op() const { TODO; } -}; - -} // namespace cuvs::distance::detail::ops diff --git a/cpp/include/cuvs/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h b/cpp/include/cuvs/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h deleted file mode 100644 index f659ed256..000000000 --- a/cpp/include/cuvs/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h +++ /dev/null @@ -1,671 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file - - \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. - - The epilogue rearranges the result of a matrix product through shared memory to match canonical - tensor layouts in global memory. Epilogues support conversion and reduction operations. - -This file contains a customized version of EpilogueWithBroadcast from CUTLASS 2.9.1 -(https://github.com/NVIDIA/cutlass/blob/v2.9.1/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h) - -Changes: -- customized the compute_source_needed_() and apply_output_operator_() to suit the needs of per row -reduction -*/ - -#pragma once - -#if defined(__CUDACC_RTC__) -#include -#include -#else -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include -#include - -#include - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace epilogue { -namespace threadblock { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// This base class is meant to define the concept required of the -/// EpilogueWithBroadcast::OutputOp -template -struct EpilogueWithBroadcastOpBaseCustom { - using ElementOutput = ElementC_; - using ElementAccumulator = ElementAccumulator_; - using ElementCompute = ElementCompute_; - using ElementZ = ElementZ_; - using ElementT = ElementT_; - static int const kElementsPerAccess = ElementsPerAccess; - - using FragmentAccumulator = Array; - using FragmentCompute = Array; - using FragmentC = Array; - using FragmentZ = Array; - using FragmentT = Array; - - /// If true, the 'Z' tensor is stored - static bool const kStoreZ = StoreZ; - - /// If true, the 'T' tensor is stored - static bool const kStoreT = StoreT; - - /// Parameters structure - required - struct Params {}; - - // - // Methods - // - - /// Constructor from Params - EpilogueWithBroadcastOpBaseCustom(Params const& params_) {} - - /// Determine if the source is needed. May return false if - bool is_source_needed() const { return true; } - - CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition, int k_partition_count) {} - - /// Applies the operation when is_source_needed() is true - CUTLASS_HOST_DEVICE - void operator()(FragmentZ& frag_Z, - FragmentT& frag_T, - FragmentAccumulator const& AB, - FragmentC const& frag_C, - FragmentCompute const& V) const - { - } - - /// Applies the operation when is_source_needed() is false - CUTLASS_HOST_DEVICE - void operator()(FragmentZ& frag_Z, - FragmentT& frag_T, - FragmentAccumulator const& AB, - FragmentCompute const& V) const - { - } -}; - -//////////////////////////////////////////////////////////////////////////////// - -/// Epilogue operator with bias vector broadcast over columns. -/// -/// Computes the following: -/// -/// -/// Z, T = OutputOp(AB, C, Broadcast) -/// -/// if (ElementwiseOp::kStoreZ) { -/// store(converted_u); -/// } -/// -/// if (ElementwiseOp::kStoreT) { -/// store(v); -/// } -/// -template < - typename Shape_, ///< Shape of threadblock tile (concept: GemmShape) - typename WarpMmaOperator_, ///< Warp-level MMA operator (concept: gemm::warp::MmaTensorOp) - int PartitionsK, ///< Number of partitions of the K dimension - typename OutputTileIterator_, ///< Tile iterator reading and writing output tensors (z) - typename TensorTileIterator_, ///< Additional tile iterator for tensor-valued operands (t) - typename ElementVector_, ///< Pointer to broadcast vector - typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting accumulators - typename WarpTileIterator_, ///< Warp-scoped tile iterator writing accumulators to SMEM - typename SharedLoadIterator_, ///< Threadblock-scoped tile iterator loading from SMEM - typename OutputOp_, ///< Output operator - concept is EpilogueWithBroadcastOp - typename Padding_, ///< Padding added to SMEM allocation to avoid bank conflicts (concept: - ///< MatrixShape) - int FragmentsPerPartition = 1, ///< Used to coarsten the epilogue granularity - int IterationsUnroll = ///< Used to reduce binary size when epilogue op is large - (!IsEpilogueFunctorHeavy::value)> -class EpilogueWithBroadcastCustom : public EpilogueBase { - public: - using Base = EpilogueBase; - - using Shape = Shape_; - using WarpMmaOperator = WarpMmaOperator_; - static int const kPartitionsK = PartitionsK; - using OutputTileIterator = OutputTileIterator_; - using TensorTileIterator = TensorTileIterator_; - using ElementVector = ElementVector_; - using AccumulatorFragmentIterator = AccumulatorFragmentIterator_; - using WarpTileIterator = WarpTileIterator_; - using SharedLoadIterator = SharedLoadIterator_; - using OutputOp = OutputOp_; - using Padding = Padding_; - - using Layout = layout::RowMajor; - using LongIndex = typename Layout::LongIndex; - - /// The complete warp-level accumulator tile - using AccumulatorTile = typename Base::AccumulatorTile; - - /// Accumulator element - using ElementAccumulator = typename WarpTileIterator::Element; - - /// Compute data type produced by the output op - using ElementCompute = typename OutputOp::ElementCompute; - - /// Compute fragment - using FragmentCompute = Array; - - /// Thread map used by output tile iterators - using ThreadMap = typename OutputTileIterator::ThreadMap; - - /// Fragment object used to store the broadcast values - using BroadcastFragment = - Array; - - /// Output element - using ElementOutput = typename OutputTileIterator::Element; - - /// Data type of additional tensor - using ElementTensor = typename TensorTileIterator::Element; - - /// Output access size - static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess; - - /// Tensor reference to destination tensor - using TensorRef = typename OutputTileIterator::TensorRef; - - /// Tensor reference to sync tensor - using SyncTensorRef = typename cutlass::TensorRef; - - /// Const tensor reference to source tensor - using ConstTensorRef = typename OutputTileIterator::ConstTensorRef; - - /// Array type used to output - using OutputAccessType = - Array; - - /// Array type used by output functor - using AccumulatorAccessType = - Array; - - /// Array type used by output functor - using ComputeAccessType = Array; - - /// Tensor access type - using TensorAccessType = Array; - - /// Number of warps - using WarpCount = typename Base::WarpCount; - - /// Shared memory allocation from epilogue base class - using BaseSharedStorage = typename Base::SharedStorage; - - static int constexpr kSmemTiles = - Base::kFragmentsPerIteration > 1 ? Base::kFragmentsPerIteration : kPartitionsK; - static int constexpr kSmemPointerOffset = Base::SharedStorage::StorageShape::kCount / kSmemTiles; - - /// Used for the broadcast - struct BroadcastDetail { - /// Number of threads per warp - static int const kWarpSize = 32; - - static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; - - /// Number of distinct scalar column indices handled by each thread - static int const kColumnsPerThread = - ThreadMap::Iterations::kColumn * ThreadMap::kElementsPerAccess; - - /// Number of distinct scalar row indices handled by each thread - static int const kRowsPerThread = - ThreadMap::Iterations::kCount / ThreadMap::Iterations::kColumn; - - /// Number of threads per threadblock - static int const kThreadCount = kWarpSize * WarpCount::kCount; - - /// Number of distinct threads per row of output tile - static int const kThreadsPerRow = (Shape::kN / kColumnsPerThread); - - /// Number of distinct threads which must be reduced during the final reduction phase within the - /// threadblock. - static int const kThreadRows = kThreadCount / kThreadsPerRow; - - /// I'm not sure what I meant here. - static int const kThreadAccessesPerRow = - const_max(1, (Shape::kN + kThreadCount - 1) / kThreadCount); - - /// Shape of the shared memory allocation for the epilogue - using StorageShape = MatrixShape; - - /// Debug printing - CUTLASS_DEVICE - static void print() - { -#if 0 - printf("BroadcastDetail {\n"); - printf( - " kColumnsPerThread: %d\nkRowsPerThread: %d\n,kThreadCount: %d\nkThreadsPerRow: %d\n" - "kThreadRows: %d\nThreadAccessesPerRow: %d\nStorageShape: %d x %d (count: %d)\n", - kColumnsPerThread, - kRowsPerThread, - kThreadCount, - kThreadsPerRow, - kThreadRows, - kThreadAccessesPerRow, - StorageShape::kRow, - StorageShape::kColumn, - StorageShape::kCount - ); - printf("};\n"); -#endif - } - }; - - /// Shared storage structure (shadows base) with additional SMEM buffer for reduction - struct SharedStorage { - union { - BaseSharedStorage base; - }; - - CUTLASS_HOST_DEVICE - SharedStorage() {} - }; - - public: - static_assert(SharedLoadIterator::Fragment::kElements == TensorTileIterator::Fragment::kElements, - "Mismatch between shared load iterator and output tile iterator."); - - static_assert(OutputTileIterator::kElementsPerAccess, - "OutputTileIterator::kElementsPerAccess must not be zero."); - - static_assert(!(OutputTileIterator::Fragment::kElements % OutputTileIterator::kElementsPerAccess), - "Divisibility"); - - private: - /// Loads fragment from shared memory aligned with output tensor - SharedLoadIterator shared_load_iterator_; - - /// Thread index within the threadblock - int thread_idx_; - - public: - /// Constructor - CUTLASS_DEVICE - EpilogueWithBroadcastCustom(SharedStorage& shared_storage, ///< Shared storage object - int thread_idx, ///< ID of a thread within the threadblock - int warp_idx, ///< ID of warp within threadblock - int lane_idx ///< Id of thread within warp - ) - : Base(shared_storage.base, thread_idx, warp_idx, lane_idx), - shared_load_iterator_(shared_storage.base.reference(), thread_idx), - thread_idx_(thread_idx) - { - } - - /// Streams the result to global memory - CUTLASS_DEVICE - void operator()( - OutputOp const& output_op, ///< Output operator - ElementVector const* broadcast_ptr, ///< Broadcast vector - AccumulatorTile const& accumulators, ///< Complete warp-level accumulator tile - OutputTileIterator source_iterator, ///< Tile iterator for source accumulator matrix - TensorTileIterator - tensor_iterator, ///< Threadblock tile iterator for additional tensor operand - MatrixCoord const& - problem_size = ///< Problem size needed to guard against out-of-bounds accesses - MatrixCoord(Shape::kM, Shape::kN), - MatrixCoord const& - threadblock_offset = ///< Threadblock's initial offset within the problem size space - MatrixCoord()) - { - BroadcastFragment broadcast_fragment; - - load_broadcast_fragment_(broadcast_fragment, broadcast_ptr, problem_size, threadblock_offset); - - compute_source_needed_( - output_op, broadcast_fragment, accumulators, source_iterator, tensor_iterator); - } - - private: - CUTLASS_DEVICE - void load_broadcast_fragment_( - BroadcastFragment& - broadcast_fragment, ///< Fragment containing the accumulated partial reduction over columns - ElementVector const* broadcast_ptr, ///< Broadcast vector - MatrixCoord const& - problem_size, ///< Problem size needed to guard against out-of-bounds accesses - MatrixCoord const& - threadblock_offset ///< Threadblock's initial offset within the problem size space - ) - { - broadcast_fragment.clear(); - - // If no pointer is supplied, set with all zeros and avoid memory accesses - if (!broadcast_ptr) { return; } - - int thread_initial_column = ThreadMap::initial_offset(thread_idx_).column(); - - int thread_column_idx = threadblock_offset.column() + thread_initial_column; - broadcast_ptr += thread_initial_column; - - NumericArrayConverter - converter; - using AccessType = AlignedArray; - using ComputeFragmentType = Array; - - ComputeFragmentType* frag_ptr = reinterpret_cast(&broadcast_fragment); - - CUTLASS_PRAGMA_UNROLL - for (int j = 0; j < ThreadMap::Iterations::kColumn; ++j) { - AccessType loaded; - - loaded.clear(); - - if (thread_column_idx < problem_size.column()) { - loaded = *reinterpret_cast(broadcast_ptr); - } - - ComputeFragmentType cvt = converter(loaded); - frag_ptr[j] = cvt; - - thread_column_idx += ThreadMap::Delta::kColumn; - broadcast_ptr += ThreadMap::Delta::kColumn; - } - } - - template - struct acc2smem_source_not_needed; - - template - struct acc2smem_source_not_needed> { - template - CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator, - WarpTileIterator& warp_tile_iterator) - { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < Advance; i++) { - ++accum_fragment_iterator; - } - - CUTLASS_PRAGMA_UNROLL - for (int p = 0; p < Base::kFragmentsPerIteration; ++p) { - typename AccumulatorFragmentIterator::Fragment accum_fragment; - - accum_fragment_iterator.load(accum_fragment); - ++accum_fragment_iterator; - - warp_tile_iterator.store(accum_fragment); - if (p < Base::kFragmentsPerIteration - 1) { - warp_tile_iterator.add_pointer_offset(kSmemPointerOffset); - } - } - - if (Base::kFragmentsPerIteration > 1) { - warp_tile_iterator.add_pointer_offset(kSmemPointerOffset * - (1 - Base::kFragmentsPerIteration)); - } - } - - CUTLASS_DEVICE - static void push(size_t pos, - AccumulatorFragmentIterator const& iterator_begin, - WarpTileIterator& warp_tile_iterator) - { - int dummy[] = { - (pos == (Seq * Base::kFragmentsPerIteration)) && - (helper(iterator_begin, warp_tile_iterator), 0)...}; - - CUTLASS_UNUSED(dummy[0]); - } - }; - - /// Streams the result to global memory - CUTLASS_DEVICE - void compute_source_not_needed_( - OutputOp const& output_op, ///< Output operator - BroadcastFragment const& - broadcast_fragment, ///< Fragment containing the accumulated partial reduction over columns - OutputTileIterator destination_iterator, ///< Tile iterator for destination - AccumulatorTile const& accumulators, ///< Complete warp-level accumulator tile - TensorTileIterator tensor_iterator ///< Threadblock tile iterator for additioanl tensor operand - ) - { - } - - template - struct acc2smem_source_needed; - - template - struct acc2smem_source_needed> { - template - CUTLASS_DEVICE static void helper(AccumulatorFragmentIterator accum_fragment_iterator, - WarpTileIterator& warp_tile_iterator) - { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < Advance; i++) { - ++accum_fragment_iterator; - } - - typename AccumulatorFragmentIterator::Fragment accum_fragment; - accum_fragment_iterator.load(accum_fragment); - warp_tile_iterator.store(accum_fragment); - } - - CUTLASS_DEVICE - static void push(size_t pos, - AccumulatorFragmentIterator const& iterator_begin, - WarpTileIterator& warp_tile_iterator) - { - int dummy[] = {(pos == Seq) && (helper(iterator_begin, warp_tile_iterator), 0)...}; - } - }; - - /// Streams the result to global memory - CUTLASS_DEVICE - void compute_source_needed_( - OutputOp const& output_op, ///< Output operator - BroadcastFragment const& - broadcast_fragment, ///< Fragment containing the accumulated partial reduction over columns - AccumulatorTile const& accumulators, ///< Complete warp-level accumulator tile - OutputTileIterator - source_iterator, ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) - TensorTileIterator tensor_iterator ///< Threadblock tile iterator for additioanl tensor operand - ) - { - typename OutputTileIterator::Fragment source_fragment; - source_fragment.clear(); - - // - // Iterator over warp-level accumulator fragment - // - - AccumulatorFragmentIterator accum_fragment_iterator(accumulators); - - // - // Iterate over accumulator tile - // - -#pragma unroll(IterationsUnroll ? OutputTileIterator::kIterations : 1) - for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) { - // - // Convert and store fragment - // - - //__syncthreads(); - - acc2smem_source_needed>::push( - iter, accum_fragment_iterator, this->warp_tile_iterator_); - - __syncthreads(); - - // - // Load fragments from shared memory - // - - typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK]; - - shared_load_iterator_.load(aligned_accum_fragment[0]); - - // - // Apply output operation - // - - typename TensorTileIterator::Fragment frag_T; - - // - // Load the source - // - - source_iterator.load(source_fragment); - ++source_iterator; - - apply_output_operator_( - frag_T, output_op, aligned_accum_fragment[0], source_fragment, broadcast_fragment); - - // - // Conditionally store fragments - // - if (OutputOp::kStoreT) { - tensor_iterator.store(frag_T); - ++tensor_iterator; - } - } - } - - /// Helper to invoke the output functor over each vector of output - CUTLASS_DEVICE - void apply_output_operator_(typename TensorTileIterator::Fragment& frag_T, - OutputOp const& output_op, - typename SharedLoadIterator::Fragment const& frag_AB, - typename OutputTileIterator::Fragment const& frag_C, - BroadcastFragment const& frag_Broadcast) - { - using AccessTypeT = Array; - using AccessTypeBroadcast = Array; - - AccessTypeT* frag_T_ptr = reinterpret_cast(&frag_T); - - AccumulatorAccessType const* frag_AB_ptr = - reinterpret_cast(&frag_AB); - - OutputAccessType const* frag_C_ptr = reinterpret_cast(&frag_C); - - AccessTypeBroadcast const* frag_Broadcast_ptr = - reinterpret_cast(&frag_Broadcast); - - int const kOutputOpIterations = - TensorTileIterator::Fragment::kElements / TensorTileIterator::kElementsPerAccess; - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kOutputOpIterations; ++i) { - output_op(frag_T_ptr[i], - frag_AB_ptr[i], - frag_C_ptr[(i / ThreadMap::Iterations::kColumn)], - frag_Broadcast_ptr[i % ThreadMap::Iterations::kColumn]); - } - } - - /// Helper to invoke the output functor over each vector of output - CUTLASS_DEVICE - void apply_output_operator_source_not_needed_( - typename OutputTileIterator::Fragment& frag_Z, - typename TensorTileIterator::Fragment& frag_T, - OutputOp const& output_op, - typename SharedLoadIterator::Fragment const& frag_AB, - BroadcastFragment const& frag_Broadcast) - { - } -}; - -//////////////////////////////////////////////////////////////////////////////// - -} // namespace threadblock -} // namespace epilogue -} // namespace cutlass - -//////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/include/cuvs/distance/detail/fused_distance_nn/cutlass_base.cuh b/cpp/include/cuvs/distance/detail/fused_distance_nn/cutlass_base.cuh deleted file mode 100644 index 7c0b5d127..000000000 --- a/cpp/include/cuvs/distance/detail/fused_distance_nn/cutlass_base.cuh +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wtautological-compare" - -// We define CUTLASS_NAMESPACE in case -// RAFT cmake is not used -#ifndef CUTLASS_NAMESPACE -#define cutlass raft_cutlass -#endif - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include // FusedDistanceNNEpilogueElementwise -#include // FusedDistanceNNGemm -#include // getMultiProcessorCount -#include // RAFT_CUTLASS_TRY - -namespace cuvs { -namespace distance { -namespace detail { - -template -void cutlassFusedDistanceNN(const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - int* mutexes, - CGReduceOpT cg_reduce_op, - DistanceFn dist_op, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - cudaStream_t stream) -{ - using EpilogueOutputOp = cutlass::epilogue::thread::FusedDistanceNNEpilogueElementwise< - DataT, // ElementC_ - AccT, // ElementAccumulator_ - DataT, // ElementCompute_ - AccT, // ElementZ_ - OutT, // ElementT_ - // 128 / cutlass::sizeof_bits::value, - 1, // Elements per access 1 - DistanceFn, - CGReduceOpT, - ReduceOpT, - KVPReduceOpT>; - constexpr int batch_count = 1; - - typename EpilogueOutputOp::Params epilog_op_param( - dist_op, cg_reduce_op, redOp, pairRedOp, mutexes); - - // Number of pipelines you want to use - constexpr int NumStages = 3; - // Alignment - constexpr int Alignment = VecLen; - - // default initialize problem size with row major inputs - auto problem_size = cutlass::gemm::GemmCoord(m, n, k); - - constexpr bool isRowMajor = true; - - using fusedDistanceNNKernel = - typename cutlass::gemm::kernel::FusedDistanceNNGemm::GemmKernel; - - using fusedDistanceNN = cutlass::gemm::device::GemmGrouped; - - int num_blocks_per_sm = fusedDistanceNN::maximum_active_blocks(); - int num_sms = raft::getMultiProcessorCount(); - int full_wave = num_blocks_per_sm * num_sms; - constexpr int mmaShapeM = fusedDistanceNNKernel::Mma::Shape::kM; - constexpr int mmaShapeN = fusedDistanceNNKernel::Mma::Shape::kN; - int columnTiles = (problem_size.n() - 1 + mmaShapeN) / mmaShapeN; - int rowTiles = (problem_size.m() - 1 + mmaShapeM) / mmaShapeM; - int totalTiles = columnTiles * rowTiles; - int thread_blocks = - rowTiles < full_wave ? (totalTiles < full_wave ? totalTiles : full_wave) : rowTiles; - - typename fusedDistanceNN::Arguments arguments{ - problem_size, - batch_count, // num of problems. - thread_blocks, - epilog_op_param, - x, - y, - xn, // C matrix eq vector param, which here is A norm - (DataT*)yn, // this is broadcast vec, which is required to be non-const param - dOutput, // Output distance matrix - (int64_t)lda, // stride A - (int64_t)ldb, // stride B - (int64_t)1, // stride A norm - (int64_t)ldd // stride Output matrix - }; - - // Using the arguments, query for extra workspace required for matrix multiplication computation - size_t workspace_size = fusedDistanceNN::get_workspace_size(arguments); - // Allocate workspace memory - rmm::device_uvector workspace(workspace_size, stream); - // Instantiate CUTLASS kernel depending on templates - fusedDistanceNN fusedDistanceNN_op; - // Check the problem size is supported or not - RAFT_CUTLASS_TRY(fusedDistanceNN_op.can_implement(arguments)); - // Initialize CUTLASS kernel with arguments and workspace pointer - RAFT_CUTLASS_TRY(fusedDistanceNN_op.initialize(arguments, workspace.data(), stream)); - // Launch initialized CUTLASS kernel - RAFT_CUTLASS_TRY(fusedDistanceNN_op.run(stream)); -} - -}; // namespace detail -}; // namespace distance -}; // namespace cuvs - -#pragma GCC diagnostic pop diff --git a/cpp/include/cuvs/distance/detail/fused_distance_nn/epilogue.cuh b/cpp/include/cuvs/distance/detail/fused_distance_nn/epilogue.cuh deleted file mode 100644 index 7053f2702..000000000 --- a/cpp/include/cuvs/distance/detail/fused_distance_nn/epilogue.cuh +++ /dev/null @@ -1,136 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file - \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. - -This is adapted from DefaultEpilogueWithBroadcastTensorOp from CUTLASS 2.9.0 -(https://github.com/NVIDIA/cutlass/blob/master/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h#L75) - -This epilogue allows us to load norm buffers using PredicatedTileIteratorNormVec -and EpilogueWithBroadcast used for distances L2/cosine as well as applies user-define elementwise -operation. --- A norm load is provided PredicatedTileIteratorNormVec --- B norm load is provided by EpilogueWithBroadcast --- elementwise operation is provided by OutputOp -*/ - -#pragma once - -#include -#include -#include - -#include - -#include -#include -#include -#include - -#include -#include - -//////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace epilogue { -namespace threadblock { - -//////////////////////////////////////////////////////////////////////////////// - -/// Defines sensible defaults for epilogues for TensorOps. -template -struct FusedDistanceNNEpilogue { - /// Use defaults related to the existing epilogue - using Base = - DefaultEpilogueTensorOp; - - // - // Stores the result z = (y = GEMM(A, B, C), broadcast) - // - using RowNormTileIterator = cutlass::epilogue::threadblock:: - PredicatedTileIteratorNormVecSmem; - - // - // Additional tensor tile iterator - stores t = Elementwise(z) - // - using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIteratorReducedVec< - typename Base::OutputTileThreadMap, - ElementTensor, - LayoutT, - typename OutputOp::Params>; - - /// Define the epilogue - using Epilogue = cutlass::epilogue::threadblock::EpilogueWithBroadcastCustom< - Shape, - WarpMmaTensorOp, - PartitionsK, - RowNormTileIterator, - OutputTileIterator, - ElementVector, - typename Base::AccumulatorFragmentIterator, - typename Base::WarpTileIterator, - typename Base::SharedLoadIterator, - OutputOp, - typename Base::Padding, - Base::kFragmentsPerIteration>; -}; - -} // namespace threadblock -} // namespace epilogue -} // namespace cutlass - -//////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/include/cuvs/distance/detail/fused_distance_nn/epilogue_elementwise.cuh b/cpp/include/cuvs/distance/detail/fused_distance_nn/epilogue_elementwise.cuh deleted file mode 100644 index a21f3d60e..000000000 --- a/cpp/include/cuvs/distance/detail/fused_distance_nn/epilogue_elementwise.cuh +++ /dev/null @@ -1,216 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// -/*! \file - \brief Functor performing distance operations used by epilogues of pairwise distance - * kernels. -* This is adapted from LinearCombinationBiasElementwise from CUTLASS 2.9.0 -* customized for applying elementwise distance formula on accumulated GEMM value -* and applying user-defined operation which can convert distance values to key-value pair. -* . -*/ - -#pragma once - -#include -#include -#include -#include -#include - -#include - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace epilogue { -namespace thread { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// This base class is meant to define the concept required of the -/// EpilogueWithBroadcast::OutputOp -template -class FusedDistanceNNEpilogueElementwise { - public: - using ElementOutput = ElementC_; - using ElementC = ElementC_; - using ElementAccumulator = ElementAccumulator_; - using ElementCompute = ElementCompute_; - using ElementZ = ElementZ_; - using ElementT = ElementT_; - static int const kElementsPerAccess = ElementsPerAccess; - static int const kCount = kElementsPerAccess; - - using DistanceOp = DistanceOp_; - using CGReduceOp = CGReduceOp_; - - using FragmentAccumulator = Array; - using FragmentCompute = Array; - using FragmentC = Array; - using FragmentZ = Array; - using OutValT = typename CGReduceOp::AccTypeT; - using FragmentT = Array; - - using FragmentOutput = FragmentZ; - - static bool const kIsHeavy = true; // ElementwiseOp::kIsHeavy; - - /// If true, the 'Z' tensor is stored - static bool const kStoreZ = false; // We don't store anything in Z, - - /// If true, the 'T' tensor is stored - static bool const kStoreT = true; // this is our final output storage. - - /// Host-constructable parameters structure - struct Params { - CGReduceOp_ cg_reduce_op; - DistanceOp_ dist_op_; - KVPReduceOpT_ pair_redop_; - ReduceOpT_ red_op_; - int* mutexes_; - using CGReduceT = CGReduceOp_; - // - // Methods - // - CUTLASS_HOST_DEVICE - Params(DistanceOp_ dist_op, - CGReduceOp cg_reduce_op, - ReduceOpT_ red_op, - KVPReduceOpT_ pair_redop, - int* mutexes) - : cg_reduce_op(cg_reduce_op), - dist_op_(dist_op), - pair_redop_(pair_redop), - red_op_(red_op), - mutexes_(mutexes) - { - } - - CUTLASS_HOST_DEVICE - Params() {} - }; - - private: - // - // Data members - // - DistanceOp_ elementwise_op; - KVPReduceOpT_ pair_redop; - - public: - ReduceOpT_ red_op; - - // - // Methods - // - - /// Constructor from Params - CUTLASS_HOST_DEVICE - FusedDistanceNNEpilogueElementwise(Params const& params) - : elementwise_op(params.dist_op_), pair_redop(params.pair_redop_), red_op(params.red_op_) - { - } - - /// Returns true if source is needed - CUTLASS_HOST_DEVICE - bool is_source_needed() const - { - // we use for making sure C matrix is used for A mat norm. - return true; - } - - /// Functionally required for serial reduction in the epilogue - CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition, int k_partition_count) {} - - /// Applies the operation when is_source_needed() is true - CUTLASS_HOST_DEVICE - void operator()(FragmentT& frag_T, - FragmentAccumulator const& AB, - FragmentC const& frag_C, - FragmentCompute const& V) const - { - FragmentCompute tmp_Accum = - NumericArrayConverter()(AB); - FragmentCompute tmp_C = - NumericArrayConverter()(frag_C); - FragmentCompute result_Z; - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kElementsPerAccess; ++i) { - ElementCompute res_Z = elementwise_op(tmp_C[i], V[i], tmp_Accum[i]); - frag_T[i] = res_Z; - } - } - - /// Applies the operation when is_source_needed() is false - CUTLASS_HOST_DEVICE - void operator()(FragmentZ& frag_Z, - FragmentT& frag_T, - FragmentAccumulator const& AB, - FragmentCompute const& V) const - { - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace thread -} // namespace epilogue -} // namespace cutlass - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/include/cuvs/distance/detail/fused_distance_nn/gemm.h b/cpp/include/cuvs/distance/detail/fused_distance_nn/gemm.h deleted file mode 100644 index fd5956a57..000000000 --- a/cpp/include/cuvs/distance/detail/fused_distance_nn/gemm.h +++ /dev/null @@ -1,410 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include - -#include -#include - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace gemm { -namespace kernel { - -///////////////////////////////////////////////////////////////////////////////////////////////// -/* - * This configuration is used for float inputs with veclen(kAlignmentA/B) = 2 or 4, - * ideal threadblock tile shape is 32x256x16 for such cases as there is no - * registers spills for it. - * - */ -template < - /// Element type for A matrix operand - typename ElementA_, - /// Layout type for A matrix operand - int kAlignmentA, - /// Element type for B matrix operand - typename ElementB_, - /// Layout type for B matrix operand - int kAlignmentB, - /// Element type for C and D matrix operands - typename ElementC_, - /// Element type for internal accumulation - typename ElementAccumulator, - /// Epilogue output operator - must satisfy concept of 'EpilogueWithBroadcastOp' - typename EpilogueOutputOp, - /// Number of stages used in the pipelined mainloop - int Stages, - /// data layout row/column major of inputs - bool isRowMajor> -struct FusedDistanceNNGemm { - // This struct is specialized for fp32/3xTF32 - - /// Threadblock-level tile size (concept: GemmShape) - // <- threadblock tile M = 32, N = 256, K = 16 - // this is more performant but note that for veclen = 1 - // this shape has register spills - using ThreadblockShape = cutlass::gemm::GemmShape<32, 256, 16>; - - // <- threadblock tile M = 32, N = 128, K = 16 - // this shape has high occupancy but less perf - // this is less performant but this shape has *no* register spills - // for any veclens(1, 2, 4) - // using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - - /// Warp-level tile size (concept: GemmShape) - // This code section describes tile size a warp will compute - // <- warp tile M = 64, N = 64, K = 16 - // this is more performant for veclen 2,4. - using WarpShape = cutlass::gemm::GemmShape<32, 64, 16>; - - // this shape has high occupancy but less perf used for 32x128x16 - // using WarpShape = cutlass::gemm::GemmShape<32, 32, 16>; - - /// Warp-level tile size (concept: GemmShape) - // This code section describes the size of MMA op - // <- MMA Op tile M = 16, N = 8, K = 4 - using InstructionShape = cutlass::gemm::GemmShape<16, 8, 4>; - - /// Operation performed by GEMM - using Operator = cutlass::arch::OpMultiplyAddFastF32; - // using Operator = cutlass::arch::OpMultiplyAdd; // this runs only 1xTF32 for float inputs - - // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU - // SM - using OperatorClass = cutlass::arch::OpClassTensorOp; - - // This code section describes CUDA SM architecture number - using ArchTag = cutlass::arch::Sm80; - - // This code section describes how threadblocks are scheduled on GPU - /// Threadblock-level swizzling operator - using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; - - /// data layout for final output matrix. - // we keep this same layout even for column major inputs - using LayoutOutput = cutlass::layout::RowMajor; - - typedef typename std::conditional::type NormXLayout; - - typedef typename std:: - conditional::type LayoutA_; - - typedef typename std:: - conditional::type LayoutB_; - - using GemmBase = typename DefaultGemmUniversal::GemmKernel; - - // Replace epilogue - using Epilogue = typename cutlass::epilogue::threadblock::FusedDistanceNNEpilogue< - typename GemmBase::Epilogue::Shape, - typename GemmBase::Epilogue::WarpMmaOperator, - GemmBase::Epilogue::kPartitionsK, - ElementAccumulator, - typename EpilogueOutputOp::ElementT, - ElementAccumulator, - EpilogueOutputOp, - NormXLayout, - GemmBase::Epilogue::kElementsPerAccess>::Epilogue; - - // Compose the GEMM kernel - using GemmKernel = FusedDistanceNNPersistent; -}; - -/* - * This configuration is used for float inputs with veclen(kAlignmentA/B) = 1, - * ideal threadblock tile shape is 32x128x16 for such cases as there is no - * registers spills for it. - * - */ -template < - /// Element type for C and D matrix operands - typename ElementC_, - /// Element type for internal accumulation - typename ElementAccumulator, - /// Epilogue output operator - must satisfy concept of 'EpilogueWithBroadcastOp' - typename EpilogueOutputOp, - /// Number of stages used in the pipelined mainloop - int Stages, - /// data layout row/column major of inputs - bool isRowMajor> -struct FusedDistanceNNGemm { - // This struct is specialized for fp32/3xTF32 - using ElementA_ = float; - using ElementB_ = float; - - /// Threadblock-level tile size (concept: GemmShape) - // <- threadblock tile M = 32, N = 128, K = 16 - // this shape has high occupancy and no register spills for veclen = 1. - using ThreadblockShape = cutlass::gemm::GemmShape<32, 128, 16>; - - /// Warp-level tile size (concept: GemmShape) - // This code section describes tile size a warp will compute - // <- warp tile M = 32, N = 32, K = 16 - using WarpShape = cutlass::gemm::GemmShape<32, 32, 16>; - - /// Warp-level tile size (concept: GemmShape) - // This code section describes the size of MMA op - // <- MMA Op tile M = 16, N = 8, K = 4 - using InstructionShape = cutlass::gemm::GemmShape<16, 8, 4>; - - /// Operation performed by GEMM - using Operator = cutlass::arch::OpMultiplyAddFastF32; - // using Operator = cutlass::arch::OpMultiplyAdd; // this runs only 1xTF32 for float inputs - - // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU - // SM - using OperatorClass = cutlass::arch::OpClassTensorOp; - - // This code section describes CUDA SM architecture number - using ArchTag = cutlass::arch::Sm80; - - // This code section describes how threadblocks are scheduled on GPU - /// Threadblock-level swizzling operator - using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; - - /// data layout for final output matrix. - // we keep this same layout even for column major inputs - using LayoutOutput = cutlass::layout::RowMajor; - - typedef typename std::conditional::type NormXLayout; - - typedef typename std:: - conditional::type LayoutA_; - - typedef typename std:: - conditional::type LayoutB_; - - using GemmBase = typename DefaultGemmUniversal::GemmKernel; - - // Replace epilogue - using Epilogue = typename cutlass::epilogue::threadblock::FusedDistanceNNEpilogue< - typename GemmBase::Epilogue::Shape, - typename GemmBase::Epilogue::WarpMmaOperator, - GemmBase::Epilogue::kPartitionsK, - ElementAccumulator, - typename EpilogueOutputOp::ElementT, - ElementAccumulator, - EpilogueOutputOp, - NormXLayout, - GemmBase::Epilogue::kElementsPerAccess>::Epilogue; - - // Compose the GEMM kernel - using GemmKernel = FusedDistanceNNPersistent; -}; - -template < - /// Layout type for A matrix operand - int kAlignmentA, - /// Layout type for B matrix operand - int kAlignmentB, - /// Element type for C and D matrix operands - typename ElementC_, - /// Element type for internal accumulation - typename ElementAccumulator, - /// Epilogue output operator - must satisfy concept of 'EpilogueWithBroadcastOp' - typename EpilogueOutputOp, - /// Number of stages used in the pipelined mainloop - int Stages, - /// data layout row/column major of inputs - bool isRowMajor> -struct FusedDistanceNNGemm { - // Threadblock-level tile size (concept: GemmShape) - // <- threadblock tile M = 64, N = 64, K = 16 - using ThreadblockShape = cutlass::gemm::GemmShape<64, 64, 16>; - // using ThreadblockShape = cutlass::gemm::GemmShape<16, 128, 16>; - /// Warp-level tile size (concept: GemmShape) - // This code section describes tile size a warp will compute - // <- warp tile M = 32, N = 32, K = 16 - using WarpShape = cutlass::gemm::GemmShape<32, 32, 16>; - // using WarpShape = cutlass::gemm::GemmShape<16, 32, 16>; - /// Warp-level tile size (concept: GemmShape) - // This code section describes the size of MMA op - using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; - - // Operation performed by GEMM - using Operator = cutlass::arch::OpMultiplyAdd; - // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU - // SM - using OperatorClass = cutlass::arch::OpClassTensorOp; - - // This code section describes CUDA SM architecture number - using ArchTag = cutlass::arch::Sm80; - - // This code section describes how threadblocks are scheduled on GPU - /// Threadblock-level swizzling operator - using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; - - /// data layout for final output matrix. - // we keep this same layout even for column major inputs - using LayoutOutput = cutlass::layout::RowMajor; - - typedef typename std::conditional::type NormXLayout; - - typedef typename std:: - conditional::type LayoutA_; - - typedef typename std:: - conditional::type LayoutB_; - - using GemmBase = typename DefaultGemmUniversal::GemmKernel; - - // Replace epilogue - using Epilogue = typename cutlass::epilogue::threadblock::FusedDistanceNNEpilogue< - typename GemmBase::Epilogue::Shape, - typename GemmBase::Epilogue::WarpMmaOperator, - GemmBase::Epilogue::kPartitionsK, - ElementC_, - typename EpilogueOutputOp::ElementT, - ElementC_, - EpilogueOutputOp, - NormXLayout, - GemmBase::Epilogue::kElementsPerAccess>::Epilogue; - - // Compose the GEMM kernel - using GemmKernel = FusedDistanceNNPersistent; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace kernel -} // namespace gemm -} // namespace cutlass \ No newline at end of file diff --git a/cpp/include/cuvs/distance/detail/fused_distance_nn/persistent_gemm.h b/cpp/include/cuvs/distance/detail/fused_distance_nn/persistent_gemm.h deleted file mode 100644 index 3a8d6c865..000000000 --- a/cpp/include/cuvs/distance/detail/fused_distance_nn/persistent_gemm.h +++ /dev/null @@ -1,515 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file - \brief Problem visitor for grouped GEMMs -This file contains heavily customized version of GemmGrouped from CUTLASS 2.10.0 -(https://github.com/NVIDIA/cutlass/blob/v2.10.0/include/cutlass/gemm/kernel/gemm_grouped.h) - -Changes: -- adds support for only single problem size to be launched persistently - where each threablock processes more than one tile of the same problem. -*/ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace gemm { -namespace kernel { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template -struct FusedDistanceNNPersistent { - public: - using Mma = Mma_; - using Epilogue = Epilogue_; - using EpilogueOutputOp = typename Epilogue::OutputOp; - using ThreadblockSwizzle = ThreadblockSwizzle_; - static GroupScheduleMode const kGroupScheduleMode = GroupScheduleMode_; - static bool const kTransposed = Transposed; - - // Optional transpose - using MapArguments = kernel::detail::MapArguments; - - // Public-facing type definitions related to operand element type, layout, and complex conjugate - // operation. Must interact with the 'kTransposed' notion. - using ElementA = typename MapArguments::ElementA; - using LayoutA = typename MapArguments::LayoutA; - using ElementB = typename MapArguments::ElementB; - using LayoutB = typename MapArguments::LayoutB; - using ElementC = typename Epilogue::OutputTileIterator::Element; - using LayoutC = typename MapArguments::LayoutC; - - static ComplexTransform const kTransformA = MapArguments::kTransformA; - static ComplexTransform const kTransformB = MapArguments::kTransformB; - - // Type definitions about the mainloop. - using Operator = typename Mma::Operator; - using OperatorClass = typename Mma::Operator::OperatorClass; - using ThreadblockShape = typename Mma::Shape; - using WarpShape = typename Mma::Operator::Shape; - using InstructionShape = typename Mma::Policy::Operator::InstructionShape; - using ArchTag = typename Mma::ArchTag; - - static int const kStages = Mma::kStages; - static int const kAlignmentA = MapArguments::kAlignmentA; - static int const kAlignmentB = MapArguments::kAlignmentB; - static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; - - /// Warp count (concept: GemmShape) - using WarpCount = typename Mma::WarpCount; - static int const kThreadCount = 32 * WarpCount::kCount; - - using ProblemVisitor = GemmGroupedProblemVisitor; - - // - // Structures - // - - struct temp_problem_visitor { - int problem_count; - - CUTLASS_HOST_DEVICE temp_problem_visitor() : problem_count(0){}; - CUTLASS_HOST_DEVICE temp_problem_visitor(int problem_count_) : problem_count(problem_count_){}; - }; - - /// Argument structure - struct Arguments { - // - // Data members - // - GemmCoord problem_sizes; - temp_problem_visitor problem_visitor; - int problem_count; - int threadblock_count; - - typename EpilogueOutputOp::Params output_op; - - void const* ptr_A; - void const* ptr_B; - void const* ptr_C; - void* ptr_Vector; - void* ptr_Tensor; - - typename LayoutA::Stride::Index lda; - typename LayoutB::Stride::Index ldb; - typename LayoutC::Stride::Index ldc; - typename LayoutC::Stride::Index ldt; - - // Only used by device-level operator - GemmCoord* host_problem_sizes; - - // - // Methods - // - - /// Default ctor - CUTLASS_HOST_DEVICE - Arguments() - : // problem_count(0), - threadblock_count(0), - ptr_A(nullptr), - ptr_B(nullptr), - ptr_C(nullptr), - ptr_Vector(nullptr), - ptr_Tensor(nullptr), - lda(0), - ldb(0), - ldc(0), - ldt(0), - host_problem_sizes(nullptr) - { - } - - /// Ctor - CUTLASS_HOST_DEVICE - Arguments(GemmCoord problem_sizes, - int problem_count, - int threadblock_count, - typename EpilogueOutputOp::Params output_op, - void const* ptr_A, - void const* ptr_B, - void const* ptr_C, - void* ptr_Vector, - void* ptr_Tensor, - typename LayoutA::Stride::Index lda, - typename LayoutB::Stride::Index ldb, - typename LayoutC::Stride::Index ldc, - typename LayoutC::Stride::Index ldt, - GemmCoord* host_problem_sizes = nullptr) - : problem_sizes(problem_sizes), - threadblock_count(threadblock_count), - output_op(output_op), - ptr_A(ptr_A), - ptr_B(ptr_B), - ptr_C(ptr_C), - ptr_Vector(ptr_Vector), - ptr_Tensor(ptr_Tensor), - lda(lda), - ldb(ldb), - ldc(ldc), - ldt(ldt), - host_problem_sizes(host_problem_sizes) - { - problem_visitor.problem_count = problem_count; - } - }; - - // - // Structure for precomputing values in host memory and passing to kernels - // - - /// Parameters structure - struct Params { - // typename ProblemVisitor::Params problem_visitor; - temp_problem_visitor problem_visitor; - int threadblock_count; - - typename Mma::IteratorA::Params params_A; - typename Mma::IteratorB::Params params_B; - typename Epilogue::OutputTileIterator::Params params_C; - typename Epilogue::TensorTileIterator::Params params_Tensor; - - typename EpilogueOutputOp::Params output_op; - - void* ptr_A; - void* ptr_B; - void* ptr_C; - void* ptr_Vector; - void* ptr_Tensor; - - GemmCoord problem_size; - typename LayoutA::Stride::Index lda; - typename LayoutB::Stride::Index ldb; - typename LayoutC::Stride::Index ldc; - typename LayoutC::Stride::Index ldt; - - // - // Methods - // - - CUTLASS_HOST_DEVICE - Params() - : params_A(0), - params_B(0), - params_C(0), - ptr_A(nullptr), - ptr_B(nullptr), - ptr_C(nullptr), - ptr_Vector(nullptr), - ptr_Tensor(nullptr), - lda(0), - ldb(0), - ldc(0), - ldt(0) - { - } - - CUTLASS_HOST_DEVICE - Params(Arguments const& args, void* workspace = nullptr, int tile_count = 0) - : problem_size(args.problem_sizes), - threadblock_count(args.threadblock_count), - output_op(args.output_op), - params_A(args.lda), - params_B(args.ldb), - params_C(args.ldc), - // Here we pass additional user args via args.output_op - // to the reduction output tile iterator - params_Tensor(args.ldt, args.output_op), - ptr_A(const_cast(args.ptr_A)), - ptr_B(const_cast(args.ptr_B)), - ptr_C(const_cast(args.ptr_C)), - ptr_Vector(args.ptr_Vector), - ptr_Tensor(args.ptr_Tensor), - lda(args.lda), - ldb(args.ldb), - ldc(args.ldc), - ldt(args.ldt) - { - problem_visitor.problem_count = args.problem_visitor.problem_count; - } - - CUTLASS_HOST_DEVICE - void update(Arguments const& args, void* workspace = nullptr, int tile_count = 0) - { - threadblock_count = args.threadblock_count; - output_op = args.output_op; - ptr_A = const_cast(args.ptr_A); - ptr_B = const_cast(args.ptr_B); - ptr_C = const_cast(args.ptr_C); - ptr_Vector = args.ptr_Vector; - ptr_Tensor = args.ptr_Tensor; - lda = args.lda; - ldb = args.ldb; - ldc = args.ldc; - ldt = args.ldt; - - problem_size = args.problem_sizes; - } - }; - - /// Shared memory storage structure - struct SharedStorage { - union { - typename Mma::SharedStorage main_loop; - typename Epilogue::SharedStorage epilogue; - } kernel; - - typename Epilogue::TensorTileIterator::SharedStorage reduced_store; - typename Epilogue::OutputTileIterator::SharedStorage rownorm_store; - }; - - public: - // - // Methods - // - - CUTLASS_DEVICE - FusedDistanceNNPersistent() {} - - /// Determines whether kernel satisfies alignment - static Status can_implement(cutlass::gemm::GemmCoord const& problem_size) - { - return Status::kSuccess; - } - - static Status can_implement(Arguments const& args) { return Status::kSuccess; } - - static size_t get_extra_workspace_size(Arguments const& args, - cutlass::gemm::GemmCoord const& grid_tiled_shape) - { - return 0; - } - - CUTLASS_DEVICE - static uint32_t tile_count(const cutlass::MatrixCoord& grid) - { - return grid.row() * grid.column(); - } - - /// Get the grid shape - CUTLASS_DEVICE - static cutlass::MatrixCoord grid_shape(const cutlass::gemm::GemmCoord& problem) - { - return cutlass::MatrixCoord(((problem.m() - 1 + ThreadblockShape::kM) / ThreadblockShape::kM), - ((problem.n() - 1 + ThreadblockShape::kN) / ThreadblockShape::kN)); - } - - /// Executes one GEMM - CUTLASS_DEVICE - void operator()(Params const& params, SharedStorage& shared_storage) - { -#if __CUDA_ARCH__ >= 800 - // - // These types shadow the type-level definitions and support the ability to implement - // a 'transposed' GEMM that computes the transposed problems. - // - using ElementA = typename Mma::IteratorA::Element; - using LayoutA = typename Mma::IteratorA::Layout; - using ElementB = typename Mma::IteratorB::Element; - using LayoutB = typename Mma::IteratorB::Layout; - using ElementC = typename Epilogue::OutputTileIterator::Element; - using LayoutC = typename Epilogue::OutputTileIterator::Layout; - - const GemmCoord& problem_size = params.problem_size; - const auto grid_shape_ = grid_shape(problem_size); - const uint32_t problem_chunk = (tile_count(grid_shape_) - 1 + gridDim.x) / gridDim.x; - const uint32_t problem_chunk_end = blockIdx.x * problem_chunk + problem_chunk; - typename LayoutB::Index column = - ((blockIdx.x * problem_chunk) % grid_shape_.column()) * Mma::Shape::kN; - - typename LayoutB::Index row = - ((blockIdx.x * problem_chunk) / grid_shape_.column()) * Mma::Shape::kM; - if (column) { - shared_storage.reduced_store.initSmem(params.output_op); - shared_storage.rownorm_store.initSmem(params.ptr_C, problem_size.m(), row, sizeof(ElementC)); - } - - // Outer 'persistent' loop to iterate over tiles - for (uint32_t tile_idx = blockIdx.x * problem_chunk; tile_idx < problem_chunk_end; tile_idx++) { - const auto grid_shape_ = grid_shape(problem_size); - cutlass::MatrixCoord threadblock_offset( - int(tile_idx / grid_shape_.column()) * Mma::Shape::kM, - int(tile_idx % grid_shape_.column()) * Mma::Shape::kN); - - const bool isNextTile = ((tile_idx + 1) < problem_chunk_end); - const bool doesRowChange = - ((threadblock_offset.column() + Mma::Shape::kN) >= problem_size.n()); - const bool do_gmem_reduce = (doesRowChange || !isNextTile) ? true : false; - - ElementA* ptr_A = static_cast(params.ptr_A); - ElementB* ptr_B = static_cast(params.ptr_B); - - // Compute initial location in logical coordinates - cutlass::MatrixCoord tb_offset_A{threadblock_offset.row(), 0}; - cutlass::MatrixCoord tb_offset_B{0, threadblock_offset.column()}; - - // Compute position within threadblock - int thread_idx = threadIdx.x; - - // Construct iterators to A and B operands - typename Mma::IteratorA iterator_A( - params.params_A, ptr_A, {problem_size.m(), problem_size.k()}, thread_idx, tb_offset_A); - - typename Mma::IteratorB iterator_B( - params.params_B, ptr_B, {problem_size.k(), problem_size.n()}, thread_idx, tb_offset_B); - - // Broadcast the warp_id computed by lane 0 to ensure dependent code - // is compiled as warp-uniform. - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); - - int lane_idx = threadIdx.x % 32; - - // - // Matrix multiply phase - // - - // Construct thread-scoped matrix multiply - Mma mma(shared_storage.kernel.main_loop, thread_idx, warp_idx, lane_idx); - - typename Mma::FragmentC accumulators; - - accumulators.clear(); - // Compute threadblock-scoped matrix multiply-add - int gemm_k_iterations = (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; - - // Wait for all threads to finish their epilogue phases from the previous tile. - //__syncthreads(); - - // Compute threadblock-scoped matrix multiply-add - mma(gemm_k_iterations, accumulators, iterator_A, iterator_B, accumulators); - - // - // Epilogue - // - - EpilogueOutputOp output_op(params.output_op); - - ElementC* ptr_C = static_cast(params.ptr_C); - typename Epilogue::ElementTensor* ptr_Tensor = - static_cast(params.ptr_Tensor); - - // Define the reduction output pointer and move to the appropriate place - typename Epilogue::ElementVector* ptr_Vector = - static_cast(params.ptr_Vector); - - // Tile iterator loading from source tensor. - typename Epilogue::OutputTileIterator iterator_rownorm(shared_storage.rownorm_store, - params.params_C, - ptr_C, - problem_size.mn(), - thread_idx, - threadblock_offset); - - // Additional tensor to load from - typename Epilogue::TensorTileIterator tensor_iterator(shared_storage.reduced_store, - params.params_Tensor, - // Only the final block outputs Tensor - ptr_Tensor, - problem_size.mn(), - thread_idx, - do_gmem_reduce, - threadblock_offset); - - Epilogue epilogue(shared_storage.kernel.epilogue, thread_idx, warp_idx, lane_idx); - - // Execute the epilogue operator to update the destination tensor. - // Move to appropriate location for this output tile - if (ptr_Vector) { ptr_Vector += threadblock_offset.column(); } - - // Execute the epilogue operator to update the destination tensor. - epilogue(output_op, - ptr_Vector, - // iterator_D, - accumulators, - iterator_rownorm, - tensor_iterator, - problem_size.mn(), - threadblock_offset); - } -#endif - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace kernel -} // namespace gemm -} // namespace cutlass - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/include/cuvs/distance/detail/fused_distance_nn/predicated_tile_iterator_normvec_smem.h b/cpp/include/cuvs/distance/detail/fused_distance_nn/predicated_tile_iterator_normvec_smem.h deleted file mode 100644 index 14c09f6ae..000000000 --- a/cpp/include/cuvs/distance/detail/fused_distance_nn/predicated_tile_iterator_normvec_smem.h +++ /dev/null @@ -1,448 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file - \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. - -This file contains a customized version of PredicatedTileIterator from CUTLASS 2.9.0 -(https://github.com/NVIDIA/cutlass/blob/v2.9.0/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h#L75) - -Changes: -- added `Layout_` template param -- Only the row index is used to load the data in load_with_byte_offset(). - This way the same normalization data is used across all columns in a row. - -*/ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -//////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { - -//////////////////////////////////////////////////////////////////////////////// - -namespace epilogue { -namespace threadblock { - -//////////////////////////////////////////////////////////////////////////////// - -/// Tile iterator used to load and store output tile from global memory in epilogue. -/// -/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator -/// -template -class PredicatedTileIteratorNormVecSmem { - public: - using ThreadMap = ThreadMap_; - using Shape = typename ThreadMap::Shape; - - using Element = Element_; - - using Layout = Layout_; - using TensorRef = TensorRef; - using ConstTensorRef = typename TensorRef::ConstTensorRef; - - using Index = typename Layout::Index; - using LongIndex = typename Layout::LongIndex; - using TensorCoord = MatrixCoord; - - static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; - static int const kThreads = ThreadMap::kThreads; - static int const kIterations = ThreadMap::Count::kTile; - - static int const total_rows = ThreadMap::kWarpCount * ThreadMap::Iterations::kRow * - ThreadMap::Iterations::kGroup * ThreadMap::Iterations::kCluster * - ThreadMap::Count::kTile * ThreadMap::Delta::kRow; - - static_assert(ThreadMap::Iterations::kRow > 0, "ThreadMap::Iterations::kRow must be > 0"); - static_assert(ThreadMap::Iterations::kGroup > 0, "ThreadMap::Iterations::kGroup must be > 0"); - static_assert(ThreadMap::Iterations::kCluster > 0, "ThreadMap::Iterations::kCluster must be > 0"); - static_assert(ThreadMap::Iterations::kColumn > 0, "ThreadMap::Iterations::kColumn must be > 0"); - - using Fragment = Array; - - /// Memory access size - using AccessType = AlignedArray; - - // - // Parameters struct - // - - /// Uses a non-template class - struct Params : PredicatedTileIteratorParams { - using Base = PredicatedTileIteratorParams; - - CUTLASS_HOST_DEVICE - Params() {} - - CUTLASS_HOST_DEVICE - Params(Layout const& layout) - : PredicatedTileIteratorParams( - layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess, - make_OutputTileThreadMapDesc()) - { - } - - CUTLASS_HOST_DEVICE - Params(Base const& base) : Base(base) {} - }; - - /// Mask object - struct Mask { - static int const kCount = ThreadMap::Iterations::kColumn; - - /// Predicate state - bool predicates[kCount]; - - // - // Mask - // - CUTLASS_HOST_DEVICE - Mask() { enable(); } - - ///< Efficiently disables all accesses guarded by mask - CUTLASS_HOST_DEVICE void clear() - { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kCount; ++i) { - predicates[i] = false; - } - } - - ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask - CUTLASS_DEVICE void enable() - { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kCount; ++i) { - predicates[i] = true; - } - } - }; - - /// Shared storage allocation needed by the predicated tile - // iterator for storing rowNorm chunk. - struct SharedStorage { - // - // Type definitions - // - using Shape = MatrixShape; - - /// Shape of the shared memory allocation - using StorageShape = MatrixShape; - - // - // Data members - // - // Methods - // - AlignedBuffer storage; - - CUTLASS_DEVICE - Element* data() { return storage.data(); } - - SharedStorage() {} - - CUTLASS_DEVICE - void initSmem(void* pointer, - const Index& num_rows, - const Index& tb_row_offset, - const LongIndex& stride) - { - Element* shared_elem_arr = data(); - uint8_t* first_tile_byte_pointer_ = - reinterpret_cast(pointer) + LongIndex(tb_row_offset) * LongIndex(stride); - const auto gmem_ptr = reinterpret_cast(first_tile_byte_pointer_); - - for (int row = threadIdx.x; row < total_rows; row += blockDim.x) { - bool guard = (tb_row_offset + row) < num_rows; - cutlass::arch::cp_async(shared_elem_arr + row, gmem_ptr + row, guard); - cutlass::arch::cp_async_wait<0>(); - } - } - }; - - private: - // - // Data members - // - - /// Parameters structure containing reference and precomputed state. - PredicatedTileIteratorParams params_; - - /// Byte-level pointer - uint8_t* byte_pointer_; - - /// Array of boolean values to contain steady-state predicates - Mask mask_; - - /// Extent of the matrix tile in rows - Index extent_row_; - - /// Extent of the matrix tile in rows - Index extent_column_; - - /// A thread's starting row position (assuming steady-state predicates have been computed) - Index thread_start_row_; - - /// A thread's starting column - Index thread_start_column_; - - /// Internal state counter - int state_[3]; - - /// Scatter indices - int const* indices_; - - // - // Static asserts about internal strides - // - - static_assert(sizeof(extent_row_) == 4, "Expected 32b extents"); - static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents"); - static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides"); - - private: - // - // Methods - // - - protected: - SharedStorage& shared_storage_; - - public: - // - // Methods - // - - /// Constructor - CUTLASS_DEVICE - PredicatedTileIteratorNormVecSmem(SharedStorage& shared_storage, - PredicatedTileIteratorParams const& params, - Element* pointer, - TensorCoord extent, - int thread_idx, - TensorCoord& threadblock_offset, - int const* indices = nullptr) - : params_(params), indices_(indices), shared_storage_(shared_storage) - { - TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset; - - extent_row_ = extent.row(); - extent_column_ = extent.column(); - - thread_start_row_ = thread_offset.row(); - thread_start_column_ = thread_offset.column(); - - // Initialize predicates - CUTLASS_PRAGMA_UNROLL - for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) { - mask_.predicates[c] = - ((thread_offset.column() + ThreadMap::Delta::kColumn * c) < extent.column()); - } - - // Null pointer performs no accesses - if (!pointer) { - mask_.clear(); - return; - } - - if (ScatterD && !indices) { mask_.clear(); } - - // Initialize pointer - byte_pointer_ = reinterpret_cast(pointer) + - LongIndex(thread_offset.row()) * LongIndex(params_.stride); - - if (ScatterD) { - byte_pointer_ = reinterpret_cast(pointer) + - LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess; - } - - if (threadblock_offset.column() == 0) { - shared_storage_.initSmem(pointer, extent_row_, threadblock_offset.row(), params_.stride); - } - - // Initialize internal state counter - state_[0] = state_[1] = state_[2] = 0; - } - - /// Adds a pointer offset in units of Element - CUTLASS_HOST_DEVICE - void add_pointer_offset(LongIndex pointer_offset) - { - byte_pointer_ += pointer_offset * sizeof_bits::value / 8; - } - - /// Loads a fragment from memory - CUTLASS_DEVICE - void load_with_byte_offset(Fragment& frag, int64_t byte_offset) const - { - AccessType* frag_ptr = reinterpret_cast(&frag); - - Element* shared_elem_arr = shared_storage_.data(); - - CUTLASS_PRAGMA_UNROLL - for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { - CUTLASS_PRAGMA_UNROLL - for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { - CUTLASS_PRAGMA_UNROLL - for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = - (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - - int row_offset = row * ThreadMap::Delta::kRow + group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; - int iter_row = ((row_offset + thread_start_row_) % total_rows); - Element val = shared_elem_arr[iter_row]; - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kElementsPerAccess; ++i) { - (*frag_ptr)[frag_row_idx + i] = val; - } - } - } - } - } - - /// Loads a fragment from memory - CUTLASS_DEVICE - void load(Fragment& frag) const { load_with_byte_offset(frag, 0); } - - CUTLASS_DEVICE - MatrixCoord thread_start() const { return MatrixCoord(thread_start_row_, thread_start_column_); } - - /// Need to get the thread start row from the tile iterator - CUTLASS_DEVICE - int32_t thread_start_row() const { return thread_start_row_; } - - /// Need to get the thread start row from the tile iterator - CUTLASS_DEVICE - int32_t thread_start_column() const { return thread_start_column_; } - - /// Extent of the matrix in rows - CUTLASS_DEVICE - Index extent_row() const { return extent_row_; } - - /// Extent of the matrix in columns - CUTLASS_DEVICE - Index extent_column() const { return extent_column_; } - - /// Advances to the next position to load or store - CUTLASS_HOST_DEVICE - PredicatedTileIteratorNormVecSmem& operator++() - { - ++state_[0]; - - if (!ScatterD) { byte_pointer_ += params_.advance_row; } - - thread_start_row_ += ThreadMap::Shape::kRow; - - if (state_[0] == ThreadMap::Count::kRow) { - state_[0] = 0; - ++state_[1]; - byte_pointer_ += params_.advance_group; - - thread_start_row_ += - (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * ThreadMap::Count::kRow; - - if (state_[1] == ThreadMap::Count::kGroup) { - state_[1] = 0; - ++state_[2]; - byte_pointer_ += params_.advance_cluster; - - thread_start_row_ += ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup * - ThreadMap::Count::kRow * ThreadMap::Shape::kRow; - - if (state_[2] == ThreadMap::Count::kCluster) { - state_[2] = 0; - byte_pointer_ += params_.advance_tile; - } - } - } - - return *this; - } - - ///< Efficiently disables all accesses guarded by mask - CUTLASS_DEVICE void clear_mask() { mask_.clear(); } - - ///< Efficiently enables all accesses guarded by mask - CUTLASS_DEVICE void enable_mask() { mask_.enable(); } - - ///< Sets the mask - CUTLASS_DEVICE void get_mask(Mask& mask) const { mask = mask_; } - - ///< Sets the mask - CUTLASS_DEVICE void set_mask(Mask const& mask) { mask_ = mask; } -}; - -/////////////////////////////////////////////////////////////////////////////// - -} // namespace threadblock -} // namespace epilogue -} // namespace cutlass - -//////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/include/cuvs/distance/detail/fused_distance_nn/predicated_tile_iterator_reduced_vec.h b/cpp/include/cuvs/distance/detail/fused_distance_nn/predicated_tile_iterator_reduced_vec.h deleted file mode 100644 index dc224c5c9..000000000 --- a/cpp/include/cuvs/distance/detail/fused_distance_nn/predicated_tile_iterator_reduced_vec.h +++ /dev/null @@ -1,626 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file - \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. - -This file contains a customized version of PredicatedTileIterator from CUTLASS 2.9.0 -(https://github.com/NVIDIA/cutlass/blob/v2.9.0/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h#L75) - -Changes: -- added `Layout_` template param -- PredicatedTileIteratorParams() is customized to not stride by layout.stride(0). -- makes use of `SharedStorage` to store reduced values across warps to gmem in coalesced manner. -- customized the store_with_byte_offset() to perform reduction per row and write final value to -gmem. -- customized the Params() struct to take user inputs from epilogueOp params. - -*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cg = cooperative_groups; - -//////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { - -//////////////////////////////////////////////////////////////////////////////// - -namespace epilogue { -namespace threadblock { - -//////////////////////////////////////////////////////////////////////////////// - -/// Tile iterator used to load and store output tile from global memory in epilogue. -/// -/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator -/// -template -class PredicatedTileIteratorReducedVec { - public: - using ThreadMap = ThreadMap_; - using Shape = typename ThreadMap::Shape; - - using Element = Element_; - - using Layout = Layout_; - using TensorRef = TensorRef; - using ConstTensorRef = typename TensorRef::ConstTensorRef; - - using Index = typename Layout::Index; - using LongIndex = typename Layout::LongIndex; - using TensorCoord = MatrixCoord; - using EpilogueOpParams = EpilogueOpParams_; - using OutIdxT = typename EpilogueOpParams::CGReduceT::IndexT; - using OutValT = typename EpilogueOpParams::CGReduceT::AccTypeT; - - static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; - static int const kThreads = ThreadMap::kThreads; - static int const kIterations = ThreadMap::Count::kTile; - - static_assert(ThreadMap::Iterations::kRow > 0, "ThreadMap::Iterations::kRow must be > 0"); - static_assert(ThreadMap::Iterations::kGroup > 0, "ThreadMap::Iterations::kGroup must be > 0"); - static_assert(ThreadMap::Iterations::kCluster > 0, "ThreadMap::Iterations::kCluster must be > 0"); - static_assert(ThreadMap::Iterations::kColumn > 0, "ThreadMap::Iterations::kColumn must be > 0"); - static_assert(!UseCUDAStore, "UseCUDAStore path is not supported"); - - static int const total_rows = ThreadMap::kWarpCount * ThreadMap::Iterations::kRow * - ThreadMap::Iterations::kGroup * ThreadMap::Iterations::kCluster * - ThreadMap::Count::kTile * ThreadMap::Delta::kRow; - /// Fragment object - using Fragment = - Array; - - // Memory access size - using AccessType = AlignedArray; - using AccessTypeValT = AlignedArray; - - // - // Parameters struct - // - - /// Uses a non-template class - struct Params : PredicatedTileIteratorParams { - using Base = PredicatedTileIteratorParams; - - EpilogueOpParams user_param; - CUTLASS_HOST_DEVICE - Params() {} - - CUTLASS_HOST_DEVICE - Params(Layout const& layout) - : PredicatedTileIteratorParams( - layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess, - make_OutputTileThreadMapDesc()) - { - } - - CUTLASS_HOST_DEVICE - Params(Layout const& layout, EpilogueOpParams const& user_param_) - : PredicatedTileIteratorParams(int(sizeof(AccessType)) / kElementsPerAccess, - make_OutputTileThreadMapDesc()), - user_param(user_param_) - { - } - - CUTLASS_HOST_DEVICE - Params(Base const& base) : Base(base) {} - }; - - /// Mask object - struct Mask { - // static int const kCount = ThreadMap::Iterations::kColumn; - static int const kCount = ThreadMap::Iterations::kColumn * kElementsPerAccess; - - /// Predicate state - bool predicates[kCount]; - - // - // Mask - // - CUTLASS_HOST_DEVICE - Mask() { enable(); } - - ///< Efficiently disables all accesses guarded by mask - CUTLASS_HOST_DEVICE void clear() - { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kCount; ++i) { - predicates[i] = false; - } - } - - ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask - CUTLASS_DEVICE void enable() - { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kCount; ++i) { - predicates[i] = true; - } - } - }; - - /// Shared storage allocation needed by the predicated tile - // iterator for reduction. - struct SharedStorage { - // - // Type definitions - // - using Shape = MatrixShape; - - /// Shape of the shared memory allocation for the reduced values store - using StorageShape = MatrixShape; - - // - // Data members - - // - // Methods - // - AlignedBuffer storage; - - CUTLASS_DEVICE - Element* data() { return storage.data(); } - - SharedStorage() {} - - CUTLASS_DEVICE - void initSmem(EpilogueOpParams const& user_params) - { - Element* shared_elem_arr = data(); - constexpr auto maxVal = std::numeric_limits::max(); - - for (int row = threadIdx.x; row < total_rows; row += blockDim.x) { - user_params.red_op_.init(&shared_elem_arr[row], maxVal); - } - } - }; - - template - struct select_reduce { - /// Performs warp level reduction and stores a reduced output to memory - CUTLASS_DEVICE - select_reduce(OutT value, - ValT prev_red_val, - cg_reduce_op_t reduce_op, - cg_group_t cg_warp_group, - OutT& shmem_ptr) - { - if (cg_warp_group.any(reduce_op.isAmin(value, prev_red_val))) { - OutT reduced_val = cg::reduce(cg_warp_group, value, reduce_op); - if (cg_warp_group.thread_rank() == 0) { shmem_ptr = reduced_val; } - } - } - }; - - template - struct select_reduce> { - using ValT = float; - using Ty = raft::KeyValuePair; - /// Performs warp level reduction of key value pair and stores a reduced output to memory - CUTLASS_DEVICE - select_reduce(Ty val_to_red, - float prev_red_val, - cg_reduce_op_t cg_reduce_op, - cg_group_t cg_warp_group, - Ty& shmem_ptr) - { - ValT val = val_to_red.value; - - if (cg_warp_group.any(cg_reduce_op.isAmin(val, prev_red_val))) { - ValT reduced_val = cg::reduce(cg_warp_group, val, cg_reduce_op); - bool pred = (reduced_val == val); - auto subTile = cg::binary_partition(cg_warp_group, pred); - if (pred) { - if (subTile.thread_rank() == 0) { shmem_ptr = val_to_red; } - } - } - } - }; - - template - struct select_reduce> { - using ValT = double; - using Ty = raft::KeyValuePair; - /// Performs warp level reduction of key value pair and stores a reduced output to memory - CUTLASS_DEVICE - select_reduce(Ty val_to_red, - double prev_red_val, - cg_reduce_op_t cg_reduce_op, - cg_group_t cg_warp_group, - Ty& shmem_ptr) - { - ValT val = val_to_red.value; - - if (cg_warp_group.any(cg_reduce_op.isAmin(val, prev_red_val))) { - ValT reduced_val = cg::reduce(cg_warp_group, val, cg_reduce_op); - bool pred = (reduced_val == val); - auto subTile = cg::binary_partition(cg_warp_group, pred); - if (pred) { - if (subTile.thread_rank() == 0) { shmem_ptr = val_to_red; } - } - } - } - }; - - private: - // - // Data members - // - - /// Parameters structure containing reference and precomputed state. - Params params_; - - /// Byte-level pointer - uint8_t* byte_pointer_; - /// Byte-level pointer first tile offset of this threadblock. - uint8_t* first_tile_byte_pointer_; - - /// Array of boolean values to contain steady-state predicates - Mask mask_; - - /// Extent of the matrix tile in rows - Index extent_row_; - - /// Extent of the matrix tile in rows - Index extent_column_; - - /// A thread's starting row position (assuming steady-state predicates have been computed) - Index thread_start_row_; - Index block_start_row_first_tile_; - - /// A thread's starting column - Index thread_start_column_; - - /// Internal state counter - int state_[3]; - // mutable int shared_tile_id; - - /// Scatter indices - int const* indices_; - - // - // Static asserts about internal strides - // - - static_assert(sizeof(extent_row_) == 4, "Expected 32b extents"); - static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents"); - static_assert(sizeof(Params::stride) == 8, "Expected 64b strides"); - - protected: - SharedStorage& shared_storage_; - const bool& do_gmem_reduction_; - - private: - // - // Methods - // - public: - // - // Methods - // - /// Constructor - CUTLASS_DEVICE - PredicatedTileIteratorReducedVec(SharedStorage& shared_storage, - Params const& params, - Element* pointer, - TensorCoord extent, - int thread_idx, - const bool& do_gmem_reduction, - TensorCoord threadblock_offset = TensorCoord(), - int const* indices = nullptr) - : params_(params), - indices_(indices), - shared_storage_(shared_storage), - do_gmem_reduction_(do_gmem_reduction) - { - TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset; - - extent_row_ = extent.row(); - extent_column_ = extent.column(); - - thread_start_row_ = thread_offset.row(); - thread_start_column_ = thread_offset.column(); - - TensorCoord block_offset = ThreadMap::initial_offset(0) + threadblock_offset; - block_start_row_first_tile_ = block_offset.row(); - - // Initialize predicates - CUTLASS_PRAGMA_UNROLL - for (int c = 0; c < ThreadMap::Iterations::kColumn * kElementsPerAccess; ++c) { - int columnPerAccess = (c / kElementsPerAccess); - int columnWithinPerAccess = c % kElementsPerAccess; - mask_.predicates[c] = ((thread_offset.column() + ThreadMap::Delta::kColumn * columnPerAccess + - columnWithinPerAccess) < extent.column()); - } - - if (threadblock_offset.column() == 0) { - EpilogueOpParams const& user_params = params_.user_param; - shared_storage_.initSmem(user_params); - } - - // Null pointer performs no accesses - if (!pointer) { mask_.clear(); } - - if (ScatterD && !indices) { mask_.clear(); } - - // Initialize pointer - first_tile_byte_pointer_ = reinterpret_cast(pointer) + - LongIndex(block_offset.row()) * LongIndex(params_.stride); - - if (ScatterD) { - byte_pointer_ = reinterpret_cast(pointer) + - LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess; - } - - // Initialize internal state counter - state_[0] = state_[1] = state_[2] = 0; - } - - /// Destructor - CUTLASS_DEVICE - ~PredicatedTileIteratorReducedVec() - { - if (do_gmem_reduction_) { - EpilogueOpParams const& user_params = params_.user_param; - auto gmem_ptr = reinterpret_cast(first_tile_byte_pointer_); - Element* shared_elem_arr = shared_storage_.data(); - const uint32_t mutex_id = (block_start_row_first_tile_ / total_rows); - bool useGmemMutex = (gridDim.x != ((extent_row_ - 1 + total_rows) / total_rows)); - // If this is not optimal grid size perform mutex based gmem reduce. - if (useGmemMutex) { - // single lock per block for multiple rows - if (threadIdx.x == 0 && block_start_row_first_tile_ < extent_row_) { - // acquire mutex lock. - unsigned int ns = 8; - while (atomicCAS(user_params.mutexes_ + mutex_id, 0, 1) == 1) { - __nanosleep(ns); - if (ns < 256) { ns *= 2; } - } - } - } - - __syncthreads(); - for (int row = threadIdx.x; row < total_rows; row += blockDim.x) { - if (block_start_row_first_tile_ + row < extent_row_) { - user_params.red_op_( - block_start_row_first_tile_ + row, &gmem_ptr[row], shared_elem_arr[row]); - } - } - - if (useGmemMutex) { - __threadfence(); - __syncthreads(); - if (threadIdx.x == 0 && block_start_row_first_tile_ < extent_row_) { - // release mutex lock. - atomicExch(user_params.mutexes_ + mutex_id, 0); - } - } - } - } - - /// Adds a pointer offset in units of Element - CUTLASS_HOST_DEVICE - void add_pointer_offset(LongIndex pointer_offset) - { - byte_pointer_ += pointer_offset * sizeof_bits::value / 8; - } - - /// Performs reduction and Stores a reduced output to memory - CUTLASS_DEVICE - void store_with_byte_offset(Fragment& frag, int64_t byte_offset) const - { - AccessTypeValT* frag_ptr = reinterpret_cast(&frag); - - cg::thread_block cta = cg::this_thread_block(); - // tile_width 16 is required if kElementPerAccess > 1 - constexpr int tile_width = (32 / ThreadMap::Delta::kColumn) ? 32 : 16; - cg::thread_block_tile tile32 = cg::tiled_partition(cta); - EpilogueOpParams const& user_params = params_.user_param; - - using cg_reduce_t = decltype(user_params.cg_reduce_op); - using tile32_t = decltype(tile32); - - Element* shared_elem_arr = shared_storage_.data(); - constexpr auto maxVal = std::numeric_limits::max(); - - CUTLASS_PRAGMA_UNROLL - for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { - CUTLASS_PRAGMA_UNROLL - for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { - CUTLASS_PRAGMA_UNROLL - for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = - (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - - int row_offset = row * ThreadMap::Delta::kRow + group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; - - const OutIdxT row_id = row_offset + thread_start_row_; - bool row_guard = (row_id < extent_row_); - - const int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn * kElementsPerAccess; - Element red_val; - user_params.red_op_.init(&red_val, maxVal); - - if (row_guard) { - const int iter_row = (row_id % total_rows); - const auto prev_red_val = user_params.red_op_.get_value(shared_elem_arr[iter_row]); - - CUTLASS_PRAGMA_UNROLL - for (int column = 0; column < ThreadMap::Iterations::kColumn * kElementsPerAccess; - ++column) { - int columnPerAccess = column / kElementsPerAccess; - int columnWithPerAccess = column % kElementsPerAccess; - bool guard = mask_.predicates[column]; - if (guard) { - const OutIdxT key_id = thread_start_column_ + - ThreadMap::Delta::kColumn * columnPerAccess + - columnWithPerAccess; - const int frag_col_idx = frag_idx + column; - - Element this_val; - user_params.red_op_.init(&this_val, (*frag_ptr)[frag_col_idx]); - user_params.red_op_.init_key(this_val, key_id); - user_params.red_op_(row_id, &red_val, this_val); - } - } - // select_reduce doesn't need to use `red_op_` as at the warp level we use cg_reduce_op, - // this satisfies the requirement of mst/single linkage of checking colors buffer. - select_reduce red_obj( - red_val, prev_red_val, user_params.cg_reduce_op, tile32, shared_elem_arr[iter_row]); - } - } - } - } - } - - /// Stores a fragment to memory - CUTLASS_DEVICE - void store(Fragment& frag) const { store_with_byte_offset(frag, 0); } - - CUTLASS_DEVICE - MatrixCoord thread_start() const { return MatrixCoord(thread_start_row_, thread_start_column_); } - - /// Need to get the thread start row from the tile iterator - CUTLASS_DEVICE - int32_t thread_start_row() const { return thread_start_row_; } - - /// Need to get the thread start row from the tile iterator - CUTLASS_DEVICE - int32_t thread_start_column() const { return thread_start_column_; } - - /// Extent of the matrix in rows - CUTLASS_DEVICE - Index extent_row() const { return extent_row_; } - - /// Extent of the matrix in columns - CUTLASS_DEVICE - Index extent_column() const { return extent_column_; } - - /// Advances to the next position to load or store - CUTLASS_HOST_DEVICE - PredicatedTileIteratorReducedVec& operator++() - { - ++state_[0]; - - if (!ScatterD) { byte_pointer_ += params_.advance_row; } - - thread_start_row_ += ThreadMap::Shape::kRow; - - if (state_[0] == ThreadMap::Count::kRow) { - state_[0] = 0; - ++state_[1]; - byte_pointer_ += params_.advance_group; - - thread_start_row_ += - (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * ThreadMap::Count::kRow; - - if (state_[1] == ThreadMap::Count::kGroup) { - state_[1] = 0; - ++state_[2]; - byte_pointer_ += params_.advance_cluster; - - thread_start_row_ += ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup * - ThreadMap::Count::kRow * ThreadMap::Shape::kRow; - - if (state_[2] == ThreadMap::Count::kCluster) { - state_[2] = 0; - byte_pointer_ += params_.advance_tile; - } - } - } - - return *this; - } - - ///< Efficiently disables all accesses guarded by mask - CUTLASS_DEVICE void clear_mask() { mask_.clear(); } - - ///< Efficiently enables all accesses guarded by mask - CUTLASS_DEVICE void enable_mask() { mask_.enable(); } - - ///< Sets the mask - CUTLASS_DEVICE void get_mask(Mask& mask) const { mask = mask_; } - - ///< Sets the mask - CUTLASS_DEVICE void set_mask(Mask const& mask) { mask_ = mask; } -}; - -/////////////////////////////////////////////////////////////////////////////// - -} // namespace threadblock -} // namespace epilogue -} // namespace cutlass - -//////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/include/cuvs/distance/detail/fused_l2_nn.cuh b/cpp/include/cuvs/distance/detail/fused_l2_nn.cuh deleted file mode 100644 index 0c2548863..000000000 --- a/cpp/include/cuvs/distance/detail/fused_l2_nn.cuh +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // size_t -#include // ops::l2_exp_distance_op -#include -#include // PairwiseDistances -#include // std::numeric_limits -#include // raft::KeyValuePair -#include // raft::identity_op -#include // Policy -#include // raft::util::arch::SM_* -#include // raft::ceildiv, raft::shfl - -namespace cuvs { -namespace distance { - -namespace detail { - -template -struct KVPMinReduceImpl { - typedef raft::KeyValuePair KVP; - DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } - DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } - -}; // KVPMinReduce - -template -struct MinAndDistanceReduceOpImpl { - typedef typename raft::KeyValuePair KVP; - DI void operator()(LabelT rid, KVP* out, const KVP& other) const - { - if (other.value < out->value) { - out->key = other.key; - out->value = other.value; - } - } - - DI void operator()(LabelT rid, DataT* out, const KVP& other) const - { - if (other.value < *out) { *out = other.value; } - } - - DI void operator()(LabelT rid, DataT* out, const DataT& other) const - { - if (other < *out) { *out = other; } - } - - DI void init(DataT* out, DataT maxVal) const { *out = maxVal; } - DI void init(KVP* out, DataT maxVal) const { out->value = maxVal; } - - DI void init_key(DataT& out, LabelT idx) const { return; } - DI void init_key(KVP& out, LabelT idx) const { out.key = idx; } - - DI DataT get_value(KVP& out) const - { - return out.value; - ; - } - DI DataT get_value(DataT& out) const { return out; } -}; - -template -struct MinReduceOpImpl { - typedef typename raft::KeyValuePair KVP; - DI void operator()(LabelT rid, DataT* out, const KVP& other) - { - if (other.value < *out) { *out = other.value; } - } - - DI void init(DataT* out, DataT maxVal) { *out = maxVal; } -}; - -template -RAFT_KERNEL initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) -{ - auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x; - if (tid < m) { redOp.init(min + tid, maxVal); } -} - -template -void initialize(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp, cudaStream_t stream) -{ - auto blks = raft::ceildiv(m, 256); - initKernel<<>>(min, m, maxVal, redOp); -} - -// TODO: specialize this function for MinAndDistanceReduceOp -// with atomicCAS of 64 bit which will eliminate mutex and raft::shfls -template -DI void updateReducedVal( - int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, IdxT m, IdxT gridStrideY) -{ - const auto lid = threadIdx.x % raft::WarpSize; - const auto accrowid = threadIdx.x / P::AccThCols; - - // Update each output row in order within a warp. This will resolve hang - // issues with pre-Volta architectures -#pragma unroll - for (int j = 0; j < (raft::WarpSize / P::AccThCols); j++) { - if (lid == j * P::AccThCols) { -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto rid = gridStrideY + accrowid + i * P::AccThRows; - if (rid < m) { - auto value = val[i]; - while (atomicCAS(mutex + rid, 0, 1) == 1) - ; - __threadfence(); - red_op(rid, min + rid, value); - __threadfence(); - atomicCAS(mutex + rid, 1, 0); - } - } - } - } -} - -template -__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL fusedL2NNkernel(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - DataT maxVal, - int* mutex, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - OpT distance_op, - FinalLambda fin_op) -{ -// compile only if below non-ampere arch. -#if __CUDA_ARCH__ < 800 - extern __shared__ char smem[]; - - typedef KeyValuePair KVPair; - KVPair val[P::AccRowsPerTh]; -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - val[i] = {0, maxVal}; - } - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [n, pairRedOp, &val, maxVal] __device__( - DataT acc[P::AccRowsPerTh][P::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - KVPReduceOpT pairRed_op(pairRedOp); - - // intra thread reduce - const auto acccolid = threadIdx.x % P::AccThCols; - const auto accrowid = threadIdx.x / P::AccThCols; -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - auto tmpkey = acccolid + j * P::AccThCols + gridStrideX; - KVPair tmp = {tmpkey, acc[i][j]}; - if (tmpkey < n) { - val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); - } - } - } - }; - - auto rowEpilog_lambda = - [m, mutex, min, pairRedOp, redOp, &val, maxVal] __device__(IdxT gridStrideY) { - KVPReduceOpT pairRed_op(pairRedOp); - ReduceOpT red_op(redOp); - - const auto accrowid = threadIdx.x / P::AccThCols; - const auto lid = raft::laneId(); - - // reduce -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = P::AccThCols / 2; j > 0; j >>= 1) { - // Actually, the srcLane (lid +j) should be (lid +j) % P:AccThCols, - // but the raft::shfl op applies the modulo internally. - auto tmpkey = raft::shfl(val[i].key, lid + j, P::AccThCols); - auto tmpvalue = raft::shfl(val[i].value, lid + j, P::AccThCols); - KVPair tmp = {tmpkey, tmpvalue}; - val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); - } - } - - updateReducedVal(mutex, min, val, red_op, m, gridStrideY); - - // reset the val array. -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - val[i] = {0, maxVal}; - } - }; - - IdxT lda = k, ldb = k, ldd = n; - constexpr bool row_major = true; - constexpr bool write_out = false; - PairwiseDistances - obj(x, - y, - m, - n, - k, - lda, - ldb, - ldd, - xn, - yn, - nullptr, // Output pointer - smem, - distance_op, - epilog_lambda, - fin_op, - rowEpilog_lambda); - obj.run(); -#endif -} - -// cg::reduce functor for FusedDistanceNN used in its cutlass version -// to output the min distance value & key(loc id). -// This is used in fused_distance_nn/predicated_tile_iterator_reduced_vec.h -// store_with_byte_offset() passed to cg::reduce() & select_reduce. -template -struct kvp_cg_min_reduce_op { - typedef typename raft::KeyValuePair KVP; - - __host__ __device__ kvp_cg_min_reduce_op() noexcept {}; - - using AccTypeT = AccType; - using IndexT = Index; - // functor signature. - __host__ __device__ KVP operator()(KVP a, KVP b) const { return a.value < b.value ? a : b; } - - __host__ __device__ AccType operator()(AccType a, AccType b) const { return min(a, b); } - - __host__ __device__ bool isAmin(AccType a, AccType b) const { return a < b ? true : false; } -}; - -template -void fusedL2NNImpl(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - int* workspace, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - bool sqrt, - bool initOutBuffer, - cudaStream_t stream) -{ - // The kernel policy is determined by fusedL2NN. - typedef Policy P; - - dim3 blk(P::Nthreads); - auto nblks = raft::ceildiv(m, P::Nthreads); - constexpr auto maxVal = std::numeric_limits::max(); - typedef raft::KeyValuePair KVPair; - - RAFT_CUDA_TRY(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); - if (initOutBuffer) { - initKernel - <<>>(min, m, maxVal, redOp); - RAFT_CUDA_TRY(cudaGetLastError()); - } - - namespace arch = raft::util::arch; - using AccT = DataT; - ops::l2_exp_distance_op distance_op{sqrt}; - - raft::identity_op fin_op{}; - - auto kernel = fusedL2NNkernel; - - // Get pointer to fp32 SIMT kernel to determine the best compute architecture - // out of all for which the kernel was compiled for that matches closely - // to the current device. Other methods to determine the architecture (that do not - // require a pointer) can be error prone. See: - // https://github.com/NVIDIA/cub/issues/545 - void* kernel_ptr = reinterpret_cast(kernel); - auto runtime_arch = arch::kernel_virtual_arch(kernel_ptr); - auto cutlass_range = arch::SM_range(arch::SM_80(), arch::SM_future()); - - if (cutlass_range.contains(runtime_arch)) { - // If device is SM_80 or later, use CUTLASS-based kernel. - using L2Op = cuvs::distance::detail::ops::l2_exp_cutlass_op; - using kvp_cg_min_reduce_op_ = kvp_cg_min_reduce_op; - kvp_cg_min_reduce_op_ cg_reduce_op; - L2Op L2_dist_op(sqrt); - - IdxT lda, ldb, ldd; - lda = k, ldb = k, ldd = n; - - cutlassFusedDistanceNN(x, - y, - xn, - yn, - m, - n, - k, - lda, - ldb, - ldd, - min, - workspace, - cg_reduce_op, - L2_dist_op, - redOp, - pairRedOp, - stream); - } else { - // If device less than SM_80, use fp32 SIMT kernel. - constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); - dim3 grid = launchConfigGenerator

(m, n, shmemSize, kernel); - - kernel<<>>( - min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, distance_op, fin_op); - RAFT_CUDA_TRY(cudaGetLastError()); - } -} - -} // namespace detail -} // namespace distance -} // namespace cuvs diff --git a/cpp/include/cuvs/distance/detail/kernels/gram_matrix.cuh b/cpp/include/cuvs/distance/detail/kernels/gram_matrix.cuh deleted file mode 100644 index 1f4424ea9..000000000 --- a/cpp/include/cuvs/distance/detail/kernels/gram_matrix.cuh +++ /dev/null @@ -1,489 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -// #include -#include -#include - -#include -#include - -namespace cuvs::distance::kernels::detail { - -template -using dense_input_matrix_view_t = raft::device_matrix_view; -template -using dense_output_matrix_view_t = raft::device_matrix_view; -template -using csr_input_matrix_view_t = raft::device_csr_matrix_view; - -/** - * Base class for general Gram matrices - * A Gram matrix is the Hermitian matrix of inner probucts G_ik = - * Here, the inner product is evaluated for all elements from vectors sets X1, - * and X2. - * - * To be more precise, on exit the output buffer will store: - * - if is_row_major == true: out[j+k*n1] = , - * - if is_row_major == false: out[j*n2 + k] = , - * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector - * from the x2 set. - */ -template -class GramMatrixBase { - protected: - cublasHandle_t cublas_handle; - bool legacy_interface; - - public: - GramMatrixBase() : legacy_interface(false){}; - [[deprecated]] GramMatrixBase(cublasHandle_t cublas_handle) - : cublas_handle(cublas_handle), legacy_interface(true){}; - - virtual ~GramMatrixBase(){}; - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * Vector sets are provided in Matrix format - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void operator()(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } - - // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual - - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - virtual void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } - - /** Evaluate the Gram matrix for two vector sets using simple dot product. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] virtual void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } - - /** Convenience function to evaluate the Gram matrix for two vector sets. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - */ - [[deprecated]] void operator()(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1 = 0, - int ld2 = 0, - int ld_out = 0) - { - ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); - if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } - if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } - if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } - evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } - - protected: - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 - * @param ld2 leading dimension of x2 - * @param ld_out leading dimension of out - */ - [[deprecated]] void linear(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - math_t alpha = 1.0; - math_t beta = 0.0; - if (is_row_major) { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - n2, - n1, - n_cols, - &alpha, - x2, - ld2, - x1, - ld1, - &beta, - out, - ld_out, - stream)); - } else { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_T, - n1, - n2, - n_cols, - &alpha, - x1, - ld1, - x2, - ld2, - &beta, - out, - ld_out, - stream)); - } - } - - protected: - bool get_is_row_major(dense_output_matrix_view_t matrix) - { - return (matrix.stride(1) == 1); - } - - bool get_is_row_major(dense_input_matrix_view_t matrix) - { - return (matrix.stride(1) == 1); - } - - bool get_is_col_major(dense_output_matrix_view_t matrix) - { - return (matrix.stride(0) == 1); - } - - bool get_is_col_major(dense_input_matrix_view_t matrix) - { - return (matrix.stride(0) == 1); - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check is_row_major consistency - bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); - bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); - ASSERT(is_row_major || is_col_major, - "GramMatrix leading dimensions for x1, x2 and out do not match"); - - // check dimensions - int n1 = out.extent(0); - int n2 = out.extent(1); - int n_cols = x1.extent(1); - ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); - - // extract major stride - int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); - int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - - math_t alpha = 1.0; - math_t beta = 0.0; - if (is_row_major) { - // #TODO: Use mdspan-based API when stride-capable - // https://github.com/rapidsai/raft/issues/875 - raft::linalg::gemm(handle, - true, - false, - n2, - n1, - n_cols, - &alpha, - x2.data_handle(), - ld2, - x1.data_handle(), - ld1, - &beta, - out.data_handle(), - ld_out, - resource::get_cuda_stream(handle)); - } else { - // #TODO: Use mdspan-based API when stride-capable - // https://github.com/rapidsai/raft/issues/875 - raft::linalg::gemm(handle, - false, - true, - n1, - n2, - n_cols, - &alpha, - x1.data_handle(), - ld1, - x2.data_handle(), - ld2, - &beta, - out.data_handle(), - ld_out, - resource::get_cuda_stream(handle)); - } - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check is_row_major consistency - bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); - bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); - ASSERT(is_row_major || is_col_major, - "GramMatrix leading dimensions for x2 and out do not match"); - - // check dimensions - auto x1_structure = x1.structure_view(); - ASSERT(x1_structure.get_n_rows() == out.extent(0), - "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.extent(0) == out.extent(1), - "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x2.extent(1) == x1_structure.get_n_cols(), - "GramMatrix input matrix dimensions for x1 and x2 do not match"); - - math_t alpha = 1.0; - math_t beta = 0.0; - - raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); - } - - /** Calculates the Gram matrix using simple dot product between vector sets. - * - * out = x1 * x2 - * - * Can be used as a building block for more complex kernel functions. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - */ - void linear(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check layout consistency (w.r.t. strides a matrix might be both row & col major) - bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1); - bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0); - - ASSERT(is_row_major_nopad || is_col_major_nopad, - "Sparse linear Kernel distance does not support ld_out parameter"); - - // switch a,b based on is_row_major - if (is_col_major_nopad) { - auto out_row_major = raft::make_device_matrix_view( - out.data_handle(), out.extent(1), out.extent(0)); - raft::sparse::distance::pairwise_distance( - handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); - } else { - auto out_row_major = raft::make_device_matrix_view( - out.data_handle(), out.extent(0), out.extent(1)); - raft::sparse::distance::pairwise_distance( - handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); - } - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/include/cuvs/distance/detail/kernels/kernel_factory.cuh b/cpp/include/cuvs/distance/detail/kernels/kernel_factory.cuh deleted file mode 100644 index d0f1f5569..000000000 --- a/cpp/include/cuvs/distance/detail/kernels/kernel_factory.cuh +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "gram_matrix.cuh" -#include "kernel_matrices.cuh" -#include -#include - -namespace cuvs::distance::kernels::detail { - -template -class KernelFactory { - public: - static GramMatrixBase* create(KernelParams params) - { - GramMatrixBase* res; - // KernelParams is not templated, we convert the parameters to math_t here: - math_t coef0 = params.coef0; - math_t gamma = params.gamma; - switch (params.kernel) { - case LINEAR: res = new GramMatrixBase(); break; - case POLYNOMIAL: res = new PolynomialKernel(params.degree, gamma, coef0); break; - case TANH: res = new TanhKernel(gamma, coef0); break; - case RBF: res = new RBFKernel(gamma); break; - default: throw raft::exception("Kernel not implemented"); - } - return res; - } - - [[deprecated]] static GramMatrixBase* create(KernelParams params, cublasHandle_t handle) - { - GramMatrixBase* res; - // KernelParams is not templated, we convert the parameters to math_t here: - math_t coef0 = params.coef0; - math_t gamma = params.gamma; - switch (params.kernel) { - case LINEAR: res = new GramMatrixBase(handle); break; - case POLYNOMIAL: - res = new PolynomialKernel(params.degree, gamma, coef0, handle); - break; - case TANH: res = new TanhKernel(gamma, coef0, handle); break; - case RBF: res = new RBFKernel(gamma, handle); break; - default: throw raft::exception("Kernel not implemented"); - } - return res; - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/include/cuvs/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/cuvs/distance/detail/kernels/kernel_matrices.cuh deleted file mode 100644 index 1f9db896e..000000000 --- a/cpp/include/cuvs/distance/detail/kernels/kernel_matrices.cuh +++ /dev/null @@ -1,777 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "gram_matrix.cuh" -#include - -#include -#include -#include -#include -#include - -namespace cuvs::distance::kernels::detail { - -/** Epiloge function for polynomial kernel without padding. - * Calculates output = (gain*in + offset)^exponent - * @param inout device vector in column major format, size [len] - * @param len array length - * @param exponent - * @param gain - * @param offset - */ -template -RAFT_KERNEL polynomial_kernel_nopad( - math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = pow(gain * inout[tid] + offset, exponent); - } -} - -/** Epiloge function for polynomial kernel with padding. - * Calculates output = (gain*input + offset)^exponent - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param exponent - * @param gain - * @param offset - */ -template -RAFT_KERNEL polynomial_kernel( - math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent); - } -} - -/** Epiloge function for tanh kernel without padding. - * Calculates output = tanh(gain*input + offset) - * @param inout device vector, size [len] - * @param len length of the input vector - * @param gain - * @param offset - */ -template -RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = tanh(gain * inout[tid] + offset); - } -} - -/** Epiloge function for tanh kernel without padding. - * Calculates output = tanh(gain*input + offset) - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param gain - * @param offset - */ -template -RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset); - } -} - -/** Epiloge function for rbf kernel using expansion. - * - * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij)); - * - * Intended usage - * - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk - * - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X - * - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y - * - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param norm_x l2-norm of X's rows - * @param norm_y l2-norm of Y's rows - * @param gain - */ -template -RAFT_KERNEL rbf_kernel_expanded( - math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) { - math_t norm_y_val = norm_y[tidy]; - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = - exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2)); - } - } -} - -namespace { -std::tuple generateLaunchConfig2dElementwiseOp(int n1, int n2) -{ - dim3 block_shape = dim3(32, 4); - const int num_blocks_x = raft::ceildiv(n1, 32); - const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1); - dim3 grid_shape = dim3(num_blocks_x, num_blocks_y); - return std::make_tuple(grid_shape, block_shape); -} -} // namespace - -/** - * Create a kernel matrix using polynomial kernel function. - */ -template -class PolynomialKernel : public GramMatrixBase { - exp_t exponent; - math_t gain; - math_t offset; - - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, exponent, gain, offset); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - polynomial_kernel<<>>( - inout, ld, n1, n2, exponent, gain, offset); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - public: - /** - * Constructs a polynomial kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = (gain* + offset)^exponent - * - * @tparam math_t floating point type - * @tparam exp_t type of exponent - * @param exponent - * @param gain - * @param offset - */ - PolynomialKernel(exp_t exponent, math_t gain, math_t offset) - : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset) - { - } - - [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset) - { - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } -}; - -/** - * Create a kernel matrix using tanh kernel function. - */ -template -class TanhKernel : public GramMatrixBase { - math_t gain, offset; - - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, gain, offset); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - public: - /** - * Constructs a tanh kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = tanh(gain* + offset) - * - * @tparam math_t floating point type - * @param gain - * @param offset - */ - TanhKernel(math_t gain, math_t offset) : GramMatrixBase(), gain(gain), offset(offset) {} - - [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain), offset(offset) - { - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } -}; - -/** - * Create a kernel matrix using RBF kernel function. - */ -template -class RBFKernel : public GramMatrixBase { - math_t gain; - - void applyKernel(math_t* inout, - int ld, - int rows, - int cols, - math_t* norm_x1, - math_t* norm_x2, - bool is_row_major, - cudaStream_t stream) - { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; - math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - rbf_kernel_expanded<<>>( - inout, ld, n1, n2, norm_n1, norm_n2, gain); - } - - public: - /** - * Constructs a RBF kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = exp(-gain*|x1_i- x2_k|^2) - * - * @tparam math_t floating point type - * @param gain - */ - RBFKernel(math_t gain) : GramMatrixBase(), gain(gain) {} - - [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain) - { - } - - void matrixRowNormL2(raft::resources const& handle, - dense_input_matrix_view_t matrix, - math_t* target) - { - bool is_row_major = GramMatrixBase::get_is_row_major(matrix); - int minor = is_row_major ? matrix.extent(1) : matrix.extent(0); - int ld = is_row_major ? matrix.stride(0) : matrix.stride(1); - ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter"); - raft::linalg::rowNorm(target, - matrix.data_handle(), - matrix.extent(1), - matrix.extent(0), - raft::linalg::NormType::L2Norm, - is_row_major, - resource::get_cuda_stream(handle)); - } - - void matrixRowNormL2(raft::resources const& handle, - csr_input_matrix_view_t matrix, - math_t* target) - { - auto matrix_structure = matrix.structure_view(); - raft::sparse::linalg::rowNormCsr(handle, - matrix_structure.get_indptr().data(), - matrix.get_elements().data(), - matrix_structure.get_nnz(), - matrix_structure.get_n_rows(), - target, - raft::linalg::NormType::L2Norm); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.extent(0), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.extent(0), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.extent(0), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - int minor1 = is_row_major ? n_cols : n1; - int minor2 = is_row_major ? n_cols : n2; - int minor_out = is_row_major ? n2 : n1; - ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); - ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); - ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); - - math_t gain = this->gain; - using index_t = int64_t; - - rbf_fin_op fin_op{gain}; - - raft::resources handle; - resource::set_cuda_stream(handle, stream); - - cuvs::distance::distance(handle, - const_cast(x1), - const_cast(x2), - out, - n1, - n2, - n_cols, - NULL, - 0, - fin_op, - is_row_major); - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/include/cuvs/distance/detail/kernels/rbf_fin_op.cuh b/cpp/include/cuvs/distance/detail/kernels/rbf_fin_op.cuh deleted file mode 100644 index 73588baea..000000000 --- a/cpp/include/cuvs/distance/detail/kernels/rbf_fin_op.cuh +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -/* - * This file defines rbf_fin_op, which is used in GramMatrixBase. - * - * This struct has been moved to a separate file, so that it is cheap to include - * in distance/distance-ext.cuh, where an instance of cuvs::distance::distance - * with the rbf_fin_op is instantiated. - * - */ - -#include // raft::exp -#include // HD - -namespace cuvs::distance::kernels::detail { - -/** @brief: Final op for Gram matrix with RBF kernel. - * - * Calculates output = e^(-gain * in) - * - */ -template -struct rbf_fin_op { - OutT gain; - - explicit HD rbf_fin_op(OutT gain_) noexcept : gain(gain_) {} - - template - HDI OutT operator()(OutT d_val, Args... unused_args) - { - return raft::exp(-gain * d_val); - } -}; // struct rbf_fin_op - -} // namespace cuvs::distance::kernels::detail diff --git a/cpp/include/cuvs/distance/detail/masked_distance_base.cuh b/cpp/include/cuvs/distance/detail/masked_distance_base.cuh deleted file mode 100644 index 0c8db755b..000000000 --- a/cpp/include/cuvs/distance/detail/masked_distance_base.cuh +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include -#include -#include - -#include - -namespace cuvs { -namespace distance { -namespace detail { - -/** - * @brief Device class for masked nearest neighbor computations. - * - * @tparam useNorms whether norms are needed - * @tparam DataT input data-type (for x and y matrices) - * @tparam AccT accumulation data-type - * @tparam IdxT index data-type - * @tparam Policy struct which tunes the Contraction kernel - * @tparam CoreLambda tells how to accumulate an x and y into - acc. its signature: - template void core_lambda(AccT& acc, - const DataT& x, const DataT& y) - * @tparam EpilogueLambda applies an elementwise function to compute final - values. Its signature is: - template void epilogue_lambda - (AccT acc[][], DataT* regxn, DataT* regyn); - * @tparam FinalLambda the final lambda called on final distance value - * @tparam rowEpilogueLambda epilog lambda that executes when a full row has - * been processed. - * - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of x - * @param[in] n number of columns of y - * @param[in] k number of cols of x and y - * @param[in] lda leading dimension of x - * @param[in] ldb leading dimension of y - * @param[in] ldd parameter to keep Contractions_NT happy.. - * @param[in] xn row norms of input matrix A. Required for expanded L2, cosine - * @param[in] yn row norms of input matrix B. Required for expanded L2, cosine - * @param[in] adj An adjacency matrix encoded as a bitfield indicating for each - * row of `x` and each group in `y` whether to compute the - * distance. Dim = `(m / 64) x num_groups`. - * @param[in] group_idxs An array containing the *end* indices of each group - * in `y`. The value of group_idxs[j] indicates the - * start of group j + 1, i.e., it is the inclusive - * scan of the group lengths. The first group is - * always assumed to start at index 0 and the last - * group typically ends at index `n`. Length = - * `num_groups`. - * @param[in] num_groups The number of groups in group_idxs. - * @param[in] smem shared mem buffer for intermediate storage of x, y, xn & yn. - * @param core_op the core accumulation operation lambda - * @param epilog_op the epilog operation lambda - * @param fin_op the final gemm epilogue lambda - * @param rowEpilog_op epilog lambda that executes when a full row has been processed. - */ -template > -struct MaskedDistances : public BaseClass { - private: - typedef Policy P; - const DataT* xn; - const DataT* yn; - const DataT* const yBase; - const uint64_t* adj; - const IdxT* group_idxs; - IdxT num_groups; - char* smem; - CoreLambda core_op; - EpilogueLambda epilog_op; - FinalLambda fin_op; - rowEpilogueLambda rowEpilog_op; - - AccT acc[P::AccRowsPerTh][P::AccColsPerTh]; - - public: - // Constructor - DI MaskedDistances(const DataT* _x, - const DataT* _y, - IdxT _m, - IdxT _n, - IdxT _k, - IdxT _lda, - IdxT _ldb, - IdxT _ldd, - const DataT* _xn, - const DataT* _yn, - const uint64_t* _adj, - const IdxT* _group_idxs, - IdxT _num_groups, - char* _smem, - CoreLambda _core_op, - EpilogueLambda _epilog_op, - FinalLambda _fin_op, - rowEpilogueLambda _rowEpilog_op) - : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem), - xn(_xn), - yn(_yn), - yBase(_y), - adj(_adj), - group_idxs(_group_idxs), - num_groups(_num_groups), - smem(_smem), - core_op(_core_op), - epilog_op(_epilog_op), - fin_op(_fin_op), - rowEpilog_op(_rowEpilog_op) - { - } - - DI void run() - { - const auto grid_stride_m = (P::Mblk * gridDim.y); - const auto grid_offset_m = (P::Mblk * blockIdx.y); - - const auto grid_stride_g = gridDim.x; - const auto grid_offset_g = blockIdx.x; - - for (auto tile_idx_m = grid_offset_m; tile_idx_m < this->m; tile_idx_m += grid_stride_m) { - // Start loop over groups - for (auto idx_g = grid_offset_g; idx_g < this->num_groups; idx_g += grid_stride_g) { - const uint64_t block_adj = get_block_adjacency(adj, tile_idx_m, idx_g); - // block_adj is a bitfield that contains a 1 if a row is adjacent to the - // current group. All zero means we can skip this group. - if (block_adj == 0) { continue; } - - // thread_adj is a bitfield that contains a 1 at location i iff we must - // compute row i of acc (the accumulator register tile). That is, - // for i = 0,.., AccRowsPerTh and j = 0,.., AccColsPerTh: - // - // ((1 << i) & thread_adj) > 0 <=> acc[i][j] must be computed. - // - // We precompute this information because it is used in various - // locations to skip thread-local computations, specifically: - // - // 1. To skip computations if thread_adj == 0, i.e., none of the values - // of `acc` have to be computed. - // - // 2. In epilog_op, to consider only values of `acc` to be reduced that - // are not masked of. - // - // Note 1: Even when the computation can be skipped for a specific thread, - // the thread still participates in synchronization operations. - // - // Note 2: In theory, it should be possible to skip computations for - // specific rows of `acc`. In practice, however, this does not improve - // performance. - int thread_adj = compute_thread_adjacency(block_adj); - - auto tile_idx_n = idx_g == 0 ? 0 : group_idxs[idx_g - 1]; - const auto group_end_n = group_idxs[idx_g]; - for (; tile_idx_n < group_end_n; tile_idx_n += P::Nblk) { - // We provide group_end_n to limit the number of unnecessary data - // points that are loaded from y. - this->ldgXY(tile_idx_m, tile_idx_n, 0, group_end_n); - - reset_accumulator(); - this->stsXY(); - __syncthreads(); - this->switch_write_buffer(); - - for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { - this->ldgXY(tile_idx_m, tile_idx_n, kidx, group_end_n); - // Process all data in shared memory (previous k-block) and - // accumulate in registers. - if (thread_adj != 0) { accumulate(); } - this->stsXY(); - __syncthreads(); - this->switch_write_buffer(); - this->switch_read_buffer(); - } - if (thread_adj != 0) { - accumulate(); // last iteration - } - // The pre-condition for the loop over tile_idx_n is that write_buffer - // and read_buffer point to the same buffer. This flips read_buffer - // back so that it satisfies the pre-condition of this loop. - this->switch_read_buffer(); - - if (useNorms) { - DataT regxn[P::AccRowsPerTh], regyn[P::AccColsPerTh]; - load_norms(tile_idx_m, tile_idx_n, group_end_n, regxn, regyn); - if (thread_adj != 0) { - epilog_op(acc, thread_adj, regxn, regyn, tile_idx_n, tile_idx_m, group_end_n); - } - } else { - if (thread_adj != 0) { - epilog_op(acc, thread_adj, nullptr, nullptr, tile_idx_n, tile_idx_m, group_end_n); - } - } - } // tile_idx_n - } // idx_g - rowEpilog_op(tile_idx_m); - } // tile_idx_m - } - - private: - DI uint64_t get_block_adjacency(const uint64_t* adj, IdxT tile_idx_m, IdxT idx_group) - { - // A single element of `adj` contains exactly enough bits to indicate which - // rows in the current tile to skip and which to compute. - static_assert(P::Mblk == 8 * sizeof(adj[0]), - "masked_l2_nn only supports a policy with 64 rows per block."); - IdxT block_flag_idx = tile_idx_m / P::Mblk; - // Index into adj at row tile_idx_m / 64 and column idx_group. - return adj[block_flag_idx * this->num_groups + idx_group]; - } - - DI uint32_t compute_thread_adjacency(const uint64_t block_adj) - { - // thread_adj is a bitfield that contains a 1 at location i iff we must - // compute row i of acc (the accumulator register tile). It is described in - // more detail in the run() method. - uint32_t thread_adj = 0; -#pragma unroll - for (int thread_row_idx = 0; thread_row_idx < P::AccRowsPerTh; ++thread_row_idx) { - // Index `thread_row_idx` refers to a row of the current threads' register - // tile `acc`, i.e., acc[i][:]. Index `block_row_idx` refers to the - // corresponding row of the current block tile in shared memory. - const int block_row_idx = this->accrowid + thread_row_idx * P::AccThRows; - - // block_row_is_adjacent is true if the current block_row_idx is adjacent - // to the current group. - const uint64_t block_mask = 1ull << block_row_idx; - const bool block_row_is_adjacent = (block_adj & block_mask) != 0; - if (block_row_is_adjacent) { - // If block row is adjacent, write a 1 bit to thread_adj at location - // `thread_row_idx`. - const uint32_t thread_mask = 1 << thread_row_idx; - thread_adj |= thread_mask; - } - } - return thread_adj; - } - - DI void reset_accumulator() - { - // Reset accumulator registers to zero. -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = BaseClass::Zero; - } - } - } - - DI void accumulate() - { -#pragma unroll - for (int ki = 0; ki < P::Kblk; ki += P::Veclen) { - this->ldsXY(ki); -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { -#pragma unroll - for (int v = 0; v < P::Veclen; ++v) { - core_op(acc[i][j], this->regx[i][v], this->regy[j][v]); - } - } - } - } - } - - DI void load_norms(IdxT tile_idx_m, - IdxT tile_idx_n, - IdxT end_n, - DataT (®xn)[P::AccRowsPerTh], - DataT (®yn)[P::AccColsPerTh]) - { - DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); - DataT* syNorm = (&sxNorm[P::Mblk]); - - // Load x & y norms required by this threadblock in shmem buffer - for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = tile_idx_m + i; - sxNorm[i] = idx < this->m ? xn[idx] : 0; - } - - for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = tile_idx_n + i; - syNorm[i] = idx < end_n ? yn[idx] : 0; - } - __syncthreads(); - -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - regxn[i] = sxNorm[i * P::AccThRows + (threadIdx.x / P::AccThCols)]; - } -#pragma unroll - for (int i = 0; i < P::AccColsPerTh; ++i) { - regyn[i] = syNorm[i * P::AccThCols + (threadIdx.x % P::AccThCols)]; - } - } -}; // struct MaskedDistances - -}; // namespace detail -}; // namespace distance -}; // namespace cuvs diff --git a/cpp/include/cuvs/distance/detail/masked_nn.cuh b/cpp/include/cuvs/distance/detail/masked_nn.cuh deleted file mode 100644 index 8b30d8eec..000000000 --- a/cpp/include/cuvs/distance/detail/masked_nn.cuh +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace cuvs { -namespace distance { -namespace detail { - -template -__launch_bounds__(P::Nthreads, 2) RAFT_KERNEL masked_l2_nn_kernel(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - const uint64_t* adj, - const IdxT* group_idxs, - IdxT num_groups, - IdxT m, - IdxT n, - IdxT k, - bool sqrt, - DataT maxVal, - int* mutex, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - CoreLambda core_op, - FinalLambda fin_op) -{ - extern __shared__ char smem[]; - - typedef raft::KeyValuePair KVPair; - KVPair val[P::AccRowsPerTh]; -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - val[i] = {-1, maxVal}; - } - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [pairRedOp, &val, maxVal, sqrt] __device__( - DataT acc[P::AccRowsPerTh][P::AccColsPerTh], - int thread_adj, - DataT* regxn, - DataT* regyn, - IdxT tile_idx_n, - IdxT tile_idx_m, - IdxT tile_end_n) { - KVPReduceOpT pairRed_op(pairRedOp); - -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j]; - } - } - if (sqrt) { -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = raft::sqrt(acc[i][j]); - } - } - } - - // intra thread reduce - const auto acccolid = threadIdx.x % P::AccThCols; - const auto accrowid = threadIdx.x / P::AccThCols; - -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - // thread_adj is a bitfield that contains a 1 at location i iff we must - // compute row i of acc (the accumulator register tile). It is described in - // more detail in the maskedDistances.run() method. - const bool ignore = (thread_adj & (1 << i)) == 0; - if (ignore) { continue; } -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - auto tmpkey = acccolid + j * P::AccThCols + tile_idx_n; - if (tile_end_n <= tmpkey) { - // Do not process beyond end of tile. - continue; - } - KVPair tmp = {tmpkey, acc[i][j]}; - if (tmpkey < tile_end_n) { - val[i] = pairRed_op(accrowid + i * P::AccThRows + tile_idx_m, tmp, val[i]); - } - } - } - }; - - auto rowEpilog_lambda = - [m, mutex, min, pairRedOp, redOp, &val, maxVal] __device__(IdxT tile_idx_m) { - KVPReduceOpT pairRed_op(pairRedOp); - ReduceOpT red_op(redOp); - - const auto accrowid = threadIdx.x / P::AccThCols; - const auto lid = raft::laneId(); - // reduce -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = P::AccThCols / 2; j > 0; j >>= 1) { - auto tmpkey = raft::shfl(val[i].key, lid + j); - auto tmpvalue = raft::shfl(val[i].value, lid + j); - KVPair tmp = {tmpkey, tmpvalue}; - val[i] = pairRed_op(accrowid + i * P::AccThRows + tile_idx_m, tmp, val[i]); - } - } - - updateReducedVal(mutex, min, val, red_op, m, tile_idx_m); - - // reset the val array. -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - val[i] = {-1, maxVal}; - } - }; - - IdxT lda = k, ldb = k, ldd = n; - MaskedDistances - obj(x, - y, - m, - n, - k, - lda, - ldb, - ldd, - xn, - yn, - adj, - group_idxs, - num_groups, - smem, - core_op, - epilog_lambda, - fin_op, - rowEpilog_lambda); - obj.run(); -} - -/** - * @brief Wrapper for masked_l2_nn_kernel - * - * Responsibilities: - * - Allocate (and initialize) workspace memory for: - * - mutexes used in nearest neighbor update step - * - adjacency matrix bitfield - * - Compress adjacency matrix to bitfield - * - Initialize output buffer (conditional on `initOutBuffer`) - * - Specify core and final operations for the L2 norm - * - Determine optimal launch configuration for kernel. - * - Launch kernel and check for errors. - * - * @tparam DataT Input data-type (for x and y matrices). - * @tparam OutT Output data-type (for key-value pairs). - * @tparam IdxT Index data-type. - * @tparam ReduceOpT A struct to perform the final needed reduction - * operation and also to initialize the output array - * elements with the appropriate initial value needed for - * reduction. - * @tparam KVPReduceOpT Type of Reduction operation on key value pairs. - * - * @param handle RAFT handle for managing expensive resources - * @param[out] out Will contain reduced output (nn key-value pairs) - * @param[in] x First matrix. Row major. Dim = `m x k`. (on device) - * @param[in] y Second matrix. Row major. Dim = `n x k`. (on device) - * @param[in] xn L2 squared norm of `x`. Length = `m`. - * @param[in] yn L2 squared norm of `y`. Length = `n`. - * @param[in] adj A boolean adjacency matrix indicating for each - * row of `x` and each group in `y` whether to compute the - * distance. Dim = `m x num_groups`. - * @param[in] group_idxs An array containing the *end* indices of each group - * in `y`. The value of group_idxs[j] indicates the - * start of group j + 1, i.e., it is the inclusive - * scan of the group lengths. The first group is - * always assumed to start at index 0 and the last - * group typically ends at index `n`. Length = - * `num_groups`. - * @param[in] num_groups Length of `group_idxs`. - * @param m Rows of `x`. - * @param n Rows of `y`. - * @param k Cols of `x` and `y`. - * @param redOp Reduction operator in the epilogue - * @param pairRedOp Reduction operation on key value pairs - * @param sqrt Whether to compute the squared or actual (i.e. sqrt) L2 norm. - * @param initOutBuffer Whether to initialize the output buffer - * - * - */ -template -void masked_l2_nn_impl(raft::resources const& handle, - OutT* out, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - const bool* adj, - const IdxT* group_idxs, - IdxT num_groups, - IdxT m, - IdxT n, - IdxT k, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - bool sqrt, - bool initOutBuffer) -{ - typedef typename linalg::Policy4x4::Policy P; - - static_assert(P::Mblk == 64, "masked_l2_nn_impl only supports a policy with 64 rows per block."); - - // Get stream and workspace memory resource - rmm::mr::device_memory_resource* ws_mr = - dynamic_cast(raft::resource::get_workspace_resource(handle)); - auto stream = resource::get_cuda_stream(handle); - - // Acquire temporary buffers and initialize to zero: - // 1) Adjacency matrix bitfield - // 2) Workspace for fused nearest neighbor operation - size_t m_div_64 = raft::ceildiv(m, IdxT(64)); - rmm::device_uvector ws_adj64{m_div_64 * num_groups, stream, ws_mr}; - rmm::device_uvector ws_fused_nn{size_t(m), stream, ws_mr}; - RAFT_CUDA_TRY(cudaMemsetAsync(ws_adj64.data(), 0, ws_adj64.size() * sizeof(uint64_t), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(ws_fused_nn.data(), 0, ws_fused_nn.size() * sizeof(int), stream)); - - // Compress boolean adjacency matrix to bitfield. - auto adj_view = raft::make_device_matrix_view(adj, m, num_groups); - auto adj64_view = - raft::make_device_matrix_view(ws_adj64.data(), m_div_64, num_groups); - compress_to_bits(handle, adj_view, adj64_view); - - // Initialize output buffer with keyvalue pairs as determined by the reduction - // operator (it will be called with maxVal). - constexpr auto maxVal = std::numeric_limits::max(); - if (initOutBuffer) { - dim3 grid(raft::ceildiv(m, P::Nthreads)); - dim3 block(P::Nthreads); - - initKernel<<>>(out, m, maxVal, redOp); - RAFT_CUDA_TRY(cudaGetLastError()); - } - - // Accumulation operation lambda - auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; }; - auto fin_op = raft::identity_op{}; - - auto kernel = masked_l2_nn_kernel; - constexpr size_t smemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT)); - dim3 block(P::Nthreads); - dim3 grid = launchConfigGenerator

(m, n, smemSize, kernel); - - kernel<<>>(out, - x, - y, - xn, - yn, - ws_adj64.data(), - group_idxs, - num_groups, - m, - n, - k, - sqrt, - maxVal, - ws_fused_nn.data(), - redOp, - pairRedOp, - core_lambda, - fin_op); - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -} // namespace detail -} // namespace distance -} // namespace cuvs diff --git a/cpp/include/cuvs/distance/detail/pairwise_distance_base.cuh b/cpp/include/cuvs/distance/detail/pairwise_distance_base.cuh deleted file mode 100644 index 57366dec9..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_distance_base.cuh +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once -#include // raft::linalg::Contractions_NT -#include // ceildiv -#include // RAFT_CUDA_TRY - -#include // size_t - -namespace cuvs { -namespace distance { -namespace detail { - -/** - * @brief Device class for L1, L2 and cosine distance metrics. - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Policy struct which tunes the Contraction kernel - * @tparam OpT A distance operation, e.g., cosine_distance_op. - * @tparam EpilogueLambda applies an elementwise function to compute final - values. Its signature is: - template void epilogue_lambda - (AccT acc[][], DataT* regxn, DataT* regyn); - * @tparam FinalLambda the final lambda called on final distance value - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[in] xn row norms of input matrix A. Required for expanded L2, cosine - * @param[in] yn row norms of input matrix B. Required for expanded L2, cosine - * @param[output] pD output matrix - * @param[in] smem shared mem buffer for intermediate storage of A, B, xn & yn. - * @param distance_op the distance operation, e.g. cosine_distance_op - * @param epilog_op the epilog operation lambda - * @param fin_op the final gemm epilogue lambda - * @param rowEpilog_op epilog lambda that executes when a full row has been processed - */ - -template > -struct PairwiseDistances : public BaseClass { - // Get accumulation type from distance_op - using AccT = typename OpT::AccT; - - private: - typedef Policy P; - const DataT* xn; - const DataT* yn; - const DataT* const yBase; - OutT* dOutput; - char* smem; - OpT distance_op; - EpilogueLambda epilog_op; - FinalLambda fin_op; - rowEpilogueLambda rowEpilog_op; - - const IdxT grid_stride_m; - const IdxT grid_stride_n; - const IdxT grid_offset_m; - const IdxT grid_offset_n; - - AccT acc[P::AccRowsPerTh][P::AccColsPerTh]; - - public: - // Constructor - DI PairwiseDistances(const DataT* _x, - const DataT* _y, - IdxT _m, - IdxT _n, - IdxT _k, - IdxT _lda, - IdxT _ldb, - IdxT _ldd, - const DataT* _xn, - const DataT* _yn, - OutT* _dOutput, - char* _smem, - OpT _distance_op, - EpilogueLambda _epilog_op, - FinalLambda _fin_op, - rowEpilogueLambda _rowEpilog_op) - : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem), - xn(_xn), - yn(_yn), - yBase(_y), - dOutput(_dOutput), - smem(_smem), - distance_op(_distance_op), - epilog_op(_epilog_op), - fin_op(_fin_op), - rowEpilog_op(_rowEpilog_op), - grid_stride_m(P::Mblk * gridDim.y), - grid_stride_n(P::Nblk * gridDim.x), - grid_offset_m(P::Mblk * blockIdx.y), - grid_offset_n(P::Nblk * blockIdx.x) - { - } - - DI void run() - { - for (auto tile_idx_m = grid_offset_m; tile_idx_m < this->m; tile_idx_m += grid_stride_m) { - this->ldgXY(tile_idx_m, grid_offset_n, 0); - for (auto tile_idx_n = grid_offset_n; tile_idx_n < this->n; tile_idx_n += grid_stride_n) { - // Prolog: - reset_accumulator(); - this->stsXY(); - __syncthreads(); - this->switch_write_buffer(); - - // Main loop: - for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { - this->ldgXY(tile_idx_m, tile_idx_n, kidx); - // Process all data in shared memory (previous k-block) and - // accumulate in registers. - accumulate(); - this->stsXY(); - __syncthreads(); - this->switch_write_buffer(); - this->switch_read_buffer(); - } - accumulate(); // last iteration - // The pre-condition for the loop over tile_idx_n is that write_buffer - // and read_buffer point to the same buffer. This flips read_buffer back - // so that it satisfies the pre-condition of this loop. - this->switch_read_buffer(); - - // Epilog: - if (distance_op.use_norms) { - DataT regxn[P::AccRowsPerTh], regyn[P::AccColsPerTh]; - load_norms(tile_idx_m, tile_idx_n, regxn, regyn); - // Overlap ldg with epilog computation - ldgNextGridStride(tile_idx_m, tile_idx_n); - // Calculate distance_op epilog. - // Use .template to disambiguate (See: - // https://en.cppreference.com/w/cpp/language/dependent_name) - distance_op.template epilog(acc, regxn, regyn, tile_idx_n, tile_idx_m); - // And any possible additional epilogs - epilog_op(acc, regxn, regyn, tile_idx_n, tile_idx_m); - } else { - // Overlap ldg with epilog computation - ldgNextGridStride(tile_idx_m, tile_idx_n); - // Calculate distance_op epilog. - // Use .template to disambiguate (See: - // https://en.cppreference.com/w/cpp/language/dependent_name) - distance_op.template epilog(acc, nullptr, nullptr, tile_idx_n, tile_idx_m); - // And any possible additional epilogs - epilog_op(acc, nullptr, nullptr, tile_idx_n, tile_idx_m); - } - if (writeOut) { store_output(tile_idx_m, tile_idx_n); } - } - rowEpilog_op(tile_idx_m); - } - } - - private: - DI void ldgNextGridStride(IdxT tile_idx_m, IdxT tile_idx_n) - { - // Fetch next grid stride ldg if within range - const auto next_tile_tile_idx_n = tile_idx_n + grid_stride_n; - const auto next_tile_tile_idx_m = tile_idx_m + grid_stride_m; - if ((next_tile_tile_idx_n) < this->n) { - this->ldgXY(tile_idx_m, next_tile_tile_idx_n, 0); - } else if ((next_tile_tile_idx_m) < this->m) { - this->ldgXY(next_tile_tile_idx_m, grid_offset_n, 0); - } - } - - DI void reset_accumulator() - { - // Reset accumulator registers to zero. -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = BaseClass::Zero; - } - } - } - - DI void accumulate_reg_tile(DataT (®_x)[P::AccRowsPerTh][P::Veclen], - DataT (®_y)[P::AccColsPerTh][P::Veclen]) - { -#pragma unroll - for (int v = 0; v < P::Veclen; ++v) { -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - distance_op.core(acc[i][j], reg_x[i][v], reg_y[j][v]); - } - } - } - } - - DI void accumulate() - { - // We have a separate raft::ldsXY and accumulate_reg_tile outside the loop body, - // so that these separated calls can be interspersed with preceding and - // following instructions, thereby hiding latency. - this->ldsXY(0); - - // If expensive inner loop, do not unroll loop. - constexpr int num_iterations = P::Kblk / P::Veclen - 1; - constexpr int unroll_count = decltype(distance_op)::expensive_inner_loop ? 1 : num_iterations; -#pragma unroll unroll_count - for (int ki = P::Veclen; ki < P::Kblk; ki += P::Veclen) { - accumulate_reg_tile(this->regx, this->regy); - this->ldsXY(ki); - } - - // Accumulate last loaded tile. - accumulate_reg_tile(this->regx, this->regy); - } - - DI void load_norms(IdxT tile_idx_m, - IdxT tile_idx_n, - DataT (®xn)[P::AccRowsPerTh], - DataT (®yn)[P::AccColsPerTh]) - { - DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); - DataT* syNorm = (&sxNorm[P::Mblk]); - - // Load x & y norms required by this threadblock in shmem buffer - if (tile_idx_n == blockIdx.x * P::Nblk) { - for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = tile_idx_m + i; - sxNorm[i] = idx < this->m ? xn[idx] : 0; - } - } - - for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = tile_idx_n + i; - syNorm[i] = idx < this->n ? yn[idx] : 0; - } - __syncthreads(); - -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - regxn[i] = sxNorm[i * P::AccThRows + (threadIdx.x / P::AccThCols)]; - } -#pragma unroll - for (int i = 0; i < P::AccColsPerTh; ++i) { - regyn[i] = syNorm[i * P::AccThCols + (threadIdx.x % P::AccThCols)]; - } - } - - DI void store_output(IdxT tile_idx_m, IdxT tile_idx_n) - { - IdxT starty = tile_idx_m + this->accrowid; - IdxT startx = tile_idx_n + this->acccolid; - -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto rowId = starty + i * P::AccThRows; -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - auto colId = startx + j * P::AccThCols; - if (rowId < this->m && colId < this->n) { - // Promote to 64 bit index for final write, as output array can be > 2^31 - dOutput[std::size_t(rowId) * this->n + colId] = fin_op(acc[i][j], 0); - } - } - } - } -}; // struct PairwiseDistances - -template -dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) -{ - int devId; - RAFT_CUDA_TRY(cudaGetDevice(&devId)); - int numSMs; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, devId)); - - int numBlocksPerSm = 0; - dim3 grid; - - RAFT_CUDA_TRY( - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, func, P::Nthreads, sMemSize)); - std::size_t minGridSize = numSMs * numBlocksPerSm; - std::size_t yChunks = raft::ceildiv(m, P::Mblk); - std::size_t xChunks = raft::ceildiv(n, P::Nblk); - grid.y = yChunks > minGridSize ? minGridSize : yChunks; - grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks; - if (grid.x != 1) { - std::size_t i = 1; - while (grid.y * i < minGridSize) { - i++; - } - grid.x = i >= xChunks ? xChunks : i; - } - - return grid; -} - -}; // namespace detail -}; // namespace distance -}; // namespace cuvs diff --git a/cpp/include/cuvs/distance/detail/pairwise_distance_cutlass_base.cuh b/cpp/include/cuvs/distance/detail/pairwise_distance_cutlass_base.cuh deleted file mode 100644 index b9dd49977..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_distance_cutlass_base.cuh +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Wtautological-compare" - -// We define CUTLASS_NAMESPACE in case -// RAFT cmake is not used -#ifndef CUTLASS_NAMESPACE -#define cutlass raft_cutlass -#endif - -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - -#include "./pairwise_distance_epilogue_elementwise.h" -#include "./pairwise_distance_gemm.h" - -namespace cuvs { -namespace distance { -namespace detail { - -template -std::enable_if_t::value> cutlassDistanceKernel(const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - OpT distance_op, - cudaStream_t stream) -{ - static_assert(!(std::is_same::value), - "OutType bool is not supported use uint8_t instead"); - - auto dist_op = distance_op.get_cutlass_op(); - using DistanceFn = decltype(dist_op); - using EpilogueOutputOp = - cutlass::epilogue::thread::PairwiseDistanceEpilogueElementwise; - constexpr int batch_count = 1; - - constexpr auto mode = cutlass::gemm::GemmUniversalMode::kGemm; - - typename EpilogueOutputOp::Params epilog_op_param(dist_op, fin_op); - - const DataT *a, *b; - - IdxT gemm_lda, gemm_ldb; - - // Number of pipelines you want to use - constexpr int NumStages = 3; - // Alignment - constexpr int Alignment = VecLen; - - // default initialize problem size with row major inputs - auto problem_size = cutlass::gemm::GemmCoord(n, m, k); - - using cutlassDistKernel = - typename cutlass::gemm::kernel::PairwiseDistanceGemm::GemmKernel; - - using cutlassDist = cutlass::gemm::device::GemmUniversalAdapter; - - if constexpr (isRowMajor) { - a = y; - b = x; - gemm_lda = ldb; - gemm_ldb = lda; - } else { - problem_size = cutlass::gemm::GemmCoord(m, n, k); - a = x; - b = y; - gemm_lda = lda; - gemm_ldb = ldb; - } - - typename cutlassDist::Arguments arguments{ - mode, problem_size, batch_count, epilog_op_param, a, b, - xn, // C matrix eq vector param, which here is A norm - nullptr, // tensor_Z, - (DataT*)yn, // this is broadcast vec, which is required to be non-const param - dOutput, // Output distance matrix - (int64_t)0, // batch stride A - (int64_t)0, // batch stride B - (int64_t)0, // batch stride Norm A - (int64_t)0, - (int64_t)0, // batch stride Norm B - (int64_t)0, // batch stride Output - gemm_lda, // stride A - gemm_ldb, // stride B - 1, // stride A norm - 0, // this is no-op for Z - 0, // This must be zero - ldd // stride Output matrix - }; - - // Using the arguments, query for extra workspace required for matrix multiplication computation - size_t workspace_size = cutlassDist::get_workspace_size(arguments); - // Allocate workspace memory - rmm::device_uvector workspace(workspace_size, stream); - // Instantiate CUTLASS kernel depending on templates - cutlassDist cutlassDist_op; - // Check the problem size is supported or not - RAFT_CUTLASS_TRY(cutlassDist_op.can_implement(arguments)); - - // Initialize CUTLASS kernel with arguments and workspace pointer - RAFT_CUTLASS_TRY(cutlassDist_op.initialize(arguments, workspace.data(), stream)); - - // Launch initialized CUTLASS kernel - RAFT_CUTLASS_TRY(cutlassDist_op(stream)); -} - -}; // namespace detail -}; // namespace distance -}; // namespace cuvs - -#pragma GCC diagnostic pop diff --git a/cpp/include/cuvs/distance/detail/pairwise_distance_epilogue.h b/cpp/include/cuvs/distance/detail/pairwise_distance_epilogue.h deleted file mode 100644 index 06b83ace9..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_distance_epilogue.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file - \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. - -This is adapted from DefaultEpilogueWithBroadcastTensorOp from CUTLASS 2.9.0 -(https://github.com/NVIDIA/cutlass/blob/master/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h#L75) - -This epilogue allows us to load norm buffers using PredicatedTileIteratorNormVec -and EpilogueWithBroadcast used for distances L2/cosine as well as applies user-define elementwise -operation. --- A norm load is provided PredicatedTileIteratorNormVec --- B norm load is provided by EpilogueWithBroadcast --- elementwise operation is provided by OutputOp -*/ - -#pragma once - -#include -#include -#include - -#include - -#include "./predicated_tile_iterator_normvec.h" -#include -#include -#include -#include - -//////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace epilogue { -namespace threadblock { - -//////////////////////////////////////////////////////////////////////////////// - -/// Defines sensible defaults for epilogues for TensorOps. -template -struct PairwiseDistanceEpilogue { - /// Use defaults related to the existing epilogue - using Base = - DefaultEpilogueTensorOp; - - // - // Stores the result z = (y = GEMM(A, B, C), broadcast) - // - using OutputTileIterator = cutlass::epilogue::threadblock:: - PredicatedTileIteratorNormVec; - - // - // Additional tensor tile iterator - stores t = Elementwise(z) - // - using TensorTileIterator = - cutlass::epilogue::threadblock::PredicatedTileIterator; - - /// Define the epilogue - using Epilogue = EpilogueWithBroadcast; -}; - -} // namespace threadblock -} // namespace epilogue -} // namespace cutlass - -//////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/include/cuvs/distance/detail/pairwise_distance_epilogue_elementwise.h b/cpp/include/cuvs/distance/detail/pairwise_distance_epilogue_elementwise.h deleted file mode 100644 index 9004bd2c7..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_distance_epilogue_elementwise.h +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// -/*! \file - \brief Functor performing distance operations used by epilogues of pairwise distance - * kernels. -* This is adapted from LinearCombinationBiasElementwise from CUTLASS 2.9.0 -* customized for applying elementwise distance formula on accumulated GEMM value -* and applying user-defined final custom operation on the distance value. -*/ - -#pragma once - -#include -#include -#include -#include -#include - -#include - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace epilogue { -namespace thread { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// This base class is meant to define the concept required of the -/// EpilogueWithBroadcast::OutputOp -template -class PairwiseDistanceEpilogueElementwise { - public: - using ElementOutput = ElementC_; - using ElementC = ElementC_; - using ElementAccumulator = ElementAccumulator_; - using ElementCompute = ElementCompute_; - using ElementZ = ElementZ_; - using ElementT = ElementT_; - static int const kElementsPerAccess = ElementsPerAccess; - static int const kCount = kElementsPerAccess; - - using DistanceOp = DistanceOp_; - using FinalOp = FinalOp_; - - using FragmentAccumulator = Array; - using FragmentCompute = Array; - using FragmentC = Array; - using FragmentZ = Array; - using FragmentT = Array; - - using FragmentOutput = FragmentZ; - - static bool const kIsHeavy = false; // ElementwiseOp::kIsHeavy; - - /// If true, the 'Z' tensor is stored - static bool const kStoreZ = false; // We don't store anything in Z, - - /// If true, the 'T' tensor is stored - static bool const kStoreT = true; // this is our final output storage. - - /// Host-constructable parameters structure - struct Params { - FinalOp_ final_op_; - DistanceOp_ dist_op_; - - // - // Methods - // - CUTLASS_HOST_DEVICE - Params(DistanceOp_ dist_op, FinalOp final_op) : final_op_(final_op), dist_op_(dist_op) {} - - CUTLASS_HOST_DEVICE - Params() {} - }; - - private: - // - // Data members - // - FinalOp_ final_op; - DistanceOp_ elementwise_op; - - public: - // - // Methods - // - - /// Constructor from Params - CUTLASS_HOST_DEVICE - PairwiseDistanceEpilogueElementwise(Params const& params) - : final_op(params.final_op_), elementwise_op(params.dist_op_) - { - } - - /// Returns true if source is needed - CUTLASS_HOST_DEVICE - bool is_source_needed() const - { - // we use for making sure C matrix path is used for A mat norm. - return true; - } - - /// Functionally required for serial reduction in the epilogue - CUTLASS_HOST_DEVICE - void set_k_partition(int k_partition, int k_partition_count) {} - - /// Applies the operation when is_source_needed() is true - CUTLASS_HOST_DEVICE - void operator()(FragmentZ& frag_Z, - FragmentT& frag_T, - FragmentAccumulator const& AB, - FragmentC const& frag_C, - FragmentCompute const& V) const - { - FragmentCompute tmp_Accum = - NumericArrayConverter()(AB); - FragmentCompute tmp_C = - NumericArrayConverter()(frag_C); - FragmentCompute result_Z; - FragmentCompute result_T; - - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kElementsPerAccess; ++i) { - result_Z[i] = elementwise_op(tmp_C[i], V[i], tmp_Accum[i]); - result_T[i] = final_op(result_Z[i], 0); - } - - NumericArrayConverter convert_t; - frag_T = convert_t(result_T); - } - - /// Applies the operation when is_source_needed() is false - CUTLASS_HOST_DEVICE - void operator()(FragmentZ& frag_Z, - FragmentT& frag_T, - FragmentAccumulator const& AB, - FragmentCompute const& V) const - { - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace thread -} // namespace epilogue -} // namespace cutlass - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/include/cuvs/distance/detail/pairwise_distance_gemm.h b/cpp/include/cuvs/distance/detail/pairwise_distance_gemm.h deleted file mode 100644 index 2c88d8b70..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_distance_gemm.h +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include -#include - -#include "./pairwise_distance_epilogue.h" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace gemm { -namespace kernel { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -template < - /// Element type for A matrix operand - typename ElementA_, - /// Layout type for A matrix operand - int kAlignmentA, - /// Element type for B matrix operand - typename ElementB_, - /// Layout type for B matrix operand - int kAlignmentB, - /// Element type for C and D matrix operands - typename ElementC_, - /// Element type for internal accumulation - typename ElementAccumulator, - /// Element type for final output - // typename ElementOutT, - /// Epilogue output operator - must satisfy concept of 'EpilogueWithBroadcastOp' - typename EpilogueOutputOp, - /// Number of stages used in the pipelined mainloop - int Stages, - /// data layout row/column major of inputs - bool isRowMajor> -struct PairwiseDistanceGemm { - // This struct is specialized for fp32/3xTF32 - - /// Threadblock-level tile size (concept: GemmShape) - using ThreadblockShape = - cutlass::gemm::GemmShape<128, 128, 16>; // <- threadblock tile M = 128, N = 128, K = 16 - /// Warp-level tile size (concept: GemmShape) - // This code section describes tile size a warp will compute - using WarpShape = cutlass::gemm::GemmShape<64, 64, 16>; // <- warp tile M = 64, N = 64, K = 16 - /// Warp-level tile size (concept: GemmShape) - // This code section describes the size of MMA op - using InstructionShape = - cutlass::gemm::GemmShape<16, 8, 4>; // <- MMA Op tile M = 16, N = 8, K = 4 - - /// Operation performed by GEMM - using Operator = cutlass::arch::OpMultiplyAddFastF32; - - // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU - // SM - using OperatorClass = cutlass::arch::OpClassTensorOp; - - // This code section describes CUDA SM architecture number - using ArchTag = cutlass::arch::Sm80; - - // This code section describes how threadblocks are scheduled on GPU - /// Threadblock-level swizzling operator - using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; - - /// data layout for final output matrix. - // we keep this same layout even for column major inputs - using LayoutOutput = cutlass::layout::RowMajor; - - typedef typename std::conditional::type NormXLayout; - - typedef typename std:: - conditional::type LayoutA_; - - typedef typename std:: - conditional::type LayoutB_; - - using GemmBase = typename DefaultGemmUniversal::GemmKernel; - - // Replace epilogue - using Epilogue = typename cutlass::epilogue::threadblock::PairwiseDistanceEpilogue< - typename GemmBase::Epilogue::Shape, - typename GemmBase::Epilogue::WarpMmaOperator, - GemmBase::Epilogue::kPartitionsK, - ElementAccumulator, - typename EpilogueOutputOp::ElementT, - ElementAccumulator, - EpilogueOutputOp, - NormXLayout, - GemmBase::Epilogue::kElementsPerAccess>::Epilogue; - - // Compose the GEMM kernel - using GemmKernel = GemmWithFusedEpilogue; -}; - -template < - /// Layout type for A matrix operand - int kAlignmentA, - /// Layout type for B matrix operand - int kAlignmentB, - /// Element type for C and D matrix operands - typename ElementC_, - /// Element type for internal accumulation - typename ElementAccumulator, - /// Epilogue output operator - must satisfy concept of 'EpilogueWithBroadcastOp' - typename EpilogueOutputOp, - /// Number of stages used in the pipelined mainloop - int Stages, - /// data layout row/column major of inputs - bool isRowMajor> -struct PairwiseDistanceGemm { - // using Transform = cutlass::ComplexTransform::kNone; - // Threadblock-level tile size (concept: GemmShape) - using ThreadblockShape = - cutlass::gemm::GemmShape<64, 64, 16>; // <- threadblock tile M = 64, N = 64, K = 16 - /// Warp-level tile size (concept: GemmShape) - // This code section describes tile size a warp will compute - using WarpShape = cutlass::gemm::GemmShape<32, 32, 16>; // <- warp tile M = 32, N = 32, K = 16 - /// Warp-level tile size (concept: GemmShape) - // This code section describes the size of MMA op - using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; - - // Operation performed by GEMM - using Operator = cutlass::arch::OpMultiplyAdd; - // This code section describes whether you want to use tensor cores or regular SIMT cores on GPU - // SM - using OperatorClass = cutlass::arch::OpClassTensorOp; - - // This code section describes CUDA SM architecture number - using ArchTag = cutlass::arch::Sm80; - - // This code section describes how threadblocks are scheduled on GPU - /// Threadblock-level swizzling operator - using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; - - /// data layout for final output matrix. - // we keep this same layout even for column major inputs - using LayoutOutput = cutlass::layout::RowMajor; - - typedef typename std::conditional::type NormXLayout; - - typedef typename std:: - conditional::type LayoutA_; - - typedef typename std:: - conditional::type LayoutB_; - - using GemmBase = typename DefaultGemmUniversal::GemmKernel; - - // Replace epilogue - using Epilogue = typename cutlass::epilogue::threadblock::PairwiseDistanceEpilogue< - typename GemmBase::Epilogue::Shape, - typename GemmBase::Epilogue::WarpMmaOperator, - GemmBase::Epilogue::kPartitionsK, - ElementC_, - typename EpilogueOutputOp::ElementT, - ElementC_, - EpilogueOutputOp, - NormXLayout, - GemmBase::Epilogue::kElementsPerAccess>::Epilogue; - - // Compose the GEMM kernel - using GemmKernel = GemmWithFusedEpilogue; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace kernel -} // namespace gemm -} // namespace cutlass \ No newline at end of file diff --git a/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch-ext.cuh deleted file mode 100644 index efaebb379..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch-ext.cuh +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include // ops::* -#include // ops::has_cutlass_op -#include // rbf_fin_op -#include // pairwise_matrix_params -#include // raft::identity_op -#include // RAFT_EXPLICIT - -#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY - -namespace cuvs::distance::detail { - -template -void pairwise_matrix_dispatch(OpT distance_op, - IdxT m, - IdxT n, - IdxT k, - const DataT* x, - const DataT* y, - const DataT* x_norm, - const DataT* y_norm, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) RAFT_EXPLICIT; - -}; // namespace cuvs::distance::detail - -#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY - -#define instantiate_raft_distance_detail_pairwise_matrix_dispatch( \ - OpT, DataT, AccT, OutT, FinOpT, IdxT) \ - extern template void cuvs::distance::detail:: \ - pairwise_matrix_dispatch, DataT, AccT, OutT, FinOpT, IdxT>( \ - OpT distance_op, \ - IdxT m, \ - IdxT n, \ - IdxT k, \ - const DataT* x, \ - const DataT* y, \ - const DataT* x_norm, \ - const DataT* y_norm, \ - OutT* out, \ - FinOpT fin_op, \ - cudaStream_t stream, \ - bool is_row_major) - -/* - * Hierarchy of instantiations: - * - * This file defines extern template instantiations of the distance kernels. The - * instantiation of the public API is handled in cuvs/distance/distance-ext.cuh. - * - * After adding an instance here, make sure to also add the instance there. - */ - -// The following two instances are used in the RBF kernel object. Note the use of int64_t for the -// index type. -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::l2_unexp_distance_op, - float, - float, - float, - cuvs::distance::kernels::detail::rbf_fin_op, - int64_t); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::l2_unexp_distance_op, - double, - double, - double, - cuvs::distance::kernels::detail::rbf_fin_op, - int64_t); - -// Rest of instances -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::canberra_distance_op, float, float, float, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::canberra_distance_op, - double, - double, - double, - raft::identity_op, - int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::correlation_distance_op, - float, - float, - float, - raft::identity_op, - int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::correlation_distance_op, - double, - double, - double, - raft::identity_op, - int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::cosine_distance_op, float, float, float, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::cosine_distance_op, double, double, double, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::hamming_distance_op, float, float, float, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::hamming_distance_op, double, double, double, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::hellinger_distance_op, float, float, float, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::hellinger_distance_op, - double, - double, - double, - raft::identity_op, - int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::jensen_shannon_distance_op, - float, - float, - float, - raft::identity_op, - int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::jensen_shannon_distance_op, - double, - double, - double, - raft::identity_op, - int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::kl_divergence_op, float, float, float, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::kl_divergence_op, double, double, double, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::l1_distance_op, float, float, float, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::l1_distance_op, double, double, double, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::l2_exp_distance_op, float, float, float, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::l2_exp_distance_op, double, double, double, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::l2_unexp_distance_op, float, float, float, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::l2_unexp_distance_op, - double, - double, - double, - raft::identity_op, - int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::l_inf_distance_op, float, float, float, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::l_inf_distance_op, double, double, double, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::lp_unexp_distance_op, float, float, float, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::lp_unexp_distance_op, - double, - double, - double, - raft::identity_op, - int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::russel_rao_distance_op, float, float, float, raft::identity_op, int); -instantiate_raft_distance_detail_pairwise_matrix_dispatch( - cuvs::distance::detail::ops::russel_rao_distance_op, - double, - double, - double, - raft::identity_op, - int); - -#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch diff --git a/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch-inl.cuh b/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch-inl.cuh deleted file mode 100644 index ca011731e..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch-inl.cuh +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -/* This file has two responsibilities: - * - * 1. Dispatch to the correct implementation of a kernel based on the - * architecture of the device on which the kernel will be launched. For - * instance, the cosine distance has a CUTLASS-based implementation that can - * be used on SM80+ and the normal implementation that is used on older - * architectures. - * - * 2. Provide concise function templates that can be instantiated in - * src/distance/detail/pairwise_matrix/. Previously, - * cuvs::distance::detail::distance was instantiated. The function - * necessarily required a large set of include files, which slowed down the - * build. The cuvs::distance::detail::pairwise_matrix_arch_dispatch functions - * do not require as large an include files set, which speeds up the build. - */ - -#include // ops::has_cutlass_op -#include // dispatch_sm60 -#include // pairwise_matrix_params -#include // raft::util::arch::SM_* - -// NOTE: to minimize compile times, we do not include dispatch_sm80.cuh. -// Including dispatch_sm80.cuh can slow down compile times (due to CUTLASS). -// Therefore, it is the including file's responsibility to include the correct -// dispatch_smXX.cuh headers, as is done in cuvs/distance/detail/distance.cuh -// and src/distance/detail/pairwise_matrix/dispatch_*.cu. - -namespace cuvs::distance::detail { - -// This forward-declaration ensures that we do not need to include -// dispatch_sm80.cuh if we are not calling it in practice. This makes compiling -// all the non-CUTLASS based distance instantiations faster. For CUTLASS-based -// distances, dispatch_sm80.cuh has to be included by the file including this -// file. -template -void pairwise_matrix_sm80_dispatch(OpT, - pairwise_matrix_params, - SM_compat_t, - cudaStream_t); - -template -void pairwise_matrix_dispatch(OpT distance_op, - IdxT m, - IdxT n, - IdxT k, - const DataT* x, - const DataT* y, - const DataT* x_norm, - const DataT* y_norm, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - // Create kernel parameter struct. Flip x and y if column major. - IdxT ldx = is_row_major ? k : m; - IdxT ldy = is_row_major ? k : n; - IdxT ld_out = is_row_major ? n : m; - - pairwise_matrix_params params{ - m, n, k, ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, is_row_major}; - - if (!params.is_row_major) { params.flip_x_and_y(); } - - // Dispatch rule: - // - execute CUTLASS-based kernel on SM_80 and above - // - execute normal kernel below SM_80 - namespace arch = raft::util::arch; - - constexpr bool cutlass_op_unavailable = !ops::has_cutlass_op(); - - if constexpr (cutlass_op_unavailable) { - // Always execute legacy kernels when no cutlass op is available - auto any_range = arch::SM_range(arch::SM_min(), arch::SM_future()); - pairwise_matrix_sm60_dispatch(distance_op, params, any_range, stream); - } else { - auto cutlass_range = arch::SM_range(arch::SM_80(), arch::SM_future()); - auto legacy_range = arch::SM_range(arch::SM_min(), arch::SM_80()); - - // Get pointer to SM60 kernel to determine the best compute architecture - // out of all for which the kernel was compiled for that matches closely - // to the current device. Other methods to determine the architecture (that do not - // require a pointer) can be error prone. See: - // https://github.com/NVIDIA/cub/issues/545 - auto sm60_wrapper = pairwise_matrix_sm60_get_wrapper(distance_op, params, legacy_range); - void* kernel_ptr = reinterpret_cast(sm60_wrapper.kernel_ptr); - auto runtime_arch = arch::kernel_virtual_arch(kernel_ptr); - - if (cutlass_range.contains(runtime_arch)) { - // If device is SM_80 or later, use CUTLASS-based kernel. - pairwise_matrix_sm80_dispatch(distance_op, params, cutlass_range, stream); - } else { - // Reuse kernel wrapper that we obtained above. This avoids performing the - // dispatch twice. - sm60_wrapper.launch(distance_op, params, stream); - } - } -} - -}; // namespace cuvs::distance::detail diff --git a/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch.cuh deleted file mode 100644 index 4a52b7ebe..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch.cuh +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY -#include "dispatch-inl.cuh" -#endif - -#ifdef RAFT_COMPILED -#include "dispatch-ext.cuh" -#endif diff --git a/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch_layout.cuh b/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch_layout.cuh deleted file mode 100644 index 2e9004b56..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch_layout.cuh +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include // std::min -#include // size_t -#include // pairwise_matrix_params -#include // RAFT_EXPECTS -#include // std::integral_constant -namespace cuvs::distance::detail { - -/** - * @brief: Computes minimal common alignment of the rows in a 2D array in bytes - * - * The 2D matrix `x` is assumed to be row-major. This function computes the - * minimal alignment in bytes of the first elements of each row. - * Output can be 16, 8, 4, 2, 1. - * - * @param x Base pointer of row-major input matrix - * @param stride Stride in number of element between consecutive rows. - */ -template -size_t alignment_of_2d_array(const DataT* x, size_t stride) -{ - auto base = reinterpret_cast(x); - size_t stride_bytes = sizeof(DataT) * stride; - - for (int align = 16; align >= 0; align /= 2) { - bool base_aligned = base % align == 0; - bool stride_aligned = stride_bytes % align == 0; - if (base_aligned && stride_aligned) { return align; } - } - return 1; -} - -/** - * @brief: Computes the vec_len parameter kernel policy parameter - * - * @param params Kernel parameters - */ -template -int determine_vec_len(pairwise_matrix_params params) -{ - size_t align_x = alignment_of_2d_array(params.x, params.ldx); - size_t align_y = alignment_of_2d_array(params.y, params.ldy); - size_t byte_alignment = min(align_x, align_y); - - // Since alignment is in bytes, it could be smaller than sizeof(DataT). - // Handle this (unlikely) case here. - RAFT_EXPECTS(sizeof(DataT) <= byte_alignment, - "Input matrix must be aligned to size of elements."); - - // Compute number of elements that can be loaded in one instruction - // without causing misalignent errors. - int vec_len_aligned = (byte_alignment % sizeof(DataT) == 0) ? byte_alignment / sizeof(DataT) : 1; - - // In the future, pairwise_matrix might support `int8_t` input. In that case, - // byte_alignment / sizeof(DataT) might exceed 4. We maximize at 4 here, to - // prevent adding more cases in dispatch_layout below (which are expensive to - // compile). - vec_len_aligned = std::min(vec_len_aligned, 4); - - return vec_len_aligned; -} - -template -using vec_len_constant = std::integral_constant; - -/** - * @brief: Converts run-time arguments to compile-time arguments - * - * Converts run-time arguments row_major and vec_len to compile-time arguments - * and dispatches a lambda f with these compile-time arguments. - * - * This is equivalent to copying and pasting the lambda function `f` in each of - * the switch case statements. - * - * @tparam F Type of lambda f. - * @param row_major Boolean indicating whether input arrays have row-major layout. - * @param vec_len Integer value 1, 2, or 4 specifying the Veclen template parameter of - * the KernelPolicy. - * @param f Lambda that takes two std::integral_constant parameters representing - * row_major and vec_len. - */ -template -auto dispatch_layout(bool row_major, int vec_len, F&& f) -{ - if (row_major) { - switch (vec_len) { - case 4: return f(std::true_type(), vec_len_constant<4>()); - case 2: return f(std::true_type(), vec_len_constant<2>()); - default: return f(std::true_type(), vec_len_constant<1>()); - } - } else { - switch (vec_len) { - case 4: return f(std::false_type(), vec_len_constant<4>()); - case 2: return f(std::false_type(), vec_len_constant<2>()); - default: return f(std::false_type(), vec_len_constant<1>()); - } - } -} - -}; // namespace cuvs::distance::detail diff --git a/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch_sm60.cuh b/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch_sm60.cuh deleted file mode 100644 index 9f9ed1cad..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch_sm60.cuh +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include // std::min -#include // dispatch_layout -#include // pairwise_matrix_sm60_wrapper -#include // raft::linalg::Policy4x4 - -namespace cuvs::distance::detail { - -template -pairwise_matrix_sm60_wrapper pairwise_matrix_sm60_get_wrapper( - OpT distance_op, - pairwise_matrix_params params, - SM_compat_t sm_compat_range) -{ - int vec_len = determine_vec_len(params); - - // f takes compile-time constants row_major and vec_len aligned and returns - // the corresponding kernel wrapper. The wrapper contains the launch - // parameters of the kernel: a pointer to the kernel function, grid size, - // block size, and shared memory size. - auto f = [&](auto row_major, auto vec_len_aligned) { - // row_major and vec_len are std::integral_constants of type bool and int - // respectively. - - // To keep compile times in check, we only specialize on veclen > 1 when - // the inner loop is relatively cheap (< 5 flops). - constexpr int vec_len_op = distance_op.expensive_inner_loop ? 1 : vec_len_aligned(); - - // Prevent double, vec_len=4 combination (this is not supported) - constexpr int vec_len = std::min(vec_len_op, static_cast(16 / sizeof(DataT))); - - using RowPolicy = typename raft::linalg::Policy4x4::Policy; - using ColPolicy = typename raft::linalg::Policy4x4::ColPolicy; - using Policy = typename std::conditional::type; - - auto wrapper = - make_pairwise_matrix_sm60_wrapper(distance_op, params, sm_compat_range); - - return wrapper; - }; - - // Dispatch_layout calls f with appropriate compile time constants based on - // the runtime values of params.is_row_major and vec_len. - return dispatch_layout(params.is_row_major, vec_len, f); -} - -template -void pairwise_matrix_sm60_dispatch(OpT distance_op, - pairwise_matrix_params params, - SM_compat_t sm_compat_range, - cudaStream_t stream) -{ - auto wrapper = pairwise_matrix_sm60_get_wrapper(distance_op, params, sm_compat_range); - - wrapper.launch(distance_op, params, stream); -} - -} // namespace cuvs::distance::detail diff --git a/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch_sm80.cuh b/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch_sm80.cuh deleted file mode 100644 index ccff73658..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_matrix/dispatch_sm80.cuh +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include // std::min -#include // cutlassDistanceKernel -#include // dispatch_layout - -namespace cuvs::distance::detail { - -template -void pairwise_matrix_sm80_dispatch(OpT distance_op, - pairwise_matrix_params params, - SM_compat_t sm_compat_range, - cudaStream_t stream) -{ - int vec_len = determine_vec_len(params); - - // f takes compile-time constants row_major and vec_len aligned and runs the - // corresponding cutlass launch code. - auto f = [&](auto row_major, auto vec_len_aligned) { - // row_major and vec_len are std::integral_constants of type bool and int - // respectively. - - // Prevent double, vec_len=4 combination (this is not supported) - constexpr int vec_len = std::min(vec_len_aligned(), static_cast(16 / sizeof(DataT))); - - using AccT = typename OpT::AccT; - cutlassDistanceKernel(params.x, - params.y, - params.x_norm, - params.y_norm, - params.m, - params.n, - params.k, - params.ldx, - params.ldy, - params.ld_out, - params.out, - params.fin_op, - distance_op, - stream); - }; - - // Dispatch_layout calls f with appropriate compile time constants based on - // the runtime values of params.is_row_major and vec_len. - dispatch_layout(params.is_row_major, vec_len, f); -} - -}; // namespace cuvs::distance::detail diff --git a/cpp/include/cuvs/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/cuvs/distance/detail/pairwise_matrix/kernel_sm60.cuh deleted file mode 100644 index baea4830e..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_matrix/kernel_sm60.cuh +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include // assert -#include // PairwiseDistances -#include // pairwise_matrix_params -#include // raft::void_op -#include // raft::util::arch::SM_compute_arch - -namespace cuvs::distance::detail { - -template -__launch_bounds__(Policy::Nthreads, 2) RAFT_KERNEL - pairwise_matrix_kernel(OpT distance_op, pairwise_matrix_params params) -{ - // Early exit to minimize the size of the kernel when it is not supposed to be compiled. - constexpr SM_compat_t sm_compat_range{}; - if constexpr (!sm_compat_range.contains(raft::util::arch::SM_compute_arch())) { - assert(false); - return; - } - - extern __shared__ char smem[]; - - // The epilog is already provided by distance_op. Do not provide additional - // epilogs. - auto epilog_op = raft::void_op(); - // No support for row_epilog_op. - auto row_epilog_op = raft::void_op(); - - // Always write output - constexpr bool write_out = true; - constexpr bool use_norms = distance_op.use_norms; - PairwiseDistances - obj(params.x, - params.y, - params.m, - params.n, - params.k, - params.ldx, - params.ldy, - params.ld_out, - params.x_norm, - params.y_norm, - params.out, - smem, - distance_op, - epilog_op, - params.fin_op, - row_epilog_op); - obj.run(); -} - -// The type of a pointer to the pairwise matrix kernel. The following template -// arguments are type-erased: -// -// - The kernel policy -// - row_major -// - SM_compat_t -template -using pairwise_matrix_kernel_t = void (*)(OpT, pairwise_matrix_params); - -// A wrapper for the pairwise matrix kernel launch. Includes kernel launch -// parameters. -template -struct pairwise_matrix_sm60_wrapper { - dim3 grid; - dim3 block; - int smem_size; - pairwise_matrix_kernel_t kernel_ptr; - - void launch(OpT distance_op, - pairwise_matrix_params params, - cudaStream_t stream) - { - kernel_ptr<<>>(distance_op, params); - RAFT_CUDA_TRY(cudaGetLastError()); - } -}; - -/** @brief: Create kernel launch wrapper for pairwise matrix kernel - * - * This can be used to type-erase the kernel execution policy, row_major, and SM - * compatibility range. - * - * @tparam Policy: Kernel execution policy - * @tparam row_major: Indicates whether input matrices are row major - * @tparam OpT: Type of distance operation - * @tparam IdxT: Index type - * @tparam DataT: Data type - * @tparam OutT: Output data type - * @tparam FinOpT: Final operation type - * @tparam SM_compat_t: Type of the SM architecture compatibility - * - * @param distance_op: Distance operation - * @param params: Parameters - * @param sm_compat_range: Which SM architectures to compile for. - */ -template -pairwise_matrix_sm60_wrapper make_pairwise_matrix_sm60_wrapper( - OpT distance_op, - pairwise_matrix_params params, - SM_compat_t sm_compat_range) -{ - dim3 block(Policy::Nthreads); - // Use ::template to disambiguate (See: - // https://en.cppreference.com/w/cpp/language/dependent_name) - int smem_size = OpT::template shared_mem_size(); - // Obtain function pointer to kernel - auto kernel = - pairwise_matrix_kernel; - dim3 grid = launchConfigGenerator(params.m, params.n, smem_size, kernel); - - return pairwise_matrix_sm60_wrapper{ - grid, block, smem_size, kernel}; -} - -}; // namespace cuvs::distance::detail diff --git a/cpp/include/cuvs/distance/detail/pairwise_matrix/params.cuh b/cpp/include/cuvs/distance/detail/pairwise_matrix/params.cuh deleted file mode 100644 index aa419aca0..000000000 --- a/cpp/include/cuvs/distance/detail/pairwise_matrix/params.cuh +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -namespace cuvs::distance::detail { - -template -struct pairwise_matrix_params { - IdxT m; - IdxT n; - IdxT k; - IdxT ldx; - IdxT ldy; - IdxT ld_out; - const DataT* x; - const DataT* y; - const DataT* x_norm; - const DataT* y_norm; - OutT* out; - FinOpT fin_op; - bool is_row_major; - - /// @brief: Flips the x and y input and corresponding sizes - void flip_x_and_y() - { - // Flip m, n; ldx, ldy; x, y; x_norm, y_norm. - std::swap(m, n); - std::swap(ldx, ldy); - std::swap(x, y); - std::swap(x_norm, y_norm); - } -}; - -} // namespace cuvs::distance::detail diff --git a/cpp/include/cuvs/distance/detail/predicated_tile_iterator_normvec.h b/cpp/include/cuvs/distance/detail/predicated_tile_iterator_normvec.h deleted file mode 100644 index 951f8a013..000000000 --- a/cpp/include/cuvs/distance/detail/predicated_tile_iterator_normvec.h +++ /dev/null @@ -1,585 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/*! \file - \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. - -This file contains a customized version of PredicatedTileIterator from CUTLASS 2.9.0 -(https://github.com/NVIDIA/cutlass/blob/v2.9.0/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h#L75) - -Changes: -- added `Layout_` template param -- Only the row index is used to load the data in load_with_byte_offset(). - This way the same normalization data is used across all columns in a row. - -*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { - -//////////////////////////////////////////////////////////////////////////////// - -namespace epilogue { -namespace threadblock { - -//////////////////////////////////////////////////////////////////////////////// - -/// Tile iterator used to load and store output tile from global memory in epilogue. -/// -/// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator -/// -template -class PredicatedTileIteratorNormVec { - public: - using ThreadMap = ThreadMap_; - using Shape = typename ThreadMap::Shape; - - using Element = Element_; - - using Layout = Layout_; - using TensorRef = TensorRef; - using ConstTensorRef = typename TensorRef::ConstTensorRef; - - using Index = typename Layout::Index; - using LongIndex = typename Layout::LongIndex; - using TensorCoord = MatrixCoord; - - static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; - static int const kThreads = ThreadMap::kThreads; - static int const kIterations = ThreadMap::Count::kTile; - - static_assert(ThreadMap::Iterations::kRow > 0, "ThreadMap::Iterations::kRow must be > 0"); - static_assert(ThreadMap::Iterations::kGroup > 0, "ThreadMap::Iterations::kGroup must be > 0"); - static_assert(ThreadMap::Iterations::kCluster > 0, "ThreadMap::Iterations::kCluster must be > 0"); - static_assert(ThreadMap::Iterations::kColumn > 0, "ThreadMap::Iterations::kColumn must be > 0"); - - /// Fragment object - using Fragment = Array; - - /// Memory access size - using AccessType = AlignedArray; - - // - // Parameters struct - // - - /// Uses a non-template class - struct Params : PredicatedTileIteratorParams { - using Base = PredicatedTileIteratorParams; - - CUTLASS_HOST_DEVICE - Params() {} - - CUTLASS_HOST_DEVICE - Params(Layout const& layout) - : PredicatedTileIteratorParams( - layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess, - make_OutputTileThreadMapDesc()) - { - } - - CUTLASS_HOST_DEVICE - Params(Base const& base) : Base(base) {} - }; - - /// Mask object - struct Mask { - static int const kCount = ThreadMap::Iterations::kColumn; - - /// Predicate state - bool predicates[kCount]; - - // - // Mask - // - CUTLASS_HOST_DEVICE - Mask() { enable(); } - - ///< Efficiently disables all accesses guarded by mask - CUTLASS_HOST_DEVICE void clear() - { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kCount; ++i) { - predicates[i] = false; - } - } - - ///< CUTLASS_HOST_DEVICE enables all accesses guarded by mask - CUTLASS_DEVICE void enable() - { - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < kCount; ++i) { - predicates[i] = true; - } - } - }; - - private: - // - // Data members - // - - /// Parameters structure containing reference and precomputed state. - PredicatedTileIteratorParams params_; - - /// Byte-level pointer - uint8_t* byte_pointer_; - - /// Array of boolean values to contain steady-state predicates - Mask mask_; - - /// Extent of the matrix tile in rows - Index extent_row_; - - /// Extent of the matrix tile in rows - Index extent_column_; - - /// A thread's starting row position (assuming steady-state predicates have been computed) - Index thread_start_row_; - - /// A thread's starting column - Index thread_start_column_; - - /// Internal state counter - int state_[3]; - - /// Scatter indices - int const* indices_; - - // - // Static asserts about internal strides - // - - static_assert(sizeof(extent_row_) == 4, "Expected 32b extents"); - static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents"); - static_assert(sizeof(PredicatedTileIteratorParams::stride) == 8, "Expected 64b strides"); - - private: - // - // Methods - // - - public: - // - // Methods - // - - /// Constructor - CUTLASS_DEVICE - PredicatedTileIteratorNormVec(PredicatedTileIteratorParams const& params, - Element* pointer, - TensorCoord extent, - int thread_idx, - TensorCoord threadblock_offset = TensorCoord(), - int const* indices = nullptr) - : params_(params), indices_(indices) - { - TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx) + threadblock_offset; - - extent_row_ = extent.row(); - extent_column_ = extent.column(); - - thread_start_row_ = thread_offset.row(); - thread_start_column_ = thread_offset.column(); - - // Initialize predicates - CUTLASS_PRAGMA_UNROLL - for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) { - mask_.predicates[c] = - ((thread_offset.column() + ThreadMap::Delta::kColumn * c) < extent.column()); - } - - // Null pointer performs no accesses - if (!pointer) { mask_.clear(); } - - if (ScatterD && !indices) { mask_.clear(); } - - // Initialize pointer - byte_pointer_ = reinterpret_cast(pointer) + - LongIndex(thread_offset.row()) * LongIndex(params_.stride); - - if (ScatterD) { - byte_pointer_ = reinterpret_cast(pointer) + - LongIndex(thread_offset.column()) * sizeof(AccessType) / kElementsPerAccess; - } - - // Initialize internal state counter - state_[0] = state_[1] = state_[2] = 0; - } - - /// Adds a pointer offset in units of Element - CUTLASS_HOST_DEVICE - void add_pointer_offset(LongIndex pointer_offset) - { - byte_pointer_ += pointer_offset * sizeof_bits::value / 8; - } - - /// Loads a fragment from memory - CUTLASS_DEVICE - void load_with_byte_offset(Fragment& frag, int64_t byte_offset) const - { - uint8_t* byte_pointer = byte_pointer_; - AccessType* frag_ptr = reinterpret_cast(&frag); - - CUTLASS_PRAGMA_UNROLL - for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { - CUTLASS_PRAGMA_UNROLL - for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { - CUTLASS_PRAGMA_UNROLL - for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = - (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - - int row_offset = row * ThreadMap::Delta::kRow + group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; - - bool row_guard = ((row_offset + thread_start_row_) < extent_row_); - - AccessType* memory_pointer = reinterpret_cast(byte_pointer + byte_offset); - - if (ScatterD && row_guard) { - assert(indices_); - - memory_pointer = reinterpret_cast( - byte_pointer + byte_offset + - LongIndex(indices_[row_offset + thread_start_row_]) * LongIndex(params_.stride)); - } - - CUTLASS_PRAGMA_UNROLL - for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { - bool guard = row_guard && mask_.predicates[column]; - if (column == 0) { - cutlass::arch::global_load( - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], - (void*)&memory_pointer[0], - guard); - } else { - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column] = - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn]; - } - } - - if (row + 1 < ThreadMap::Iterations::kRow) { - if (!ScatterD) { byte_pointer += params_.increment_row; } - } - } - - if (group + 1 < ThreadMap::Iterations::kGroup) { byte_pointer += params_.increment_group; } - } - - if (cluster + 1 < ThreadMap::Iterations::kCluster) { - byte_pointer += params_.increment_cluster; - } - } - } - - /// Loads a fragment from memory - CUTLASS_DEVICE - void load(Fragment& frag) const { load_with_byte_offset(frag, 0); } - - /// Stores a fragment to memory - CUTLASS_DEVICE - void store_with_byte_offset(Fragment const& frag, int64_t byte_offset) const - { - uint8_t* byte_pointer = byte_pointer_; - AccessType const* frag_ptr = reinterpret_cast(&frag); - - CUTLASS_PRAGMA_UNROLL - for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { - CUTLASS_PRAGMA_UNROLL - for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { - CUTLASS_PRAGMA_UNROLL - for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = - (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - - int row_offset = row * ThreadMap::Delta::kRow + group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; - - bool row_guard = ((row_offset + thread_start_row_) < extent_row_); - - AccessType* memory_pointer = reinterpret_cast(byte_pointer + byte_offset); - - if (ScatterD && row_guard) { - assert(indices_); - - memory_pointer = reinterpret_cast( - byte_pointer + byte_offset + - LongIndex(indices_[row_offset + thread_start_row_]) * LongIndex(params_.stride)); - } - - CUTLASS_PRAGMA_UNROLL - for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { - bool guard = row_guard && mask_.predicates[column]; - - if (UseCUDAStore) { - if (guard) { - memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess] = - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column]; - } - } else { - cutlass::arch::global_store( - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], - (void*)&memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess], - guard); - } - } - - if (row + 1 < ThreadMap::Iterations::kRow) { - if (!ScatterD) { byte_pointer += params_.increment_row; } - } - } - - if (group + 1 < ThreadMap::Iterations::kGroup) { byte_pointer += params_.increment_group; } - } - - if (cluster + 1 < ThreadMap::Iterations::kCluster) { - byte_pointer += params_.increment_cluster; - } - } - } - - /// Stores a fragment to memory - CUTLASS_DEVICE - void store(Fragment const& frag) const { store_with_byte_offset(frag, 0); } - - /// Loads a fragment from memory - CUTLASS_DEVICE - void downsample_load_with_byte_offset(Fragment& frag, - int64_t byte_offset, - int convolution_P, - int convolution_Q, - int add_P, - int add_Q, - int problem_N) const - { - uint8_t* byte_pointer = byte_pointer_; - AccessType* frag_ptr = reinterpret_cast(&frag); - - CUTLASS_PRAGMA_UNROLL - for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { - CUTLASS_PRAGMA_UNROLL - for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { - CUTLASS_PRAGMA_UNROLL - for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = - (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - - int row_offset = row * ThreadMap::Delta::kRow + group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; - - bool row_guard = ((row_offset + thread_start_row_) < extent_row_); - - int output_row = row_offset + thread_start_row_; - int output_N = output_row / (convolution_P * convolution_Q); - int output_PQ = output_row % (convolution_P * convolution_Q); - int output_P = output_PQ / convolution_Q; - int output_Q = output_PQ % convolution_Q; - - int input_row = output_N * 2 * convolution_P * 2 * convolution_Q + - (2 * output_P + add_P) * 2 * convolution_Q + 2 * output_Q + add_Q; - - int64_t byte_offset = (input_row - output_row) * problem_N * sizeof(float); - - AccessType* memory_pointer = reinterpret_cast(byte_pointer + byte_offset); - - CUTLASS_PRAGMA_UNROLL - for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { - bool guard = row_guard && mask_.predicates[column]; - - cutlass::arch::global_load( - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], - (void*)&memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess], - guard); - } - - if (row + 1 < ThreadMap::Iterations::kRow) { byte_pointer += params_.increment_row; } - } - - if (group + 1 < ThreadMap::Iterations::kGroup) { byte_pointer += params_.increment_group; } - } - - if (cluster + 1 < ThreadMap::Iterations::kCluster) { - byte_pointer += params_.increment_cluster; - } - } - } - - /// Loads a fragment from memory - CUTLASS_DEVICE - void upsample_load_with_byte_offset(Fragment& frag, - int64_t byte_offset, - int convolution_P, - int convolution_Q, - int add_P, - int add_Q, - int problem_N) const - { - uint8_t* byte_pointer = byte_pointer_; - AccessType* frag_ptr = reinterpret_cast(&frag); - - CUTLASS_PRAGMA_UNROLL - for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { - CUTLASS_PRAGMA_UNROLL - for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { - CUTLASS_PRAGMA_UNROLL - for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { - int frag_row_idx = - (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - - int row_offset = row * ThreadMap::Delta::kRow + group * ThreadMap::Delta::kGroup + - cluster * ThreadMap::Delta::kCluster; - - bool row_guard = ((row_offset + thread_start_row_) < extent_row_); - - int output_row = row_offset + thread_start_row_; - int output_N = output_row / (convolution_P * convolution_Q); - int output_PQ = output_row % (convolution_P * convolution_Q); - int output_P = output_PQ / convolution_Q; - int output_Q = output_PQ % convolution_Q; - int row_add_P = add_P; - int row_add_Q = add_Q; - if (output_P > convolution_P - 2) row_add_P = 0; - if (output_Q > convolution_Q - 2) row_add_Q = 0; - - int input_row = output_N * (convolution_P / 2) * (convolution_Q / 2) + - ((output_P + row_add_P) / 2) * (convolution_Q / 2) + - (output_Q + row_add_Q) / 2; - - int64_t byte_offset = (input_row - output_row) * problem_N * sizeof(float); - - AccessType* memory_pointer = reinterpret_cast(byte_pointer + byte_offset); - - CUTLASS_PRAGMA_UNROLL - for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { - bool guard = row_guard && mask_.predicates[column]; - - cutlass::arch::global_load( - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column], - (void*)&memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess], - guard); - } - - if (row + 1 < ThreadMap::Iterations::kRow) { byte_pointer += params_.increment_row; } - } - - if (group + 1 < ThreadMap::Iterations::kGroup) { byte_pointer += params_.increment_group; } - } - - if (cluster + 1 < ThreadMap::Iterations::kCluster) { - byte_pointer += params_.increment_cluster; - } - } - } - - CUTLASS_DEVICE - MatrixCoord thread_start() const { return MatrixCoord(thread_start_row_, thread_start_column_); } - - /// Need to get the thread start row from the tile iterator - CUTLASS_DEVICE - int32_t thread_start_row() const { return thread_start_row_; } - - /// Need to get the thread start row from the tile iterator - CUTLASS_DEVICE - int32_t thread_start_column() const { return thread_start_column_; } - - /// Extent of the matrix in rows - CUTLASS_DEVICE - Index extent_row() const { return extent_row_; } - - /// Extent of the matrix in columns - CUTLASS_DEVICE - Index extent_column() const { return extent_column_; } - - /// Advances to the next position to load or store - CUTLASS_HOST_DEVICE - PredicatedTileIteratorNormVec& operator++() - { - ++state_[0]; - - if (!ScatterD) { byte_pointer_ += params_.advance_row; } - - thread_start_row_ += ThreadMap::Shape::kRow; - - if (state_[0] == ThreadMap::Count::kRow) { - state_[0] = 0; - ++state_[1]; - byte_pointer_ += params_.advance_group; - - thread_start_row_ += - (ThreadMap::Shape::kGroup - 1) * ThreadMap::Shape::kRow * ThreadMap::Count::kRow; - - if (state_[1] == ThreadMap::Count::kGroup) { - state_[1] = 0; - ++state_[2]; - byte_pointer_ += params_.advance_cluster; - - thread_start_row_ += ThreadMap::Count::kGroup * ThreadMap::Shape::kGroup * - ThreadMap::Count::kRow * ThreadMap::Shape::kRow; - - if (state_[2] == ThreadMap::Count::kCluster) { - state_[2] = 0; - byte_pointer_ += params_.advance_tile; - } - } - } - - return *this; - } - - ///< Efficiently disables all accesses guarded by mask - CUTLASS_DEVICE void clear_mask() { mask_.clear(); } - - ///< Efficiently enables all accesses guarded by mask - CUTLASS_DEVICE void enable_mask() { mask_.enable(); } - - ///< Sets the mask - CUTLASS_DEVICE void get_mask(Mask& mask) const { mask = mask_; } - - ///< Sets the mask - CUTLASS_DEVICE void set_mask(Mask const& mask) { mask_ = mask; } -}; - -/////////////////////////////////////////////////////////////////////////////// - -} // namespace threadblock -} // namespace epilogue -} // namespace cutlass - -//////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/include/cuvs/distance/distance-ext.cuh b/cpp/include/cuvs/distance/distance-ext.cuh deleted file mode 100644 index fdbe6a971..000000000 --- a/cpp/include/cuvs/distance/distance-ext.cuh +++ /dev/null @@ -1,1065 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include // rbf_fin_op -#include // cuvs::distance::DistanceType -#include // raft::device_matrix_view -#include // raft::identity_op -#include // raft::resources -#include // RAFT_EXPLICIT -#include // rmm::device_uvector - -#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY - -namespace cuvs { -namespace distance { - -template -void distance(raft::resources const& handle, - const DataT* x, - const DataT* y, - OutT* dist, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - bool isRowMajor = true, - DataT metric_arg = 2.0f) RAFT_EXPLICIT; - -template -void distance(raft::resources const& handle, - const DataT* x, - const DataT* y, - OutT* dist, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - size_t worksize, - bool isRowMajor = true, - DataT metric_arg = 2.0f) RAFT_EXPLICIT; - -template -size_t getWorkspaceSize(const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) RAFT_EXPLICIT; - -template -size_t getWorkspaceSize(raft::device_matrix_view const& x, - raft::device_matrix_view const& y) RAFT_EXPLICIT; - -template -void distance(raft::resources const& handle, - const DataT* x, - const DataT* y, - OutT* dist, - IdxT m, - IdxT n, - IdxT k, - bool isRowMajor = true, - DataT metric_arg = 2.0f) RAFT_EXPLICIT; - -template -void pairwise_distance(raft::resources const& handle, - const Type* x, - const Type* y, - Type* dist, - IdxT m, - IdxT n, - IdxT k, - rmm::device_uvector& workspace, - cuvs::distance::DistanceType metric, - bool isRowMajor = true, - Type metric_arg = 2.0f) RAFT_EXPLICIT; - -template -void pairwise_distance(raft::resources const& handle, - const Type* x, - const Type* y, - Type* dist, - IdxT m, - IdxT n, - IdxT k, - cuvs::distance::DistanceType metric, - bool isRowMajor = true, - Type metric_arg = 2.0f) RAFT_EXPLICIT; - -template -void distance(raft::resources const& handle, - raft::device_matrix_view const x, - raft::device_matrix_view const y, - raft::device_matrix_view dist, - DataT metric_arg = 2.0f) RAFT_EXPLICIT; - -template -void pairwise_distance(raft::resources const& handle, - device_matrix_view const x, - device_matrix_view const y, - device_matrix_view dist, - cuvs::distance::DistanceType metric, - Type metric_arg = 2.0f) RAFT_EXPLICIT; - -}; // namespace distance -}; // namespace cuvs - -#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY - -/* - * Hierarchy of instantiations: - * - * This file defines the extern template instantiations for the public API of - * cuvs::distance. To improve compile times, the extern template instantiation - * of the distance kernels is handled in - * distance/detail/pairwise_matrix/dispatch-ext.cuh. - * - * After adding an instance here, make sure to also add the instance to - * dispatch-ext.cuh and the corresponding .cu files. - */ - -#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, FinalLambda, IdxT) \ - extern template void cuvs::distance::distance( \ - raft::resources const& handle, \ - const DataT* x, \ - const DataT* y, \ - OutT* dist, \ - IdxT m, \ - IdxT n, \ - IdxT k, \ - void* workspace, \ - size_t worksize, \ - FinalLambda fin_op, \ - bool isRowMajor, \ - DataT metric_arg) - -// The following two instances are used in test/distance/gram.cu. Note the use -// of int64_t for the index type. -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L2Unexpanded, - float, - float, - float, - cuvs::distance::kernels::detail::rbf_fin_op, - int64_t); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L2Unexpanded, - double, - double, - double, - cuvs::distance::kernels::detail::rbf_fin_op, - int64_t); - -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Canberra, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Canberra, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::CorrelationExpanded, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::CorrelationExpanded, - double, - double, - double, - raft::identity_op, - int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::CosineExpanded, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::CosineExpanded, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::HammingUnexpanded, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::HammingUnexpanded, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::HellingerExpanded, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::HellingerExpanded, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::InnerProduct, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::InnerProduct, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::JensenShannon, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::JensenShannon, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::KLDivergence, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::KLDivergence, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L1, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L1, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Expanded, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Expanded, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2SqrtExpanded, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2SqrtExpanded, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2SqrtUnexpanded, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2SqrtUnexpanded, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Unexpanded, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Unexpanded, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Linf, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Linf, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::LpUnexpanded, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::LpUnexpanded, double, double, double, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::RusselRaoExpanded, float, float, float, raft::identity_op, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::RusselRaoExpanded, double, double, double, raft::identity_op, int); - -#undef instantiate_raft_distance_distance - -// Same, but without raft::identity_op -#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, IdxT) \ - extern template void cuvs::distance::distance( \ - raft::resources const& handle, \ - const DataT* x, \ - const DataT* y, \ - OutT* dist, \ - IdxT m, \ - IdxT n, \ - IdxT k, \ - void* workspace, \ - size_t worksize, \ - bool isRowMajor, \ - DataT metric_arg) - -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Canberra, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Canberra, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::CorrelationExpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::CorrelationExpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::CosineExpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::CosineExpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::HammingUnexpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::HammingUnexpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::HellingerExpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::HellingerExpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::InnerProduct, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::InnerProduct, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::JensenShannon, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::JensenShannon, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::KLDivergence, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::KLDivergence, double, double, double, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L1, float, float, float, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L1, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Expanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Expanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2SqrtExpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2SqrtExpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Unexpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Unexpanded, double, double, double, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::Linf, float, float, float, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::Linf, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::LpUnexpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::LpUnexpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::RusselRaoExpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::RusselRaoExpanded, double, double, double, int); - -#undef instantiate_raft_distance_distance - -// Same, but without workspace -#define instantiate_raft_distance_distance(DT, DataT, AccT, OutT, IdxT) \ - extern template void cuvs::distance::distance( \ - raft::resources const& handle, \ - const DataT* x, \ - const DataT* y, \ - OutT* dist, \ - IdxT m, \ - IdxT n, \ - IdxT k, \ - bool isRowMajor, \ - DataT metric_arg) - -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Canberra, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Canberra, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::CorrelationExpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::CorrelationExpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::CosineExpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::CosineExpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::HammingUnexpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::HammingUnexpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::HellingerExpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::HellingerExpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::InnerProduct, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::InnerProduct, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::JensenShannon, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::JensenShannon, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::KLDivergence, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::KLDivergence, double, double, double, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L1, float, float, float, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L1, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Expanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Expanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2SqrtExpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2SqrtExpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Unexpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Unexpanded, double, double, double, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::Linf, float, float, float, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::Linf, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::LpUnexpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::LpUnexpanded, double, double, double, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::RusselRaoExpanded, float, float, float, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::RusselRaoExpanded, double, double, double, int); - -#undef instantiate_raft_distance_distance - -#define instantiate_raft_distance_getWorkspaceSize(DistT, DataT, AccT, OutT, IdxT) \ - extern template size_t cuvs::distance::getWorkspaceSize( \ - const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) - -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::Canberra, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::Canberra, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::CorrelationExpanded, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::CorrelationExpanded, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::CosineExpanded, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::CosineExpanded, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::HammingUnexpanded, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::HammingUnexpanded, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::HellingerExpanded, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::HellingerExpanded, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::InnerProduct, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::InnerProduct, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::JensenShannon, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::JensenShannon, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::KLDivergence, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::KLDivergence, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L1, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L1, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2Expanded, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2Expanded, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2SqrtExpanded, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2SqrtExpanded, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2SqrtUnexpanded, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2SqrtUnexpanded, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2Unexpanded, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2Unexpanded, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::Linf, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::Linf, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::LpUnexpanded, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::LpUnexpanded, double, double, double, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::RusselRaoExpanded, float, float, float, int); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::RusselRaoExpanded, double, double, double, int); - -#undef instantiate_raft_distance_getWorkspaceSize - -#define instantiate_raft_distance_getWorkspaceSize(DistT, DataT, AccT, OutT, IdxT, layout) \ - extern template size_t cuvs::distance::getWorkspaceSize( \ - raft::device_matrix_view const& x, \ - raft::device_matrix_view const& y) - -// We could consider not taking template parameters for this function. The -// number of instantiations seems a bit excessive.. -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::Canberra, float, float, float, int, raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::Canberra, double, double, double, int, raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::Canberra, float, float, float, int, raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::Canberra, double, double, double, int, raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::CorrelationExpanded, - float, - float, - float, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::CorrelationExpanded, - double, - double, - double, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::CorrelationExpanded, - float, - float, - float, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::CorrelationExpanded, - double, - double, - double, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::CosineExpanded, - float, - float, - float, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::CosineExpanded, - double, - double, - double, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::CosineExpanded, - float, - float, - float, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::CosineExpanded, - double, - double, - double, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::HammingUnexpanded, - float, - float, - float, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::HammingUnexpanded, - double, - double, - double, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::HammingUnexpanded, - float, - float, - float, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::HammingUnexpanded, - double, - double, - double, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::HellingerExpanded, - float, - float, - float, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::HellingerExpanded, - double, - double, - double, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::HellingerExpanded, - float, - float, - float, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::HellingerExpanded, - double, - double, - double, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::InnerProduct, float, float, float, int, raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::InnerProduct, - double, - double, - double, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::InnerProduct, float, float, float, int, raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::InnerProduct, - double, - double, - double, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::JensenShannon, float, float, float, int, raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::JensenShannon, - double, - double, - double, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::JensenShannon, float, float, float, int, raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::JensenShannon, - double, - double, - double, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::KLDivergence, float, float, float, int, raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::KLDivergence, - double, - double, - double, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::KLDivergence, float, float, float, int, raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::KLDivergence, - double, - double, - double, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L1, float, float, float, int, raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L1, double, double, double, int, raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L1, float, float, float, int, raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L1, double, double, double, int, raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2Expanded, float, float, float, int, raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2Expanded, double, double, double, int, raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2Expanded, float, float, float, int, raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2Expanded, double, double, double, int, raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::L2SqrtExpanded, - float, - float, - float, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::L2SqrtExpanded, - double, - double, - double, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::L2SqrtExpanded, - float, - float, - float, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::L2SqrtExpanded, - double, - double, - double, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::L2SqrtUnexpanded, - float, - float, - float, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::L2SqrtUnexpanded, - double, - double, - double, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::L2SqrtUnexpanded, - float, - float, - float, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::L2SqrtUnexpanded, - double, - double, - double, - int, - raft::layout_f_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2Unexpanded, float, float, float, int, raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize(cuvs::distance::DistanceType::L2Unexpanded, - double, - double, - double, - int, - raft::layout_c_contiguous); -instantiate_raft_distance_getWorkspaceSize( - cuvs::distance::DistanceType::L2Unexpanded, float, float, float, int, raft::layout_f_contiguous); - -#undef instantiate_raft_distance_getWorkspaceSize - -#define instantiate_raft_distance_pairwise_distance(DataT, IdxT) \ - extern template void cuvs::distance::pairwise_distance(raft::resources const& handle, \ - const DataT* x, \ - const DataT* y, \ - DataT* dist, \ - IdxT m, \ - IdxT n, \ - IdxT k, \ - rmm::device_uvector& workspace, \ - cuvs::distance::DistanceType metric, \ - bool isRowMajor, \ - DataT metric_arg) - -instantiate_raft_distance_pairwise_distance(float, int); -instantiate_raft_distance_pairwise_distance(double, int); - -#undef instantiate_raft_distance_pairwise_distance - -// Same, but without workspace -#define instantiate_raft_distance_pairwise_distance(DataT, IdxT) \ - extern template void cuvs::distance::pairwise_distance(raft::resources const& handle, \ - const DataT* x, \ - const DataT* y, \ - DataT* dist, \ - IdxT m, \ - IdxT n, \ - IdxT k, \ - cuvs::distance::DistanceType metric, \ - bool isRowMajor, \ - DataT metric_arg) - -instantiate_raft_distance_pairwise_distance(float, int); -instantiate_raft_distance_pairwise_distance(double, int); - -#undef instantiate_raft_distance_pairwise_distance - -// Version with mdspan -#define instantiate_raft_distance_distance(DistT, DataT, AccT, OutT, layout, IdxT) \ - extern template void cuvs::distance::distance( \ - raft::resources const& handle, \ - raft::device_matrix_view const x, \ - raft::device_matrix_view const y, \ - raft::device_matrix_view dist, \ - DataT metric_arg) - -// Again, we might want to consider reigning in the number of instantiations... -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Canberra, float, float, float, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Canberra, double, double, double, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Canberra, float, float, float, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Canberra, double, double, double, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::CorrelationExpanded, - float, - float, - float, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::CorrelationExpanded, - double, - double, - double, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::CorrelationExpanded, - float, - float, - float, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::CorrelationExpanded, - double, - double, - double, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::CosineExpanded, - float, - float, - float, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::CosineExpanded, - double, - double, - double, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::CosineExpanded, - float, - float, - float, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::CosineExpanded, - double, - double, - double, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::HammingUnexpanded, - float, - float, - float, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::HammingUnexpanded, - double, - double, - double, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::HammingUnexpanded, - float, - float, - float, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::HammingUnexpanded, - double, - double, - double, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::HellingerExpanded, - float, - float, - float, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::HellingerExpanded, - double, - double, - double, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::HellingerExpanded, - float, - float, - float, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::HellingerExpanded, - double, - double, - double, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::InnerProduct, float, float, float, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::InnerProduct, - double, - double, - double, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::InnerProduct, float, float, float, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::InnerProduct, - double, - double, - double, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::JensenShannon, float, float, float, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::JensenShannon, - double, - double, - double, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::JensenShannon, float, float, float, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::JensenShannon, - double, - double, - double, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::KLDivergence, float, float, float, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::KLDivergence, - double, - double, - double, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::KLDivergence, float, float, float, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::KLDivergence, - double, - double, - double, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L1, float, float, float, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L1, double, double, double, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L1, float, float, float, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L1, double, double, double, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Expanded, float, float, float, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Expanded, double, double, double, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Expanded, float, float, float, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Expanded, double, double, double, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L2SqrtExpanded, - float, - float, - float, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L2SqrtExpanded, - double, - double, - double, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L2SqrtExpanded, - float, - float, - float, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L2SqrtExpanded, - double, - double, - double, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L2SqrtUnexpanded, - float, - float, - float, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L2SqrtUnexpanded, - double, - double, - double, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L2SqrtUnexpanded, - float, - float, - float, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L2SqrtUnexpanded, - double, - double, - double, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Unexpanded, float, float, float, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L2Unexpanded, - double, - double, - double, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::L2Unexpanded, float, float, float, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::L2Unexpanded, - double, - double, - double, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Linf, float, float, float, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Linf, double, double, double, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Linf, float, float, float, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::Linf, double, double, double, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::LpUnexpanded, float, float, float, raft::layout_c_contiguous, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::LpUnexpanded, - double, - double, - double, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance( - cuvs::distance::DistanceType::LpUnexpanded, float, float, float, raft::layout_f_contiguous, int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::LpUnexpanded, - double, - double, - double, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::RusselRaoExpanded, - float, - float, - float, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::RusselRaoExpanded, - double, - double, - double, - raft::layout_c_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::RusselRaoExpanded, - float, - float, - float, - raft::layout_f_contiguous, - int); -instantiate_raft_distance_distance(cuvs::distance::DistanceType::RusselRaoExpanded, - double, - double, - double, - raft::layout_f_contiguous, - int); - -#undef instantiate_raft_distance_distance - -#define instantiate_raft_distance_pairwise_distance(DataT, layout, IdxT) \ - extern template void cuvs::distance::pairwise_distance( \ - raft::resources const& handle, \ - raft::device_matrix_view const x, \ - raft::device_matrix_view const y, \ - raft::device_matrix_view dist, \ - cuvs::distance::DistanceType metric, \ - DataT metric_arg) - -instantiate_raft_distance_pairwise_distance(float, raft::layout_c_contiguous, int); -instantiate_raft_distance_pairwise_distance(float, raft::layout_f_contiguous, int); -instantiate_raft_distance_pairwise_distance(double, raft::layout_c_contiguous, int); -instantiate_raft_distance_pairwise_distance(double, raft::layout_f_contiguous, int); - -#undef instantiate_raft_distance_pairwise_distance diff --git a/cpp/include/cuvs/distance/distance-inl.cuh b/cpp/include/cuvs/distance/distance-inl.cuh deleted file mode 100644 index 0abdeacff..000000000 --- a/cpp/include/cuvs/distance/distance-inl.cuh +++ /dev/null @@ -1,477 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include - -#include - -namespace cuvs { -namespace distance { - -/** - * @defgroup pairwise_distance pointer-based pairwise distance prims - * @{ - */ - -/** - * @brief Evaluate pairwise distances with the user epilogue lamba allowed - * @tparam DistanceType which distance to evaluate - * @tparam DataT input argument type - * @tparam AccT accumulation type - * @tparam OutT output type - * @tparam FinalLambda user-defined epilogue lamba - * @tparam IdxT Index type - * @param handle raft handle for managing expensive resources - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param fin_op the final gemm epilogue lambda - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - * - * @note fin_op: This is a device lambda which is supposed to operate upon the - * input which is AccT and returns the output in OutT. It's signature is - * as follows:

OutT fin_op(AccT in, int g_idx);
. If one needs - * any other parameters, feel free to pass them via closure. - */ -template -void distance(raft::resources const& handle, - const DataT* x, - const DataT* y, - OutT* dist, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - bool isRowMajor = true, - DataT metric_arg = 2.0f) -{ - detail::distance( - handle, x, y, dist, m, n, k, workspace, worksize, fin_op, isRowMajor, metric_arg); -} - -/** - * @brief Evaluate pairwise distances for the simple use case - * @tparam DistanceType which distance to evaluate - * @tparam DataT input argument type - * @tparam AccT accumulation type - * @tparam OutT output type - * @tparam IdxT Index type - * @param handle raft handle for managing expensive resources - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void distance(raft::resources const& handle, - const DataT* x, - const DataT* y, - OutT* dist, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - size_t worksize, - bool isRowMajor = true, - DataT metric_arg = 2.0f) -{ - detail::distance( - handle, x, y, dist, m, n, k, workspace, worksize, isRowMajor, metric_arg); -} - -/** - * @brief Return the exact workspace size to compute the distance - * @tparam DistanceType which distance to evaluate - * @tparam DataT input argument type - * @tparam AccT accumulation type - * @tparam OutT output type - * @tparam IdxT Index type - * @param x first set of points - * @param y second set of points - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * - * @note If the specified DistT doesn't need the workspace at all, it - * returns 0. - */ -template -size_t getWorkspaceSize(const DataT* x, const DataT* y, IdxT m, IdxT n, IdxT k) -{ - return detail::getWorkspaceSize(x, y, m, n, k); -} - -/** - * @brief Return the exact workspace size to compute the distance - * @tparam DistanceType which distance to evaluate - * @tparam DataT input argument type - * @tparam AccT accumulation type - * @tparam OutT output type - * @tparam IdxT Index type - * @param x first set of points (size m*k) - * @param y second set of points (size n*k) - * @return number of bytes needed in workspace - * - * @note If the specified DistT doesn't need the workspace at all, it - * returns 0. - */ -template -size_t getWorkspaceSize(raft::device_matrix_view const& x, - raft::device_matrix_view const& y) -{ - RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal."); - - return getWorkspaceSize( - x.data_handle(), y.data_handle(), x.extent(0), y.extent(0), x.extent(1)); -} - -/** - * @brief Evaluate pairwise distances for the simple use case - * @tparam DistanceType which distance to evaluate - * @tparam DataT input argument type - * @tparam AccT accumulation type - * @tparam OutT output type - * @tparam IdxT Index type - * @param handle raft handle for managing expensive resources - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void distance(raft::resources const& handle, - const DataT* x, - const DataT* y, - OutT* dist, - IdxT m, - IdxT n, - IdxT k, - bool isRowMajor = true, - DataT metric_arg = 2.0f) -{ - auto stream = raft::resource::get_cuda_stream(handle); - rmm::device_uvector workspace(0, stream); - auto worksize = getWorkspaceSize(x, y, m, n, k); - workspace.resize(worksize, stream); - detail::distance( - handle, x, y, dist, m, n, k, workspace.data(), worksize, isRowMajor, metric_arg); -} - -/** - * @brief Convenience wrapper around 'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam IdxT indexing type - * @param handle raft handle for managing expensive resources - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace buffer which can get resized as per the - * needed workspace size - * @param metric distance metric - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void pairwise_distance(raft::resources const& handle, - const Type* x, - const Type* y, - Type* dist, - IdxT m, - IdxT n, - IdxT k, - rmm::device_uvector& workspace, - cuvs::distance::DistanceType metric, - bool isRowMajor = true, - Type metric_arg = 2.0f) -{ - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - - auto dispatch = [&](auto distance_type) { - auto worksize = getWorkspaceSize(x, y, m, n, k); - workspace.resize(worksize, stream); - detail::distance( - handle, x, y, dist, m, n, k, workspace.data(), worksize, isRowMajor, metric_arg); - }; - - switch (metric) { - case DistanceType::Canberra: - dispatch(std::integral_constant{}); - break; - case DistanceType::CorrelationExpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::CosineExpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::HammingUnexpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::HellingerExpanded: - dispatch(std::integral_constant{}); - break; - case cuvs::distance::DistanceType::InnerProduct: - dispatch(std::integral_constant{}); - break; - case DistanceType::JensenShannon: - dispatch(std::integral_constant{}); - break; - case DistanceType::KLDivergence: - dispatch(std::integral_constant{}); - break; - case DistanceType::L1: - dispatch(std::integral_constant{}); - break; - case DistanceType::L2Expanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::L2SqrtExpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::L2SqrtUnexpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::L2Unexpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::Linf: - dispatch(std::integral_constant{}); - break; - case DistanceType::LpUnexpanded: - dispatch(std::integral_constant{}); - break; - case DistanceType::RusselRaoExpanded: - dispatch(std::integral_constant{}); - break; - default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); - }; -} - -/** - * @brief Convenience wrapper around 'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam IdxT indexing type - * @param handle raft handle for managing expensive resources - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param metric distance metric - * @param isRowMajor whether the matrices are row-major or col-major - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void pairwise_distance(raft::resources const& handle, - const Type* x, - const Type* y, - Type* dist, - IdxT m, - IdxT n, - IdxT k, - cuvs::distance::DistanceType metric, - bool isRowMajor = true, - Type metric_arg = 2.0f) -{ - auto stream = raft::resource::get_cuda_stream(handle); - rmm::device_uvector workspace(0, stream); - pairwise_distance( - handle, x, y, dist, m, n, k, workspace, metric, isRowMajor, metric_arg); -} - -/** @} */ - -/** - * \defgroup distance_mdspan Pairwise distance functions - * @{ - */ - -/** - * @brief Evaluate pairwise distances for the simple use case. - * - * Note: Only contiguous row- or column-major layouts supported currently. - * - * Usage example: - * @code{.cpp} - * #include - * #include - * #include - * #include - * - * raft::raft::resources handle; - * int n_samples = 5000; - * int n_features = 50; - * - * auto input = raft::make_device_matrix(handle, n_samples, n_features); - * auto labels = raft::make_device_vector(handle, n_samples); - * auto output = raft::make_device_matrix(handle, n_samples, n_samples); - * - * raft::random::make_blobs(handle, input.view(), labels.view()); - * auto metric = cuvs::distance::DistanceType::L2SqrtExpanded; - * cuvs::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric); - * @endcode - * - * @tparam DistanceType which distance to evaluate - * @tparam DataT input argument type - * @tparam AccT accumulation type - * @tparam OutT output type - * @tparam IdxT Index type - * @param handle raft handle for managing expensive resources - * @param x first set of points (size n*k) - * @param y second set of points (size m*k) - * @param dist output distance matrix (size n*m) - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void distance(raft::resources const& handle, - raft::device_matrix_view const x, - raft::device_matrix_view const y, - raft::device_matrix_view dist, - DataT metric_arg = 2.0f) -{ - RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal."); - RAFT_EXPECTS(dist.extent(0) == x.extent(0), - "Number of rows in output must be equal to " - "number of rows in X"); - RAFT_EXPECTS(dist.extent(1) == y.extent(0), - "Number of columns in output must be equal to " - "number of rows in Y"); - - RAFT_EXPECTS(x.is_exhaustive(), "Input x must be contiguous."); - RAFT_EXPECTS(y.is_exhaustive(), "Input y must be contiguous."); - - constexpr auto is_rowmajor = std::is_same_v; - - distance(handle, - x.data_handle(), - y.data_handle(), - dist.data_handle(), - x.extent(0), - y.extent(0), - x.extent(1), - is_rowmajor, - metric_arg); -} - -/** - * @brief Convenience wrapper around 'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam IdxT indexing type - * @param handle raft handle for managing expensive resources - * @param x first matrix of points (size mxk) - * @param y second matrix of points (size nxk) - * @param dist output distance matrix (size mxn) - * @param metric distance metric - * @param metric_arg metric argument (used for Minkowski distance) - */ -template -void pairwise_distance(raft::resources const& handle, - raft::device_matrix_view const x, - raft::device_matrix_view const y, - raft::device_matrix_view dist, - cuvs::distance::DistanceType metric, - Type metric_arg = 2.0f) -{ - RAFT_EXPECTS(x.extent(1) == y.extent(1), "Number of columns must be equal."); - RAFT_EXPECTS(dist.extent(0) == x.extent(0), - "Number of rows in output must be equal to " - "number of rows in X"); - RAFT_EXPECTS(dist.extent(1) == y.extent(0), - "Number of columns in output must be equal to " - "number of rows in Y"); - - RAFT_EXPECTS(x.is_exhaustive(), "Input x must be contiguous."); - RAFT_EXPECTS(y.is_exhaustive(), "Input y must be contiguous."); - RAFT_EXPECTS(dist.is_exhaustive(), "Output must be contiguous."); - - constexpr auto rowmajor = std::is_same_v; - - auto stream = raft::resource::get_cuda_stream(handle); - rmm::device_uvector workspace(0, stream); - - pairwise_distance(handle, - x.data_handle(), - y.data_handle(), - dist.data_handle(), - x.extent(0), - y.extent(0), - x.extent(1), - metric, - rowmajor, - metric_arg); -} - -/** @} */ - -}; // namespace distance -}; // namespace cuvs diff --git a/cpp/include/cuvs/distance/distance.cuh b/cpp/include/cuvs/distance/distance.cuh deleted file mode 100644 index de70cd469..000000000 --- a/cpp/include/cuvs/distance/distance.cuh +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY -#include "distance-inl.cuh" -#endif - -#ifdef RAFT_COMPILED -#include "distance-ext.cuh" -#endif diff --git a/cpp/include/cuvs/distance/fused_l2_nn-ext.cuh b/cpp/include/cuvs/distance/fused_l2_nn-ext.cuh deleted file mode 100644 index eb993b681..000000000 --- a/cpp/include/cuvs/distance/fused_l2_nn-ext.cuh +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // int64_t -#include // include initialize and reduce operations -#include // raft::KeyValuePair -#include // raft::resources -#include // RAFT_EXPLICIT - -#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY - -namespace cuvs { -namespace distance { - -template -void fusedL2NNMinReduce(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - bool sqrt, - bool initOutBuffer, - cudaStream_t stream) RAFT_EXPLICIT; - -} // namespace distance -} // namespace cuvs - -#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY - -#define instantiate_raft_distance_fusedL2NNMinReduce(DataT, OutT, IdxT) \ - extern template void cuvs::distance::fusedL2NNMinReduce(OutT * min, \ - const DataT* x, \ - const DataT* y, \ - const DataT* xn, \ - const DataT* yn, \ - IdxT m, \ - IdxT n, \ - IdxT k, \ - void* workspace, \ - bool sqrt, \ - bool initOutBuffer, \ - cudaStream_t stream) - -instantiate_raft_distance_fusedL2NNMinReduce(double, double, int); -instantiate_raft_distance_fusedL2NNMinReduce(double, double, int64_t); -instantiate_raft_distance_fusedL2NNMinReduce(float, float, int); -instantiate_raft_distance_fusedL2NNMinReduce(float, float, int64_t); - -// We can't have comma's in the macro expansion, so we use the COMMA macro: -#define COMMA , - -instantiate_raft_distance_fusedL2NNMinReduce(double, raft::KeyValuePair, int); -instantiate_raft_distance_fusedL2NNMinReduce(double, - raft::KeyValuePair, - int64_t); -instantiate_raft_distance_fusedL2NNMinReduce(float, raft::KeyValuePair, int); -instantiate_raft_distance_fusedL2NNMinReduce(float, - raft::KeyValuePair, - int64_t); - -#undef COMMA - -#undef instantiate_raft_distance_fusedL2NNMinReduce diff --git a/cpp/include/cuvs/distance/fused_l2_nn-inl.cuh b/cpp/include/cuvs/distance/fused_l2_nn-inl.cuh deleted file mode 100644 index c6e7acb51..000000000 --- a/cpp/include/cuvs/distance/fused_l2_nn-inl.cuh +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __FUSED_L2_NN_H -#define __FUSED_L2_NN_H - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cuvs { -namespace distance { - -/** - * \ingroup fused_l2_nn - * @{ - */ -/** - * @brief Fused L2 distance and 1-nearest-neighbor computation in a single call. - * - * The benefits of such a call are 2-fold: 1) eliminate the need for an - * intermediate buffer to store the output of gemm 2) reduce the memory read - * traffic on this intermediate buffer, otherwise needed during the reduction - * phase for 1-NN. - * - * @tparam DataT data type - * @tparam OutT output type to either store 1-NN indices and their minimum - * distances or store only the min distances. Accordingly, one - * has to pass an appropriate `ReduceOpT` - * @tparam IdxT indexing arithmetic type - * @tparam ReduceOpT A struct to perform the final needed reduction operation - * and also to initialize the output array elements with the - * appropriate initial value needed for reduction. - * - * @param[out] min will contain the reduced output (Length = `m`) - * (on device) - * @param[in] x first matrix. Row major. Dim = `m x k`. - * (on device). - * @param[in] y second matrix. Row major. Dim = `n x k`. - * (on device). - * @param[in] xn L2 squared norm of `x`. Length = `m`. (on device). - * @param[in] yn L2 squared norm of `y`. Length = `n`. (on device) - * @param[in] m gemm m - * @param[in] n gemm n - * @param[in] k gemm k - * @param[in] workspace temp workspace. Size = sizeof(int)*m. (on device) - * @param[in] redOp reduction operator in the epilogue - * @param[in] pairRedOp reduction operation on key value pairs - * @param[in] sqrt Whether the output `minDist` should contain L2-sqrt - * @param[in] initOutBuffer whether to initialize the output buffer before the - * main kernel launch - * @param[in] stream cuda stream - */ -template -void fusedL2NN(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - ReduceOpT redOp, - KVPReduceOpT pairRedOp, - bool sqrt, - bool initOutBuffer, - cudaStream_t stream) -{ - // When k is smaller than 32, the Policy4x4 results in redundant calculations - // as it uses tiles that have k=32. Therefore, use a "skinny" policy instead - // that uses tiles with a smaller value of k. - bool is_skinny = k < 32; - - size_t bytes = sizeof(DataT) * k; - auto px = reinterpret_cast(x); - auto py = reinterpret_cast(y); - if (16 % sizeof(DataT) == 0 && bytes % 16 == 0 && px % 16 == 0 && py % 16 == 0) { - if (is_skinny) { - detail::fusedL2NNImpl< - DataT, - OutT, - IdxT, - typename raft::linalg::Policy4x4Skinny::Policy, - ReduceOpT>( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } else { - detail::fusedL2NNImpl::Policy, - ReduceOpT>( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } - } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0 && px % 8 == 0 && py % 8 == 0) { - if (is_skinny) { - detail::fusedL2NNImpl< - DataT, - OutT, - IdxT, - typename raft::linalg::Policy4x4Skinny::Policy, - ReduceOpT>( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } else { - detail::fusedL2NNImpl::Policy, - ReduceOpT>( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } - } else { - if (is_skinny) { - detail::fusedL2NNImpl::Policy, - ReduceOpT>( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } else { - detail::fusedL2NNImpl::Policy, - ReduceOpT>( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); - } - } -} - -/** - * @brief Wrapper around fusedL2NN with minimum reduction operators. - * - * fusedL2NN cannot be compiled in the distance library due to the lambda - * operators, so this wrapper covers the most common case (minimum). - * This should be preferred to the more generic API when possible, in order to - * reduce compilation times for users of the shared library. - * - * @tparam DataT data type - * @tparam OutT output type to either store 1-NN indices and their minimum - * distances (e.g. raft::KeyValuePair) or store only the min - * distances. - * @tparam IdxT indexing arithmetic type - * @param[out] min will contain the reduced output (Length = `m`) - * (on device) - * @param[in] x first matrix. Row major. Dim = `m x k`. - * (on device). - * @param[in] y second matrix. Row major. Dim = `n x k`. - * (on device). - * @param[in] xn L2 squared norm of `x`. Length = `m`. (on device). - * @param[in] yn L2 squared norm of `y`. Length = `n`. (on device) - * @param[in] m gemm m - * @param[in] n gemm n - * @param[in] k gemm k - * @param[in] workspace temp workspace. Size = sizeof(int)*m. (on device) - * @param[in] sqrt Whether the output `minDist` should contain L2-sqrt - * @param[in] initOutBuffer whether to initialize the output buffer before the - * main kernel launch - * @param[in] stream cuda stream - */ -template -void fusedL2NNMinReduce(OutT* min, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - void* workspace, - bool sqrt, - bool initOutBuffer, - cudaStream_t stream) -{ - MinAndDistanceReduceOp redOp; - KVPMinReduce pairRedOp; - - fusedL2NN( - min, x, y, xn, yn, m, n, k, workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); -} - -/** @} */ - -} // namespace distance -} // namespace cuvs - -#endif diff --git a/cpp/include/cuvs/distance/fused_l2_nn.cuh b/cpp/include/cuvs/distance/fused_l2_nn.cuh deleted file mode 100644 index b1a355132..000000000 --- a/cpp/include/cuvs/distance/fused_l2_nn.cuh +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY -#include "fused_l2_nn-inl.cuh" -#endif - -#ifdef RAFT_COMPILED -#include "fused_l2_nn-ext.cuh" -#endif diff --git a/cpp/include/cuvs/distance/fused_l2_nn_helpers.cuh b/cpp/include/cuvs/distance/fused_l2_nn_helpers.cuh deleted file mode 100644 index 29a4ae523..000000000 --- a/cpp/include/cuvs/distance/fused_l2_nn_helpers.cuh +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace cuvs::distance { - -/** - * \defgroup fused_l2_nn Fused 1-nearest neighbors - * @{ - */ - -template -using KVPMinReduce = detail::KVPMinReduceImpl; - -template -using MinAndDistanceReduceOp = detail::MinAndDistanceReduceOpImpl; - -template -using MinReduceOp = detail::MinReduceOpImpl; - -/** @} */ - -/** - * Initialize array using init value from reduction op - */ -template -void initialize(raft::resources const& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) -{ - detail::initialize( - min, m, maxVal, redOp, resource::get_cuda_stream(handle)); -} - -} // namespace cuvs::distance diff --git a/cpp/include/cuvs/distance/kernels.cuh b/cpp/include/cuvs/distance/kernels.cuh deleted file mode 100644 index 0133892a6..000000000 --- a/cpp/include/cuvs/distance/kernels.cuh +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include -#include - -namespace cuvs::distance::kernels { - -// TODO: Need to expose formal APIs for this that are more consistent w/ other APIs in RAFT -using cuvs::distance::kernels::detail::GramMatrixBase; -using cuvs::distance::kernels::detail::KernelFactory; - -}; // end namespace cuvs::distance::kernels diff --git a/cpp/include/cuvs/distance/masked_nn.cuh b/cpp/include/cuvs/distance/masked_nn.cuh deleted file mode 100644 index 6f3bde891..000000000 --- a/cpp/include/cuvs/distance/masked_nn.cuh +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __MASKED_L2_NN_H -#define __MASKED_L2_NN_H - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace cuvs { -namespace distance { -/** - * \defgroup masked_nn Masked 1-nearest neighbors - * @{ - */ - -/** - * @brief Parameter struct for masked_l2_nn function - * - * @tparam ReduceOpT Type of reduction operator in the epilogue. - * @tparam KVPReduceOpT Type of Reduction operation on key value pairs. - * - * Usage example: - * @code{.cpp} - * #include - * - * using IdxT = int; - * using DataT = float; - * using RedOpT = cuvs::distance::MinAndDistanceReduceOp; - * using PairRedOpT = cuvs::distance::KVPMinReduce; - * using ParamT = cuvs::distance::masked_l2_nn_params; - * - * bool init_out = true; - * bool sqrt = false; - * - * ParamT masked_l2_params{RedOpT{}, PairRedOpT{}, sqrt, init_out}; - * @endcode - * - * Prescribes how to reduce a distance to an intermediate type (`redOp`), and - * how to reduce two intermediate types (`pairRedOp`). Typically, a distance is - * mapped to an (index, value) pair and (index, value) pair with the lowest - * value (distance) is selected. - * - * In addition, prescribes whether to compute the square root of the distance - * (`sqrt`) and whether to initialize the output buffer (`initOutBuffer`). - */ -template -struct masked_l2_nn_params { - /** Reduction operator in the epilogue */ - ReduceOpT redOp; - /** Reduction operation on key value pairs */ - KVPReduceOpT pairRedOp; - /** Whether the output `minDist` should contain L2-sqrt */ - bool sqrt; - /** Whether to initialize the output buffer before the main kernel launch */ - bool initOutBuffer; -}; - -/** - * @brief Masked L2 distance and 1-nearest-neighbor computation in a single call. - * - * This function enables faster computation of nearest neighbors if the - * computation of distances between certain point pairs can be skipped. - * - * We use an adjacency matrix that describes which distances to calculate. The - * points in `y` are divided into groups, and the adjacency matrix indicates - * whether to compute distances between points in `x` and groups in `y`. In other - * words, if `adj[i,k]` is true then distance between point `x_i`, and points in - * `group_k` will be calculated. - * - * **Performance considerations** - * - * The points in `x` are processed in tiles of `M` points (`M` is currently 64, - * but may change in the future). As a result, the largest compute time - * reduction occurs if all `M` points can skip a group. If only part of the `M` - * points can skip a group, then at most a minor compute time reduction and a - * modest energy use reduction can be expected. - * - * The points in `y` are also grouped into tiles of `N` points (`N` is currently - * 64, but may change in the future). As a result, group sizes should be larger - * than `N` to avoid wasting computational resources. If the group sizes are - * evenly divisible by `N`, then the computation is most efficient, although for - * larger group sizes this effect is minor. - * - * - * **Comparison to SDDM** - * - * [SDDMM](https://ieeexplore.ieee.org/document/8638042) (sampled dense-dense - * matrix multiplication) is a matrix-matrix multiplication where only part of - * the output is computed. Compared to masked_l2_nn, there are a few differences: - * - * - The output of masked_l2_nn is a single vector (of nearest neighbors) and not - * a sparse matrix. - * - * - The sampling in masked_l2_nn is expressed through intermediate "groups" - rather than a CSR format. - * - * @tparam DataT data type - * @tparam OutT output type to either store 1-NN indices and their minimum - * distances or store only the min distances. Accordingly, one - * has to pass an appropriate `ReduceOpT` - * @tparam IdxT indexing arithmetic type - * @tparam ReduceOpT A struct to perform the final needed reduction operation - * and also to initialize the output array elements with the - * appropriate initial value needed for reduction. - * - * @param handle RAFT handle for managing expensive resources - * @param params Parameter struct specifying the reduction operations. - * @param[in] x First matrix. Row major. Dim = `m x k`. - * (on device). - * @param[in] y Second matrix. Row major. Dim = `n x k`. - * (on device). - * @param[in] x_norm L2 squared norm of `x`. Length = `m`. (on device). - * @param[in] y_norm L2 squared norm of `y`. Length = `n`. (on device) - * @param[in] adj A boolean adjacency matrix indicating for each - * row of `x` and each group in `y` whether to compute the - * distance. Dim = `m x num_groups`. - * @param[in] group_idxs An array containing the *end* indices of each group - * in `y`. The value of group_idxs[j] indicates the - * start of group j + 1, i.e., it is the inclusive - * scan of the group lengths. The first group is - * always assumed to start at index 0 and the last - * group typically ends at index `n`. Length = - * `num_groups`. - * @param[out] out will contain the reduced output (Length = `m`) - * (on device) - */ -template -void masked_l2_nn(raft::resources const& handle, - cuvs::distance::masked_l2_nn_params params, - raft::device_matrix_view x, - raft::device_matrix_view y, - raft::device_vector_view x_norm, - raft::device_vector_view y_norm, - raft::device_matrix_view adj, - raft::device_vector_view group_idxs, - raft::device_vector_view out) -{ - IdxT m = x.extent(0); - IdxT n = y.extent(0); - IdxT k = x.extent(1); - IdxT num_groups = group_idxs.extent(0); - - // Match k dimension of x, y - RAFT_EXPECTS(x.extent(1) == y.extent(1), "Dimension of vectors in x and y must be equal."); - // Match x, x_norm and y, y_norm - RAFT_EXPECTS(m == x_norm.extent(0), "Length of `x_norm` must match input `x`."); - RAFT_EXPECTS(n == y_norm.extent(0), "Length of `y_norm` must match input `y` "); - // Match adj to x and group_idxs - RAFT_EXPECTS(m == adj.extent(0), "#rows in `adj` must match input `x`."); - RAFT_EXPECTS(num_groups == adj.extent(1), "#cols in `adj` must match length of `group_idxs`."); - // NOTE: We do not check if all indices in group_idxs actually points *inside* y. - - // If there is no work to be done, return immediately. - if (m == 0 || n == 0 || k == 0 || num_groups == 0) { return; } - - detail::masked_l2_nn_impl(handle, - out.data_handle(), - x.data_handle(), - y.data_handle(), - x_norm.data_handle(), - y_norm.data_handle(), - adj.data_handle(), - group_idxs.data_handle(), - num_groups, - m, - n, - k, - params.redOp, - params.pairRedOp, - params.sqrt, - params.initOutBuffer); -} - -/** @} */ - -} // namespace distance -} // namespace cuvs - -#endif diff --git a/cpp/include/cuvs/spectral/cluster_solvers.cuh b/cpp/include/cuvs/spectral/cluster_solvers.cuh deleted file mode 100644 index 63859adb1..000000000 --- a/cpp/include/cuvs/spectral/cluster_solvers.cuh +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __CLUSTER_SOLVERS_H -#define __CLUSTER_SOLVERS_H - -#pragma once - -#include -#include -#include // for std::pair - -namespace cuvs { -namespace spectral { - -using namespace matrix; - -// aggregate of control params for Eigen Solver: -// -template -struct cluster_solver_config_t { - size_type_t n_clusters; - size_type_t maxIter; - - value_type_t tol; - - unsigned long long seed{123456}; -}; - -template -struct kmeans_solver_t { - explicit kmeans_solver_t( - cluster_solver_config_t const& config) - : config_(config) - { - } - - std::pair solve(raft::resources const& handle, - size_type_t n_obs_vecs, - size_type_t dim, - value_type_t const* __restrict__ obs, - index_type_t* __restrict__ codes) const - { - RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); - RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); - value_type_t residual{}; - index_type_t iters{}; - cuvs::cluster::KMeansParams km_params; - km_params.n_clusters = config_.n_clusters; - km_params.tol = config_.tol; - km_params.max_iter = config_.maxIter; - km_params.rng_state.seed = config_.seed; - - auto X = raft::make_device_matrix_view(obs, n_obs_vecs, dim); - auto labels = raft::make_device_vector_view(codes, n_obs_vecs); - auto centroids = - raft::make_device_matrix(handle, config_.n_clusters, dim); - auto weight = raft::make_device_vector(handle, n_obs_vecs); - thrust::fill(raft::resource::get_thrust_policy(handle), - weight.data_handle(), - weight.data_handle() + n_obs_vecs, - 1); - - auto sw = std::make_optional((raft::device_vector_view)weight.view()); - cuvs::cluster::kmeans_fit_predict( - handle, - km_params, - X, - sw, - centroids.view(), - labels, - raft::make_host_scalar_view(&residual), - raft::make_host_scalar_view(&iters)); - return std::make_pair(residual, iters); - } - - auto const& get_config(void) const { return config_; } - - private: - cluster_solver_config_t config_; -}; - -} // namespace spectral -} // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/spectral/cluster_solvers_deprecated.cuh b/cpp/include/cuvs/spectral/cluster_solvers_deprecated.cuh deleted file mode 100644 index c45be88ef..000000000 --- a/cpp/include/cuvs/spectral/cluster_solvers_deprecated.cuh +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Note: This file is deprecated and will be removed in a future release - * Please use include/cuvs/cluster/kmeans.cuh instead - */ - -#ifndef __CLUSTER_SOLVERS_deprecated_H -#define __CLUSTER_SOLVERS_deprecated_H - -#pragma once - -#include -#include // for std::pair - -namespace cuvs { -namespace spectral { - -using namespace matrix; - -// aggregate of control params for Eigen Solver: -// -template -struct cluster_solver_config_deprecated_t { - size_type_t n_clusters; - size_type_t maxIter; - - value_type_t tol; - - unsigned long long seed{123456}; -}; - -template -struct kmeans_solver_deprecated_t { - explicit kmeans_solver_deprecated_t( - cluster_solver_config_deprecated_t const& config) - : config_(config) - { - } - - std::pair solve(raft::resources const& handle, - size_type_t n_obs_vecs, - size_type_t dim, - value_type_t const* __restrict__ obs, - index_type_t* __restrict__ codes) const - { - RAFT_EXPECTS(obs != nullptr, "Null obs buffer."); - RAFT_EXPECTS(codes != nullptr, "Null codes buffer."); - value_type_t residual{}; - index_type_t iters{}; - - cuvs::cluster::kmeans(handle, - n_obs_vecs, - dim, - config_.n_clusters, - config_.tol, - config_.maxIter, - obs, - codes, - residual, - iters, - config_.seed); - return std::make_pair(residual, iters); - } - - auto const& get_config(void) const { return config_; } - - private: - cluster_solver_config_deprecated_t config_; -}; - -} // namespace spectral -} // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/spectral/detail/lapack.hpp b/cpp/include/cuvs/spectral/detail/lapack.hpp deleted file mode 100644 index b2016c5c9..000000000 --- a/cpp/include/cuvs/spectral/detail/lapack.hpp +++ /dev/null @@ -1,574 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include - -#include -#include -#include - -// for now; TODO: check if/where this `define` should be; -// -#define USE_LAPACK - -namespace cuvs { - -#define lapackCheckError(status) \ - { \ - if (status < 0) { \ - std::stringstream ss; \ - ss << "Lapack error: argument number " << -status << " had an illegal value."; \ - throw exception(ss.str()); \ - } else if (status > 0) \ - RAFT_FAIL("Lapack error: internal error."); \ - } - -extern "C" void sgeqrf_( - int* m, int* n, float* a, int* lda, float* tau, float* work, int* lwork, int* info); -extern "C" void dgeqrf_( - int* m, int* n, double* a, int* lda, double* tau, double* work, int* lwork, int* info); -extern "C" void sormqr_(char* side, - char* trans, - int* m, - int* n, - int* k, - float* a, - int* lda, - const float* tau, - float* c, - int* ldc, - float* work, - int* lwork, - int* info); -extern "C" void dormqr_(char* side, - char* trans, - int* m, - int* n, - int* k, - double* a, - int* lda, - const double* tau, - double* c, - int* ldc, - double* work, - int* lwork, - int* info); -extern "C" int dgeev_(char* jobvl, - char* jobvr, - int* n, - double* a, - int* lda, - double* wr, - double* wi, - double* vl, - int* ldvl, - double* vr, - int* ldvr, - double* work, - int* lwork, - int* info); - -extern "C" int sgeev_(char* jobvl, - char* jobvr, - int* n, - float* a, - int* lda, - float* wr, - float* wi, - float* vl, - int* ldvl, - float* vr, - int* ldvr, - float* work, - int* lwork, - int* info); - -extern "C" cusolverStatus_t cusolverDnSgemmHost(cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* A, - int lda, - const float* B, - int ldb, - const float* beta, - float* C, - int ldc); - -extern "C" cusolverStatus_t cusolverDnDgemmHost(cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double* alpha, - const double* A, - int lda, - const double* B, - int ldb, - const double* beta, - double* C, - int ldc); - -extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float* d, float* e, int* info); - -extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double* d, double* e, int* info); - -extern "C" cusolverStatus_t cusolverDnSsteqrHost( - const signed char* compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info); - -extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char* compz, - int n, - double* d, - double* e, - double* z, - int ldz, - double* work, - int* info); - -template -class Lapack { - private: - Lapack(); - ~Lapack(); - - public: - static void check_lapack_enabled(); - - static void gemm(bool transa, - bool transb, - int m, - int n, - int k, - T alpha, - const T* A, - int lda, - const T* B, - int ldb, - T beta, - T* C, - int ldc); - - // special QR for lanczos - static void sterf(int n, T* d, T* e); - static void steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work); - - // QR - // computes the QR factorization of a general matrix - static void geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork); - // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf. - - // multiply C by implicit Q - static void ormqr(bool right_side, - bool transq, - int m, - int n, - int k, - T* a, - int lda, - T* tau, - T* c, - int ldc, - T* work, - int* lwork); - - static void geev(T* A, T* eigenvalues, int dim, int lda); - static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr); - static void geev(T* A, - T* eigenvalues_r, - T* eigenvalues_i, - T* eigenvectors_r, - T* eigenvectors_i, - int dim, - int lda, - int ldvr); - - private: - static void lapack_gemm(const char transa, - const char transb, - int m, - int n, - int k, - float alpha, - const float* a, - int lda, - const float* b, - int ldb, - float beta, - float* c, - int ldc) - { - cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnSgemmHost( - cublas_transa, cublas_transb, m, n, k, &alpha, (float*)a, lda, (float*)b, ldb, &beta, c, ldc); - } - - static void lapack_gemm(const signed char transa, - const signed char transb, - int m, - int n, - int k, - double alpha, - const double* a, - int lda, - const double* b, - int ldb, - double beta, - double* c, - int ldc) - { - cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T; - cusolverDnDgemmHost(cublas_transa, - cublas_transb, - m, - n, - k, - &alpha, - (double*)a, - lda, - (double*)b, - ldb, - &beta, - c, - ldc); - } - - static void lapack_sterf(int n, float* d, float* e, int* info) - { - cusolverDnSsterfHost(n, d, e, info); - } - - static void lapack_sterf(int n, double* d, double* e, int* info) - { - cusolverDnDsterfHost(n, d, e, info); - } - - static void lapack_steqr( - const signed char compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info) - { - cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info); - } - - static void lapack_steqr(const signed char compz, - int n, - double* d, - double* e, - double* z, - int ldz, - double* work, - int* info) - { - cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info); - } - - static void lapack_geqrf( - int m, int n, float* a, int lda, float* tau, float* work, int* lwork, int* info) - { - sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); - } - - static void lapack_geqrf( - int m, int n, double* a, int lda, double* tau, double* work, int* lwork, int* info) - { - dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info); - } - - static void lapack_ormqr(char side, - char trans, - int m, - int n, - int k, - float* a, - int lda, - float* tau, - float* c, - int ldc, - float* work, - int* lwork, - int* info) - { - sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); - } - - static void lapack_ormqr(char side, - char trans, - int m, - int n, - int k, - double* a, - int lda, - double* tau, - double* c, - int ldc, - double* work, - int* lwork, - int* info) - { - dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info); - } - - static int lapack_geev_dispatch(char* jobvl, - char* jobvr, - int* n, - double* a, - int* lda, - double* wr, - double* wi, - double* vl, - int* ldvl, - double* vr, - int* ldvr, - double* work, - int* lwork, - int* info) - { - return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); - } - - static int lapack_geev_dispatch(char* jobvl, - char* jobvr, - int* n, - float* a, - int* lda, - float* wr, - float* wi, - float* vl, - int* ldvl, - float* vr, - int* ldvr, - float* work, - int* lwork, - int* info) - { - return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info); - } - - // real eigenvalues - static void lapack_geev(T* A, T* eigenvalues, int dim, int lda) - { - char job = 'N'; - std::vector WI(dim); - int ldv = 1; - T* vl = 0; - int work_size = 6 * dim; - std::vector work(work_size); - int info; - lapack_geev_dispatch(&job, - &job, - &dim, - A, - &lda, - eigenvalues, - WI.data(), - vl, - &ldv, - vl, - &ldv, - work.data(), - &work_size, - &info); - lapackCheckError(info); - } - - // real eigenpairs - static void lapack_geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) - { - char jobvl = 'N'; - char jobvr = 'V'; - std::vector WI(dim); - int work_size = 6 * dim; - T* vl = 0; - int ldvl = 1; - std::vector work(work_size); - int info; - lapack_geev_dispatch(&jobvl, - &jobvr, - &dim, - A, - &lda, - eigenvalues, - WI.data(), - vl, - &ldvl, - eigenvectors, - &ldvr, - work.data(), - &work_size, - &info); - lapackCheckError(info); - } - - // complex eigenpairs - static void lapack_geev(T* A, - T* eigenvalues_r, - T* eigenvalues_i, - T* eigenvectors_r, - T* eigenvectors_i, - int dim, - int lda, - int ldvr) - { - char jobvl = 'N'; - char jobvr = 'V'; - int work_size = 8 * dim; - int ldvl = 1; - std::vector work(work_size); - int info; - lapack_geev_dispatch(&jobvl, - &jobvr, - &dim, - A, - &lda, - eigenvalues_r, - eigenvalues_i, - 0, - &ldvl, - eigenvectors_r, - &ldvr, - work.data(), - &work_size, - &info); - lapackCheckError(info); - } -}; - -template -void Lapack::check_lapack_enabled() -{ -#ifndef USE_LAPACK - RAFT_FAIL("Error: LAPACK not enabled."); -#endif -} - -template -void Lapack::gemm(bool transa, - bool transb, - int m, - int n, - int k, - T alpha, - const T* A, - int lda, - const T* B, - int ldb, - T beta, - T* C, - int ldc) -{ - // check_lapack_enabled(); - // #ifdef NVGRAPH_USE_LAPACK - const char transA_char = transa ? 'T' : 'N'; - const char transB_char = transb ? 'T' : 'N'; - lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); - // #endif -} - -template -void Lapack::sterf(int n, T* d, T* e) -{ - // check_lapack_enabled(); - // #ifdef NVGRAPH_USE_LAPACK - int info; - lapack_sterf(n, d, e, &info); - lapackCheckError(info); - // #endif -} - -template -void Lapack::steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work) -{ - // check_lapack_enabled(); - // #ifdef NVGRAPH_USE_LAPACK - int info; - lapack_steqr(compz, n, d, e, z, ldz, work, &info); - lapackCheckError(info); - // #endif -} - -template -void Lapack::geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork) -{ - check_lapack_enabled(); -#ifdef USE_LAPACK - int info; - lapack_geqrf(m, n, a, lda, tau, work, lwork, &info); - lapackCheckError(info); -#endif -} -template -void Lapack::ormqr(bool right_side, - bool transq, - int m, - int n, - int k, - T* a, - int lda, - T* tau, - T* c, - int ldc, - T* work, - int* lwork) -{ - check_lapack_enabled(); -#ifdef USE_LAPACK - char side = right_side ? 'R' : 'L'; - char trans = transq ? 'T' : 'N'; - int info; - lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info); - lapackCheckError(info); -#endif -} - -// real eigenvalues -template -void Lapack::geev(T* A, T* eigenvalues, int dim, int lda) -{ - check_lapack_enabled(); -#ifdef USE_LAPACK - lapack_geev(A, eigenvalues, dim, lda); -#endif -} -// real eigenpairs -template -void Lapack::geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr) -{ - check_lapack_enabled(); -#ifdef USE_LAPACK - lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr); -#endif -} -// complex eigenpairs -template -void Lapack::geev(T* A, - T* eigenvalues_r, - T* eigenvalues_i, - T* eigenvectors_r, - T* eigenvectors_i, - int dim, - int lda, - int ldvr) -{ - check_lapack_enabled(); -#ifdef USE_LAPACK - lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr); -#endif -} - -} // namespace cuvs diff --git a/cpp/include/cuvs/spectral/detail/matrix_wrappers.hpp b/cpp/include/cuvs/spectral/detail/matrix_wrappers.hpp deleted file mode 100644 index ebdb9835a..000000000 --- a/cpp/include/cuvs/spectral/detail/matrix_wrappers.hpp +++ /dev/null @@ -1,465 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -// ========================================================= -// Useful macros -// ========================================================= - -// Get index of matrix entry -#define IDX(i, j, lda) ((i) + (j) * (lda)) - -namespace cuvs { -namespace spectral { -namespace matrix { -namespace detail { - -using size_type = int; // for now; TODO: move it in appropriate header - -// Apply diagonal matrix to vector: -// -template -RAFT_KERNEL diagmv(IndexType_ n, - ValueType_ alpha, - const ValueType_* __restrict__ D, - const ValueType_* __restrict__ x, - ValueType_* __restrict__ y) -{ - IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x; - while (i < n) { - y[i] += alpha * D[i] * x[i]; - i += blockDim.x * gridDim.x; - } -} - -// specifies type of algorithm used -// for SpMv: -// -enum struct sparse_mv_alg_t : int { - SPARSE_MV_UNDEFINED = -1, - SPARSE_MV_ALG_DEFAULT, // generic, for any sparse matrix - SPARSE_MV_ALG1, // typical for CSR - SPARSE_MV_ALG2 // may provide better performance for irregular sparse matrices -}; - -// Vector "view"-like aggregate for linear algebra purposes -// -template -struct vector_view_t { - value_type* buffer_; - size_type size_; - - vector_view_t(value_type* buffer, size_type sz) : buffer_(buffer), size_(sz) {} - - vector_view_t(vector_view_t&& other) : buffer_(other.raw()), size_(other.size()) {} - - vector_view_t& operator=(vector_view_t&& other) - { - buffer_ = other.raw(); - size_ = other.size(); - } -}; - -template -class vector_t { - public: - vector_t(raft::resources const& raft_handle, size_type sz) - : buffer_(sz, resource::get_cuda_stream(raft_handle)), - thrust_policy(raft::resource::get_thrust_policy(raft_handle)) - { - } - - size_type size(void) const { return buffer_.size(); } - - value_type* raw(void) { return buffer_.data(); } - - value_type const* raw(void) const { return buffer_.data(); } - - value_type nrm1() const - { - return thrust::reduce(thrust_policy, - buffer_.data(), - buffer_.data() + buffer_.size(), - value_type{0}, - [] __device__(auto left, auto right) { - auto abs_left = left > 0 ? left : -left; - auto abs_right = right > 0 ? right : -right; - return abs_left + abs_right; - }); - } - - void fill(value_type value) - { - thrust::fill_n(thrust_policy, buffer_.data(), buffer_.size(), value); - } - - private: - using thrust_exec_policy_t = - thrust::detail::execute_with_allocator, - thrust::cuda_cub::execute_on_stream_base>; - rmm::device_uvector buffer_; - const thrust_exec_policy_t thrust_policy; -}; - -template -struct sparse_matrix_t { - sparse_matrix_t(raft::resources const& raft_handle, - index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nrows, - index_type const ncols, - index_type const nnz) - : handle_(raft_handle), - row_offsets_(row_offsets), - col_indices_(col_indices), - values_(values), - nrows_(nrows), - ncols_(ncols), - nnz_(nnz) - { - } - - sparse_matrix_t(raft::resources const& raft_handle, - index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nrows, - index_type const nnz) - : handle_(raft_handle), - row_offsets_(row_offsets), - col_indices_(col_indices), - values_(values), - nrows_(nrows), - ncols_(nrows), - nnz_(nnz) - { - } - - template - sparse_matrix_t(raft::resources const& raft_handle, CSRView const& csr_view) - : handle_(raft_handle), - row_offsets_(csr_view.offsets), - col_indices_(csr_view.indices), - values_(csr_view.edge_data), - nrows_(csr_view.number_of_vertices), - ncols_(csr_view.number_of_vertices), - nnz_(csr_view.number_of_edges) - { - } - - virtual ~sparse_matrix_t(void) = - default; // virtual because used as base for following matrix types - - // y = alpha*A*x + beta*y - //(Note: removed const-ness of x, because CUDA 11 SpMV - // descriptor creation works with non-const, and const-casting - // down is dangerous) - // - virtual void mv(value_type alpha, - value_type* __restrict__ x, - value_type beta, - value_type* __restrict__ y, - sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, - bool symmetric = false) const - { - using namespace sparse; - - RAFT_EXPECTS(x != nullptr, "Null x buffer."); - RAFT_EXPECTS(y != nullptr, "Null y buffer."); - - auto cusparse_h = resource::get_cusparse_handle(handle_); - auto stream = resource::get_cuda_stream(handle_); - - cusparseOperation_t trans = transpose ? CUSPARSE_OPERATION_TRANSPOSE : // transpose - CUSPARSE_OPERATION_NON_TRANSPOSE; // non-transpose - -#if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP - auto size_x = transpose ? nrows_ : ncols_; - auto size_y = transpose ? ncols_ : nrows_; - - cusparseSpMVAlg_t spmv_alg = translate_algorithm(alg); - - // create descriptors: - //(below casts are necessary, because - // cusparseCreateCsr(...) takes non-const - // void*; the casts should be harmless) - // - cusparseSpMatDescr_t matA; - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatecsr(&matA, - nrows_, - ncols_, - nnz_, - const_cast(row_offsets_), - const_cast(col_indices_), - const_cast(values_))); - - cusparseDnVecDescr_t vecX; - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec(&vecX, size_x, x)); - - rmm::device_uvector y_tmp(size_y, stream); - raft::copy(y_tmp.data(), y, size_y, stream); - - cusparseDnVecDescr_t vecY; - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecreatednvec(&vecY, size_y, y_tmp.data())); - - // get (scratch) external device buffer size: - // - size_t bufferSize; - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv_buffersize( - cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, &bufferSize, stream)); - - // allocate external buffer: - // - vector_t external_buffer(handle_, bufferSize); - - // finally perform SpMV: - // - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv( - cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, external_buffer.raw(), stream)); - - // FIXME: This is a workaround for a cusparse issue being encountered in CUDA 12 - raft::copy(y, y_tmp.data(), size_y, stream); - // free descriptors: - //(TODO: maybe wrap them in a RAII struct?) - // - RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(vecY)); - RAFT_CUSPARSE_TRY(cusparseDestroyDnVec(vecX)); - RAFT_CUSPARSE_TRY(cusparseDestroySpMat(matA)); -#else - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream)); - cusparseMatDescr_t descr = 0; - RAFT_CUSPARSE_TRY(cusparseCreateMatDescr(&descr)); - if (symmetric) { - RAFT_CUSPARSE_TRY(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_SYMMETRIC)); - } else { - RAFT_CUSPARSE_TRY(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - } - RAFT_CUSPARSE_TRY(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsecsrmv(cusparse_h, - trans, - nrows_, - ncols_, - nnz_, - &alpha, - descr, - values_, - row_offsets_, - col_indices_, - x, - &beta, - y, - stream)); - RAFT_CUSPARSE_TRY(cusparseDestroyMatDescr(descr)); -#endif - } - - resources const& get_handle(void) const { return handle_; } - -#if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP - cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const - { - switch (alg) { - case sparse_mv_alg_t::SPARSE_MV_ALG1: return CUSPARSE_SPMV_CSR_ALG1; - case sparse_mv_alg_t::SPARSE_MV_ALG2: return CUSPARSE_SPMV_CSR_ALG2; - default: return CUSPARSE_SPMV_ALG_DEFAULT; - } - } -#endif - - // private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, - // aggregate - - raft::resources const& handle_; - index_type const* row_offsets_; - index_type const* col_indices_; - value_type const* values_; - index_type const nrows_; - index_type const ncols_; - index_type const nnz_; -}; - -template -struct laplacian_matrix_t : sparse_matrix_t { - laplacian_matrix_t(raft::resources const& raft_handle, - index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nrows, - index_type const nnz) - : sparse_matrix_t( - raft_handle, row_offsets, col_indices, values, nrows, nnz), - diagonal_(raft_handle, nrows) - { - vector_t ones{raft_handle, nrows}; - ones.fill(1.0); - sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); - } - - laplacian_matrix_t(raft::resources const& raft_handle, - sparse_matrix_t const& csr_m) - : sparse_matrix_t(raft_handle, - csr_m.row_offsets_, - csr_m.col_indices_, - csr_m.values_, - csr_m.nrows_, - csr_m.nnz_), - diagonal_(raft_handle, csr_m.nrows_) - { - vector_t ones{raft_handle, csr_m.nrows_}; - ones.fill(1.0); - sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); - } - - // y = alpha*A*x + beta*y - // - void mv(value_type alpha, - value_type* __restrict__ x, - value_type beta, - value_type* __restrict__ y, - sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, - bool symmetric = false) const override - { - constexpr int BLOCK_SIZE = 1024; - auto n = sparse_matrix_t::nrows_; - - auto handle = sparse_matrix_t::get_handle(); - auto cublas_h = resource::get_cublas_handle(handle); - auto stream = resource::get_cuda_stream(handle); - - // scales y by beta: - // - if (beta == 0) { - RAFT_CUDA_TRY(cudaMemsetAsync(y, 0, n * sizeof(value_type), stream)); - } else if (beta != 1) { - // TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal(cublas_h, n, &beta, y, 1, stream)); - } - - // Apply diagonal matrix - // - dim3 gridDim{std::min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1}; - - dim3 blockDim{BLOCK_SIZE, 1, 1}; - diagmv<<>>(n, alpha, diagonal_.raw(), x, y); - RAFT_CHECK_CUDA(stream); - - // Apply adjacency matrix - // - sparse_matrix_t::mv(-alpha, x, 1, y, alg, transpose, symmetric); - } - - vector_t diagonal_; -}; - -template -struct modularity_matrix_t : laplacian_matrix_t { - modularity_matrix_t(raft::resources const& raft_handle, - index_type const* row_offsets, - index_type const* col_indices, - value_type const* values, - index_type const nrows, - index_type const nnz) - : laplacian_matrix_t( - raft_handle, row_offsets, col_indices, values, nrows, nnz) - { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); - } - - modularity_matrix_t(raft::resources const& raft_handle, - sparse_matrix_t const& csr_m) - : laplacian_matrix_t(raft_handle, csr_m) - { - edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); - } - - // y = alpha*A*x + beta*y - // - void mv(value_type alpha, - value_type* __restrict__ x, - value_type beta, - value_type* __restrict__ y, - sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1, - bool transpose = false, - bool symmetric = false) const override - { - auto n = sparse_matrix_t::nrows_; - - auto handle = sparse_matrix_t::get_handle(); - auto cublas_h = resource::get_cublas_handle(handle); - auto stream = resource::get_cuda_stream(handle); - - // y = A*x - // - sparse_matrix_t::mv(alpha, x, 0, y, alg, transpose, symmetric); - value_type dot_res; - - // gamma = d'*x - // - // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res); - // TODO: Call from public API when ready - RAFT_CUBLAS_TRY( - raft::linalg::detail::cublasdot(cublas_h, - n, - laplacian_matrix_t::diagonal_.raw(), - 1, - x, - 1, - &dot_res, - stream)); - - // y = y -(gamma/edge_sum)*d - // - value_type gamma_ = -dot_res / edge_sum_; - // TODO: Call from public API when ready - RAFT_CUBLAS_TRY( - raft::linalg::detail::cublasaxpy(cublas_h, - n, - &gamma_, - laplacian_matrix_t::diagonal_.raw(), - 1, - y, - 1, - stream)); - } - - value_type edge_sum_; -}; - -} // namespace detail -} // namespace matrix -} // namespace spectral -} // namespace cuvs diff --git a/cpp/include/cuvs/spectral/detail/modularity_maximization.hpp b/cpp/include/cuvs/spectral/detail/modularity_maximization.hpp deleted file mode 100644 index 72247c7d9..000000000 --- a/cpp/include/cuvs/spectral/detail/modularity_maximization.hpp +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -namespace cuvs { -namespace spectral { -namespace detail { - -// ========================================================= -// Spectral modularity_maximization -// ========================================================= - -/** Compute partition for a weighted undirected graph. This - * partition attempts to minimize the cost function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * - * @param G Weighted graph in CSR format - * @param nClusters Number of partitions. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter_lanczos Maximum number of Lanczos iterations. - * @param restartIter_lanczos Maximum size of Lanczos system before - * implicit restart. - * @param tol_lanczos Convergence tolerance for Lanczos method. - * @param maxIter_kmeans Maximum number of k-means iterations. - * @param tol_kmeans Convergence tolerance for k-means algorithm. - * @param clusters (Output, device memory, n entries) Cluster - * assignments. - * @param iters_lanczos On exit, number of Lanczos iterations - * performed. - * @param iters_kmeans On exit, number of k-means iterations - * performed. - * @return error flag. - */ -template -std::tuple modularity_maximization( - raft::resources const& handle, - raft::spectral::matrix::sparse_matrix_t const& csr_m, - EigenSolver const& eigen_solver, - ClusterSolver const& cluster_solver, - vertex_t* __restrict__ clusters, - weight_t* eigVals, - weight_t* eigVecs) -{ - RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - - auto stream = resource::get_cuda_stream(handle); - auto cublas_h = resource::get_cublas_handle(handle); - - std::tuple - stats; // # iters eigen solver, cluster solver residual, # iters cluster solver - - vertex_t n = csr_m.nrows_; - - // Compute eigenvectors of Modularity Matrix - - // Initialize Modularity Matrix - raft::spectral::matrix::modularity_matrix_t B{handle, csr_m}; - - auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; - - // Compute eigenvectors corresponding to largest eigenvalues - std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs); - - // Whiten eigenvector matrix - transform_eigen_matrix(handle, n, nEigVecs, eigVecs); - - // notice that at this point the matrix has already been transposed, so we are scaling - // columns - scale_obs(nEigVecs, n, eigVecs); - RAFT_CHECK_CUDA(stream); - - // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); - - std::get<1>(stats) = pair_cluster.first; - std::get<2>(stats) = pair_cluster.second; - - return stats; -} -//=================================================== -// Analysis of graph partition -// ========================================================= - -/// Compute modularity -/** This function determines the modularity based on a graph and cluster assignments - * @param G Weighted graph in CSR format - * @param nClusters Number of clusters. - * @param clusters (Input, device memory, n entries) Cluster assignments. - * @param modularity On exit, modularity - */ -template -void analyzeModularity(raft::resources const& handle, - raft::spectral::matrix::sparse_matrix_t const& csr_m, - vertex_t nClusters, - vertex_t const* __restrict__ clusters, - weight_t& modularity) -{ - RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - - vertex_t i; - vertex_t n = csr_m.nrows_; - weight_t partModularity, clustersize; - - auto cublas_h = resource::get_cublas_handle(handle); - auto stream = resource::get_cuda_stream(handle); - - // Device memory - raft::spectral::matrix::vector_t part_i(handle, n); - raft::spectral::matrix::vector_t Bx(handle, n); - - // Initialize cuBLAS - RAFT_CUBLAS_TRY(linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // Initialize Modularity - raft::spectral::matrix::modularity_matrix_t B{handle, csr_m}; - - // Initialize output - modularity = 0; - - // Iterate through partitions - for (i = 0; i < nClusters; ++i) { - if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) { - WARNING("empty partition"); - continue; - } - - // Record results - modularity += partModularity; - } - - modularity = modularity / B.diagonal_.nrm1(); -} - -} // namespace detail -} // namespace spectral -} // namespace cuvs diff --git a/cpp/include/cuvs/spectral/detail/partition.hpp b/cpp/include/cuvs/spectral/detail/partition.hpp deleted file mode 100644 index a91124866..000000000 --- a/cpp/include/cuvs/spectral/detail/partition.hpp +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -namespace cuvs { -namespace spectral { -namespace detail { - -// ========================================================= -// Spectral partitioner -// ========================================================= - -/// Compute spectral graph partition -/** Compute partition for a weighted undirected graph. This - * partition attempts to minimize the cost function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * - * @param G Weighted graph in CSR format - * @param nClusters Number of partitions. - * @param nEigVecs Number of eigenvectors to compute. - * @param maxIter_lanczos Maximum number of Lanczos iterations. - * @param restartIter_lanczos Maximum size of Lanczos system before - * implicit restart. - * @param tol_lanczos Convergence tolerance for Lanczos method. - * @param maxIter_kmeans Maximum number of k-means iterations. - * @param tol_kmeans Convergence tolerance for k-means algorithm. - * @param clusters (Output, device memory, n entries) Partition - * assignments. - * @param iters_lanczos On exit, number of Lanczos iterations - * performed. - * @param iters_kmeans On exit, number of k-means iterations - * performed. - * @return statistics: number of eigensolver iterations, . - */ -template -std::tuple partition( - raft::resources const& handle, - spectral::matrix::sparse_matrix_t const& csr_m, - EigenSolver const& eigen_solver, - ClusterSolver const& cluster_solver, - vertex_t* __restrict__ clusters, - weight_t* eigVals, - weight_t* eigVecs) -{ - RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - - auto stream = resource::get_cuda_stream(handle); - auto cublas_h = resource::get_cublas_handle(handle); - - std::tuple - stats; //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, - // cluster solver residual, # iters cluster solver - - vertex_t n = csr_m.nrows_; - - // ------------------------------------------------------- - // Spectral partitioner - // ------------------------------------------------------- - - // Compute eigenvectors of Laplacian - - // Initialize Laplacian - /// sparse_matrix_t A{handle, graph}; - spectral::matrix::laplacian_matrix_t L{handle, csr_m}; - - auto eigen_config = eigen_solver.get_config(); - auto nEigVecs = eigen_config.n_eigVecs; - - // Compute smallest eigenvalues and eigenvectors - std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs); - - // Whiten eigenvector matrix - transform_eigen_matrix(handle, n, nEigVecs, eigVecs); - - // Find partition clustering - auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters); - - std::get<1>(stats) = pair_cluster.first; - std::get<2>(stats) = pair_cluster.second; - - return stats; -} - -// ========================================================= -// Analysis of graph partition -// ========================================================= - -/// Compute cost function for partition -/** This function determines the edges cut by a partition and a cost - * function: - * Cost = \sum_i (Edges cut by ith partition)/(Vertices in ith partition) - * Graph is assumed to be weighted and undirected. - * - * @param G Weighted graph in CSR format - * @param nClusters Number of partitions. - * @param clusters (Input, device memory, n entries) Partition - * assignments. - * @param edgeCut On exit, weight of edges cut by partition. - * @param cost On exit, partition cost function. - * @return error flag. - */ -template -void analyzePartition(raft::resources const& handle, - spectral::matrix::sparse_matrix_t const& csr_m, - vertex_t nClusters, - const vertex_t* __restrict__ clusters, - weight_t& edgeCut, - weight_t& cost) -{ - RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer."); - - vertex_t i; - vertex_t n = csr_m.nrows_; - - auto stream = resource::get_cuda_stream(handle); - auto cublas_h = resource::get_cublas_handle(handle); - - weight_t partEdgesCut, clustersize; - - // Device memory - spectral::matrix::vector_t part_i(handle, n); - spectral::matrix::vector_t Lx(handle, n); - - // Initialize cuBLAS - RAFT_CUBLAS_TRY( - raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // Initialize Laplacian - /// sparse_matrix_t A{handle, graph}; - spectral::matrix::laplacian_matrix_t L{handle, csr_m}; - - // Initialize output - cost = 0; - edgeCut = 0; - - // Iterate through partitions - for (i = 0; i < nClusters; ++i) { - // Construct indicator vector for ith partition - if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) { - WARNING("empty partition"); - continue; - } - - // Record results - cost += partEdgesCut / clustersize; - edgeCut += partEdgesCut / 2; - } -} - -} // namespace detail -} // namespace spectral -} // namespace cuvs diff --git a/cpp/include/cuvs/spectral/detail/spectral_util.cuh b/cpp/include/cuvs/spectral/detail/spectral_util.cuh deleted file mode 100644 index c0abc77b3..000000000 --- a/cpp/include/cuvs/spectral/detail/spectral_util.cuh +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace cuvs { -namespace spectral { - -template -RAFT_KERNEL scale_obs_kernel(index_type_t m, index_type_t n, value_type_t* obs) -{ - index_type_t i, j, k, index, mm; - value_type_t alpha, v, last; - bool valid; - // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension - - // compute alpha - mm = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x); // m in multiple of blockDim.x - alpha = 0.0; - - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < mm; i += blockDim.x) { - // check if the thread is valid - valid = i < m; - - // get the value of the last thread - last = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x); - - // if you are valid read the value from memory, otherwise set your value to 0 - alpha = (valid) ? obs[i + j * m] : 0.0; - alpha = alpha * alpha; - - // do prefix sum (of size warpSize=blockDim.x =< 32) - for (k = 1; k < blockDim.x; k *= 2) { - v = __shfl_up_sync(warp_full_mask(), alpha, k, blockDim.x); - if (threadIdx.x >= k) alpha += v; - } - // shift by last - alpha += last; - } - } - - // scale by alpha - alpha = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x); - alpha = raft::sqrt(alpha); - for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) { - for (i = threadIdx.x; i < m; i += blockDim.x) { // blockDim.x=32 - index = i + j * m; - obs[index] = obs[index] / alpha; - } - } -} - -template -index_type_t next_pow2(index_type_t n) -{ - index_type_t v; - // Reference: - // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float - v = n - 1; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return v + 1; -} - -template -cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) -{ - index_type_t p2m; - - // find next power of 2 - p2m = next_pow2(m); - // setup launch configuration - unsigned int xsize = std::max(2, std::min(p2m, 32)); - dim3 nthreads{xsize, 256 / xsize, 1}; - - dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1}; - - // launch scaling kernel (scale each column of obs by its norm) - scale_obs_kernel<<>>(m, n, obs); - - return cudaSuccess; -} - -template -void transform_eigen_matrix(raft::resources const& handle, - edge_t n, - vertex_t nEigVecs, - weight_t* eigVecs) -{ - auto stream = resource::get_cuda_stream(handle); - auto cublas_h = resource::get_cublas_handle(handle); - auto thrust_exec_policy = resource::get_thrust_policy(handle); - - const weight_t zero{0.0}; - const weight_t one{1.0}; - - // Whiten eigenvector matrix - for (auto i = 0; i < nEigVecs; ++i) { - weight_t mean, std; - - mean = thrust::reduce(thrust_exec_policy, - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n))); - RAFT_CHECK_CUDA(stream); - mean /= n; - thrust::transform(thrust_exec_policy, - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(mean), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::minus()); - RAFT_CHECK_CUDA(stream); - - // TODO: Call from public API when ready - RAFT_CUBLAS_TRY( - raft::linalg::detail::cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream)); - - std /= std::sqrt(static_cast(n)); - - thrust::transform(thrust_exec_policy, - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)), - thrust::make_constant_iterator(std), - thrust::device_pointer_cast(eigVecs + IDX(0, i, n)), - thrust::divides()); - RAFT_CHECK_CUDA(stream); - } - - // Transpose eigenvector matrix - // TODO: in-place transpose - { - raft::spectral::matrix::vector_t work(handle, nEigVecs * n); - // TODO: Call from public API when ready - RAFT_CUBLAS_TRY( - raft::linalg::detail::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream)); - - // TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam(cublas_h, - CUBLAS_OP_T, - CUBLAS_OP_N, - nEigVecs, - n, - &one, - eigVecs, - n, - &zero, - (weight_t*)NULL, - nEigVecs, - work.raw(), - nEigVecs, - stream)); - - RAFT_CUDA_TRY(cudaMemcpyAsync( - eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream)); - } -} - -namespace { -/// Functor to generate indicator vectors -/** For use in Thrust transform - */ -template -struct equal_to_i_op { - const index_type_t i; - - public: - equal_to_i_op(index_type_t _i) : i(_i) {} - template - __host__ __device__ void operator()(Tuple_ t) - { - thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0; - } -}; -} // namespace - -// Construct indicator vector for ith partition -// -template -bool construct_indicator(raft::resources const& handle, - edge_t index, - edge_t n, - weight_t& clustersize, - weight_t& partStats, - vertex_t const* __restrict__ clusters, - raft::spectral::matrix::vector_t& part_i, - raft::spectral::matrix::vector_t& Bx, - raft::spectral::matrix::laplacian_matrix_t const& B) -{ - auto stream = resource::get_cuda_stream(handle); - auto cublas_h = resource::get_cublas_handle(handle); - auto thrust_exec_policy = resource::get_thrust_policy(handle); - - thrust::for_each( - thrust_exec_policy, - thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters), - thrust::device_pointer_cast(part_i.raw()))), - thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n), - thrust::device_pointer_cast(part_i.raw() + n))), - equal_to_i_op(index)); - RAFT_CHECK_CUDA(stream); - - // Compute size of ith partition - // TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot( - cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream)); - - clustersize = round(clustersize); - if (clustersize < 0.5) { return false; } - - // Compute part stats - B.mv(1, part_i.raw(), 0, Bx.raw()); - // TODO: Call from public API when ready - RAFT_CUBLAS_TRY( - raft::linalg::detail::cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream)); - - return true; -} - -} // namespace spectral -} // namespace cuvs diff --git a/cpp/include/cuvs/spectral/detail/warn_dbg.hpp b/cpp/include/cuvs/spectral/detail/warn_dbg.hpp deleted file mode 100644 index 2a9039e33..000000000 --- a/cpp/include/cuvs/spectral/detail/warn_dbg.hpp +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -#ifdef DEBUG -#define COUT() (std::cout) -#define CERR() (std::cerr) - -// nope: -// -#define WARNING(message) \ - do { \ - std::stringstream ss; \ - ss << "Warning (" << __FILE__ << ":" << __LINE__ << "): " << message; \ - CERR() << ss.str() << std::endl; \ - } while (0) -#else // DEBUG -#define WARNING(message) -#endif diff --git a/cpp/include/cuvs/spectral/eigen_solvers.cuh b/cpp/include/cuvs/spectral/eigen_solvers.cuh deleted file mode 100644 index 59e0c0d96..000000000 --- a/cpp/include/cuvs/spectral/eigen_solvers.cuh +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __EIGEN_SOLVERS_H -#define __EIGEN_SOLVERS_H - -#pragma once - -#include -#include - -namespace cuvs { -namespace spectral { - -// aggregate of control params for Eigen Solver: -// -template -struct eigen_solver_config_t { - size_type_t n_eigVecs; - size_type_t maxIter; - - size_type_t restartIter; - value_type_t tol; - - bool reorthogonalize{false}; - unsigned long long seed{ - 1234567}; // CAVEAT: this default value is now common to all instances of using seed in - // Lanczos; was not the case before: there were places where a default seed = 123456 - // was used; this may trigger slightly different # solver iterations -}; - -template -struct lanczos_solver_t { - explicit lanczos_solver_t( - eigen_solver_config_t const& config) - : config_(config) - { - } - - index_type_t solve_smallest_eigenvectors( - raft::resources const& handle, - raft::matrix::sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const - { - RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - index_type_t iters{}; - sparse::solver::computeSmallestEigenvectors(handle, - A, - config_.n_eigVecs, - config_.maxIter, - config_.restartIter, - config_.tol, - config_.reorthogonalize, - iters, - eigVals, - eigVecs, - config_.seed); - return iters; - } - - index_type_t solve_largest_eigenvectors( - raft::resources const& handle, - raft::matrix::sparse_matrix_t const& A, - value_type_t* __restrict__ eigVals, - value_type_t* __restrict__ eigVecs) const - { - RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer."); - RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer."); - index_type_t iters{}; - sparse::solver::computeLargestEigenvectors(handle, - A, - config_.n_eigVecs, - config_.maxIter, - config_.restartIter, - config_.tol, - config_.reorthogonalize, - iters, - eigVals, - eigVecs, - config_.seed); - return iters; - } - - auto const& get_config(void) const { return config_; } - - private: - eigen_solver_config_t config_; -}; - -} // namespace spectral -} // namespace cuvs - -#endif diff --git a/cpp/include/cuvs/spectral/matrix_wrappers.hpp b/cpp/include/cuvs/spectral/matrix_wrappers.hpp deleted file mode 100644 index 9d07c4cdc..000000000 --- a/cpp/include/cuvs/spectral/matrix_wrappers.hpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -// ========================================================= -// Useful macros -// ========================================================= - -namespace cuvs { -namespace spectral { -namespace matrix { - -using size_type = int; // for now; TODO: move it in appropriate header - -// specifies type of algorithm used -// for SpMv: -// -using detail::sparse_mv_alg_t; - -// Vector "view"-like aggregate for linear algebra purposes -// -using detail::vector_view_t; - -using detail::vector_t; - -using detail::sparse_matrix_t; - -using detail::laplacian_matrix_t; - -using detail::modularity_matrix_t; - -} // namespace matrix -} // namespace spectral -} // namespace cuvs diff --git a/cpp/include/cuvs/spectral/modularity_maximization.cuh b/cpp/include/cuvs/spectral/modularity_maximization.cuh deleted file mode 100644 index 6cee2086d..000000000 --- a/cpp/include/cuvs/spectral/modularity_maximization.cuh +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __MODULARITY_MAXIMIZATION_H -#define __MODULARITY_MAXIMIZATION_H - -#pragma once - -#include - -#include - -namespace cuvs { -namespace spectral { - -// ========================================================= -// Spectral modularity_maximization -// ========================================================= - -/** Compute partition for a weighted undirected graph. This - * partition attempts to minimize the cost function: - * Cost = \f$sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition) - * - * @param handle raft handle for managing expensive resources - * @param csr_m Weighted graph in CSR format - * @param eigen_solver Eigensolver implementation - * @param cluster_solver Cluster solver implementation - * @param clusters (Output, device memory, n entries) Partition - * assignments. - * @param eigVals Output eigenvalue array pointer on device - * @param eigVecs Output eigenvector array pointer on device - * @return statistics: number of eigensolver iterations, . - */ -template -std::tuple modularity_maximization( - raft::resources const& handle, - raft::matrix::sparse_matrix_t const& csr_m, - EigenSolver const& eigen_solver, - ClusterSolver const& cluster_solver, - vertex_t* __restrict__ clusters, - weight_t* eigVals, - weight_t* eigVecs) -{ - return raft::spectral::detail:: - modularity_maximization( - handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs); -} -//=================================================== -// Analysis of graph partition -// ========================================================= - -/// Compute modularity -/** This function determines the modularity based on a graph and cluster assignments - * @param handle raft handle for managing expensive resources - * @param csr_m Weighted graph in CSR format - * @param nClusters Number of clusters. - * @param clusters (Input, device memory, n entries) Cluster assignments. - * @param modularity On exit, modularity - */ -template -void analyzeModularity(raft::resources const& handle, - raft::matrix::sparse_matrix_t const& csr_m, - vertex_t nClusters, - vertex_t const* __restrict__ clusters, - weight_t& modularity) -{ - raft::spectral::detail::analyzeModularity( - handle, csr_m, nClusters, clusters, modularity); -} - -} // namespace spectral -} // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/spectral/partition.cuh b/cpp/include/cuvs/spectral/partition.cuh deleted file mode 100644 index 3f327dbfb..000000000 --- a/cpp/include/cuvs/spectral/partition.cuh +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __PARTITION_H -#define __PARTITION_H - -#pragma once - -#include - -#include - -namespace cuvs { -namespace spectral { - -// ========================================================= -// Spectral partitioner -// ========================================================= - -/// Compute spectral graph partition -/** Compute partition for a weighted undirected graph. This - * partition attempts to minimize the cost function: - * Cost = \f$sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition) - * - * @param handle raft handle for managing expensive resources - * @param csr_m Weighted graph in CSR format - * @param eigen_solver Eigensolver implementation - * @param cluster_solver Cluster solver implementation - * @param clusters (Output, device memory, n entries) Partition - * assignments. - * @param eigVals Output eigenvalue array pointer on device - * @param eigVecs Output eigenvector array pointer on device - * @return statistics: number of eigensolver iterations, . - */ -template -std::tuple partition( - raft::resources const& handle, - raft::matrix::sparse_matrix_t const& csr_m, - EigenSolver const& eigen_solver, - ClusterSolver const& cluster_solver, - vertex_t* __restrict__ clusters, - weight_t* eigVals, - weight_t* eigVecs) -{ - return raft::spectral::detail::partition( - handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs); -} - -// ========================================================= -// Analysis of graph partition -// ========================================================= - -/// Compute cost function for partition -/** This function determines the edges cut by a partition and a cost - * function: - * Cost = \f$sum_i\f$ (Edges cut by ith partition)/(Vertices in ith partition) - * Graph is assumed to be weighted and undirected. - * - * @param handle raft handle for managing expensive resources - * @param csr_m Weighted graph in CSR format - * @param nClusters Number of partitions. - * @param clusters (Input, device memory, n entries) Partition - * assignments. - * @param edgeCut On exit, weight of edges cut by partition. - * @param cost On exit, partition cost function. - */ -template -void analyzePartition(raft::resources const& handle, - raft::matrix::sparse_matrix_t const& csr_m, - vertex_t nClusters, - const vertex_t* __restrict__ clusters, - weight_t& edgeCut, - weight_t& cost) -{ - raft::spectral::detail::analyzePartition( - handle, csr_m, nClusters, clusters, edgeCut, cost); -} - -} // namespace spectral -} // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/spectral/specializations.cuh b/cpp/include/cuvs/spectral/specializations.cuh deleted file mode 100644 index 9588a7f32..000000000 --- a/cpp/include/cuvs/spectral/specializations.cuh +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#pragma message( \ - __FILE__ \ - " is deprecated and will be removed." \ - " Including specializations is not necessary any more." \ - " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html") diff --git a/cpp/include/cuvs/stats/accuracy.cuh b/cpp/include/cuvs/stats/accuracy.cuh deleted file mode 100644 index b7523449f..000000000 --- a/cpp/include/cuvs/stats/accuracy.cuh +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __STATS_ACCURACY_H -#define __STATS_ACCURACY_H - -#pragma once - -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Compute accuracy of predictions. Useful for classification. - * @tparam math_t: data type for predictions (e.g., int for classification) - * @param[in] predictions: array of predictions (GPU pointer). - * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer). - * @param[in] n: number of elements in each of predictions, ref_predictions. - * @param[in] stream: cuda stream. - * @return: Accuracy score in [0, 1]; higher is better. - */ -template -float accuracy(const math_t* predictions, const math_t* ref_predictions, int n, cudaStream_t stream) -{ - return detail::accuracy_score(predictions, ref_predictions, n, stream); -} - -/** - * @defgroup stats_accuracy Accuracy Score - * @{ - */ - -/** - * @brief Compute accuracy of predictions. Useful for classification. - * @tparam value_t: data type for predictions (e.g., int for classification) - * @tparam idx_t Index type of matrix extent. - * @param[in] handle: the raft handle. - * @param[in] predictions: array of predictions (GPU pointer). - * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer). - * @return: Accuracy score in [0, 1]; higher is better. - */ -template -float accuracy(raft::resources const& handle, - raft::device_vector_view predictions, - raft::device_vector_view ref_predictions) -{ - RAFT_EXPECTS(predictions.size() == ref_predictions.size(), "Size mismatch"); - RAFT_EXPECTS(predictions.is_exhaustive(), "predictions must be contiguous"); - RAFT_EXPECTS(ref_predictions.is_exhaustive(), "ref_predictions must be contiguous"); - - return detail::accuracy_score(predictions.data_handle(), - ref_predictions.data_handle(), - predictions.extent(0), - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_accuracy - -} // namespace stats -} // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/adjusted_rand_index.cuh b/cpp/include/cuvs/stats/adjusted_rand_index.cuh deleted file mode 100644 index 17fac4467..000000000 --- a/cpp/include/cuvs/stats/adjusted_rand_index.cuh +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file adjusted_rand_index.cuh - * @brief The adjusted Rand index is the corrected-for-chance version of the Rand index. - * Such a correction for chance establishes a baseline by using the expected similarity - * of all pair-wise comparisons between clusterings specified by a random model. - */ -#ifndef __ADJUSTED_RAND_INDEX_H -#define __ADJUSTED_RAND_INDEX_H - -#pragma once - -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Function to calculate Adjusted RandIndex - * @see https://en.wikipedia.org/wiki/Rand_index - * @tparam T data-type for input label arrays - * @tparam MathT integral data-type used for computing n-choose-r - * @param firstClusterArray: the array of classes - * @param secondClusterArray: the array of classes - * @param size: the size of the data points of type int - * @param stream: the cudaStream object - */ -template -double adjusted_rand_index(const T* firstClusterArray, - const T* secondClusterArray, - int size, - cudaStream_t stream) -{ - return detail::compute_adjusted_rand_index(firstClusterArray, secondClusterArray, size, stream); -} - -/** - * @defgroup stats_adj_rand_index Adjusted Rand Index - * @{ - */ - -/** - * @brief Function to calculate Adjusted RandIndex - * @see https://en.wikipedia.org/wiki/Rand_index - * @tparam value_t data-type for input label arrays - * @tparam math_t integral data-type used for computing n-choose-r - * @tparam idx_t Index type of matrix extent. - * @param[in] handle: the raft handle. - * @param[in] first_cluster_array: the array of classes - * @param[in] second_cluster_array: the array of classes - * @return the Adjusted RandIndex - */ -template -double adjusted_rand_index(raft::resources const& handle, - raft::device_vector_view first_cluster_array, - raft::device_vector_view second_cluster_array) -{ - RAFT_EXPECTS(first_cluster_array.size() == second_cluster_array.size(), "Size mismatch"); - RAFT_EXPECTS(first_cluster_array.is_exhaustive(), "first_cluster_array must be contiguous"); - RAFT_EXPECTS(second_cluster_array.is_exhaustive(), "second_cluster_array must be contiguous"); - - return detail::compute_adjusted_rand_index(first_cluster_array.data_handle(), - second_cluster_array.data_handle(), - first_cluster_array.extent(0), - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_adj_rand_index - -}; // end namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/completeness_score.cuh b/cpp/include/cuvs/stats/completeness_score.cuh deleted file mode 100644 index a09bf7764..000000000 --- a/cpp/include/cuvs/stats/completeness_score.cuh +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __COMPLETENESS_SCORE_H -#define __COMPLETENESS_SCORE_H - -#pragma once - -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Function to calculate the completeness score between two clusters - * - * @param truthClusterArray: the array of truth classes of type T - * @param predClusterArray: the array of predicted classes of type T - * @param size: the size of the data points of type int - * @param lower_label_range: the lower bound of the range of labels - * @param upper_label_range: the upper bound of the range of labels - * @param stream: the cudaStream object - */ -template -double completeness_score(const T* truthClusterArray, - const T* predClusterArray, - int size, - T lower_label_range, - T upper_label_range, - cudaStream_t stream) -{ - return detail::homogeneity_score( - predClusterArray, truthClusterArray, size, lower_label_range, upper_label_range, stream); -} - -/** - * @defgroup stats_completeness Completeness Score - * @{ - */ - -/** - * @brief Function to calculate the completeness score between two clusters - * - * @tparam value_t the data type - * @tparam idx_t Index type of matrix extent. - * @param[in] handle: the raft handle. - * @param[in] truth_cluster_array: the array of truth classes of type value_t - * @param[in] pred_cluster_array: the array of predicted classes of type value_t - * @param[in] lower_label_range: the lower bound of the range of labels - * @param[in] upper_label_range: the upper bound of the range of labels - * @return the cluster completeness score - */ -template -double completeness_score(raft::resources const& handle, - raft::device_vector_view truth_cluster_array, - raft::device_vector_view pred_cluster_array, - value_t lower_label_range, - value_t upper_label_range) -{ - RAFT_EXPECTS(truth_cluster_array.size() == pred_cluster_array.size(), "Size mismatch"); - RAFT_EXPECTS(truth_cluster_array.is_exhaustive(), "truth_cluster_array must be contiguous"); - RAFT_EXPECTS(pred_cluster_array.is_exhaustive(), "pred_cluster_array must be contiguous"); - return detail::homogeneity_score(pred_cluster_array.data_handle(), - truth_cluster_array.data_handle(), - truth_cluster_array.extent(0), - lower_label_range, - upper_label_range, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_completeness - -}; // end namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/contingency_matrix.cuh b/cpp/include/cuvs/stats/contingency_matrix.cuh deleted file mode 100644 index a3ff1e68b..000000000 --- a/cpp/include/cuvs/stats/contingency_matrix.cuh +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __CONTINGENCY_MATRIX_H -#define __CONTINGENCY_MATRIX_H - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief use this to allocate output matrix size - * size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int) - * @param groundTruth: device 1-d array for ground truth (num of rows) - * @param nSamples: number of elements in input array - * @param stream: cuda stream for execution - * @param minLabel: [out] calculated min value in input array - * @param maxLabel: [out] calculated max value in input array - */ -template -void getInputClassCardinality( - const T* groundTruth, const int nSamples, cudaStream_t stream, T& minLabel, T& maxLabel) -{ - detail::getInputClassCardinality(groundTruth, nSamples, stream, minLabel, maxLabel); -} - -/** - * @brief Calculate workspace size for running contingency matrix calculations - * @tparam T label type - * @tparam OutT output matrix type - * @param nSamples: number of elements in input array - * @param groundTruth: device 1-d array for ground truth (num of rows) - * @param stream: cuda stream for execution - * @param minLabel: Optional, min value in input array - * @param maxLabel: Optional, max value in input array - */ -template -size_t getContingencyMatrixWorkspaceSize(int nSamples, - const T* groundTruth, - cudaStream_t stream, - T minLabel = std::numeric_limits::max(), - T maxLabel = std::numeric_limits::max()) -{ - return detail::getContingencyMatrixWorkspaceSize( - nSamples, groundTruth, stream, minLabel, maxLabel); -} - -/** - * @brief construct contingency matrix given input ground truth and prediction - * labels. Users should call function getInputClassCardinality to find - * and allocate memory for output. Similarly workspace requirements - * should be checked using function getContingencyMatrixWorkspaceSize - * @tparam T label type - * @tparam OutT output matrix type - * @param groundTruth: device 1-d array for ground truth (num of rows) - * @param predictedLabel: device 1-d array for prediction (num of columns) - * @param nSamples: number of elements in input array - * @param outMat: output buffer for contingency matrix - * @param stream: cuda stream for execution - * @param workspace: Optional, workspace memory allocation - * @param workspaceSize: Optional, size of workspace memory - * @param minLabel: Optional, min value in input ground truth array - * @param maxLabel: Optional, max value in input ground truth array - */ -template -void contingencyMatrix(const T* groundTruth, - const T* predictedLabel, - int nSamples, - OutT* outMat, - cudaStream_t stream, - void* workspace = nullptr, - size_t workspaceSize = 0, - T minLabel = std::numeric_limits::max(), - T maxLabel = std::numeric_limits::max()) -{ - detail::contingencyMatrix(groundTruth, - predictedLabel, - nSamples, - outMat, - stream, - workspace, - workspaceSize, - minLabel, - maxLabel); -} - -/** - * @defgroup contingency_matrix Contingency Matrix - * @{ - */ - -/** - * @brief use this to allocate output matrix size - * size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int) - * @tparam value_t label type - * @tparam idx_t Index type of matrix extent. - * @param[in] handle: the raft handle. - * @param[in] groundTruth: device 1-d array for ground truth (num of rows) - * @param[out] minLabel: calculated min value in input array - * @param[out] maxLabel: calculated max value in input array - */ -template -void get_input_class_cardinality(raft::resources const& handle, - raft::device_vector_view groundTruth, - raft::host_scalar_view minLabel, - raft::host_scalar_view maxLabel) -{ - RAFT_EXPECTS(minLabel.data_handle() != nullptr, "Invalid minLabel pointer"); - RAFT_EXPECTS(maxLabel.data_handle() != nullptr, "Invalid maxLabel pointer"); - detail::getInputClassCardinality(groundTruth.data_handle(), - groundTruth.extent(0), - resource::get_cuda_stream(handle), - *minLabel.data_handle(), - *maxLabel.data_handle()); -} - -/** - * @brief construct contingency matrix given input ground truth and prediction - * labels. Users should call function getInputClassCardinality to find - * and allocate memory for output. Similarly workspace requirements - * should be checked using function getContingencyMatrixWorkspaceSize - * @tparam value_t label type - * @tparam out_t output matrix type - * @tparam idx_t Index type of matrix extent. - * @tparam layout_t Layout type of the input data. - * @tparam opt_min_label_t std::optional @c opt_min_label - * @tparam opt_max_label_t std::optional @c opt_max_label - * @param[in] handle: the raft handle. - * @param[in] ground_truth: device 1-d array for ground truth (num of rows) - * @param[in] predicted_label: device 1-d array for prediction (num of columns) - * @param[out] out_mat: output buffer for contingency matrix - * @param[in] opt_min_label: std::optional, min value in input ground truth array - * @param[in] opt_max_label: std::optional, max value in input ground truth array - */ -template -void contingency_matrix(raft::resources const& handle, - raft::device_vector_view ground_truth, - raft::device_vector_view predicted_label, - raft::device_matrix_view out_mat, - opt_min_label_t&& opt_min_label, - opt_max_label_t&& opt_max_label) -{ - std::optional min_label = std::forward(opt_min_label); - std::optional max_label = std::forward(opt_max_label); - - RAFT_EXPECTS(ground_truth.size() == predicted_label.size(), "Size mismatch"); - RAFT_EXPECTS(ground_truth.is_exhaustive(), "ground_truth must be contiguous"); - RAFT_EXPECTS(predicted_label.is_exhaustive(), "predicted_label must be contiguous"); - RAFT_EXPECTS(out_mat.is_exhaustive(), "out_mat must be contiguous"); - - value_t min_label_value = std::numeric_limits::max(); - value_t max_label_value = std::numeric_limits::max(); - if (min_label.has_value()) { min_label_value = min_label.value(); } - if (max_label.has_value()) { max_label_value = max_label.value(); } - - auto workspace_sz = detail::getContingencyMatrixWorkspaceSize(ground_truth.extent(0), - ground_truth.data_handle(), - resource::get_cuda_stream(handle), - min_label_value, - max_label_value); - auto workspace = raft::make_device_vector(handle, workspace_sz); - - detail::contingencyMatrix(ground_truth.data_handle(), - predicted_label.data_handle(), - ground_truth.extent(0), - out_mat.data_handle(), - resource::get_cuda_stream(handle), - workspace.data_handle(), - workspace_sz, - min_label_value, - max_label_value); -} - -/** @} */ // end group contingency_matrix - -/** - * @brief Overload of `contingency_matrix` to help the - * compiler find the above overload, in case users pass in - * `std::nullopt` for the optional arguments. - * - * Please see above for documentation of `contingency_matrix`. - */ -template > -void contingency_matrix(Args... args) -{ - contingency_matrix(std::forward(args)..., std::nullopt, std::nullopt); -} -}; // namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/cov.cuh b/cpp/include/cuvs/stats/cov.cuh deleted file mode 100644 index 037bdbc8e..000000000 --- a/cpp/include/cuvs/stats/cov.cuh +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __COV_H -#define __COV_H - -#pragma once - -#include -#include -#include -namespace cuvs { -namespace stats { -/** - * @brief Compute covariance of the input matrix - * - * Mean operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @param covar the output covariance matrix - * @param data the input matrix (this will get mean-centered at the end!) - * @param mu mean vector of the input matrix - * @param D number of columns of data - * @param N number of rows of data - * @param sample whether to evaluate sample covariance or not. In other words, - * whether to normalize the output using N-1 or N, for true or false, - * respectively - * @param rowMajor whether the input data is row or col major - * @param stable whether to run the slower-but-numerically-stable version or not - * @param handle cublas handle - * @param stream cuda stream - * @note if stable=true, then the input data will be mean centered after this - * function returns! - */ -template -void cov(raft::resources const& handle, - Type* covar, - Type* data, - const Type* mu, - std::size_t D, - std::size_t N, - bool sample, - bool rowMajor, - bool stable, - cudaStream_t stream) -{ - detail::cov(handle, covar, data, mu, D, N, sample, rowMajor, stable, stream); -} - -/** - * @defgroup stats_cov Covariance Matrix Construction - * @{ - */ - -/** - * @brief Compute covariance of the input matrix - * - * Mean operation is assumed to be performed on a given column. - * - * @tparam value_t the data type - * @tparam idx_t the index type - * @tparam layout_t Layout type of the input data. - * @param[in] handle the raft handle - * @param[in] data the input matrix (this will get mean-centered at the end!) - * (length = nrows * ncols) - * @param[in] mu mean vector of the input matrix (length = ncols) - * @param[out] covar the output covariance matrix (length = ncols * ncols) - * @param[in] sample whether to evaluate sample covariance or not. In other words, - * whether to normalize the output using N-1 or N, for true or false, - * respectively - * @param[in] stable whether to run the slower-but-numerically-stable version or not - * @note if stable=true, then the input data will be mean centered after this - * function returns! - */ -template -void cov(raft::resources const& handle, - raft::device_matrix_view data, - raft::device_vector_view mu, - raft::device_matrix_view covar, - bool sample, - bool stable) -{ - static_assert( - std::is_same_v || std::is_same_v, - "Data layout not supported"); - RAFT_EXPECTS(data.extent(1) == covar.extent(0) && data.extent(1) == covar.extent(1), - "Size mismatch"); - RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous"); - RAFT_EXPECTS(covar.is_exhaustive(), "covar must be contiguous"); - RAFT_EXPECTS(mu.is_exhaustive(), "mu must be contiguous"); - - detail::cov(handle, - covar.data_handle(), - data.data_handle(), - mu.data_handle(), - data.extent(1), - data.extent(0), - std::is_same_v, - sample, - stable, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_cov - -}; // end namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/detail/adjusted_rand_index.cuh b/cpp/include/cuvs/stats/detail/adjusted_rand_index.cuh deleted file mode 100644 index 52e7a323d..000000000 --- a/cpp/include/cuvs/stats/detail/adjusted_rand_index.cuh +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file adjusted_rand_index.cuh - * @brief The adjusted Rand index is the corrected-for-chance version of the Rand index. - * Such a correction for chance establishes a baseline by using the expected similarity - * of all pair-wise comparisons between clusterings specified by a random model. - */ - -#pragma once - -#include "contingencyMatrix.cuh" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -namespace cuvs { -namespace stats { -namespace detail { - -/** - * @brief Lambda to calculate the number of unordered pairs in a given input - * - * @tparam Type: Data type of the input - * @param in: the input to the functional mapping - * @param i: the indexing(not used in this case) - */ -template -struct nCTwo { - HDI Type operator()(Type in, int i = 0) - { - return in % 2 ? ((in - 1) >> 1) * in : (in >> 1) * (in - 1); - } -}; - -template -struct Binner { - Binner(DataT minL) : minLabel(minL) {} - - DI int operator()(DataT val, IdxT row, IdxT col) { return int(val - minLabel); } - - private: - DataT minLabel; -}; // struct Binner - -/** - * @brief Function to count the number of unique elements in the input array - * - * @tparam T data-type for input arrays - * - * @param[in] arr input array [on device] [len = size] - * @param[in] size the size of the input array - * @param[out] minLabel the lower bound of the range of labels - * @param[out] maxLabel the upper bound of the range of labels - * @param[in] stream cuda stream - * - * @return the number of unique elements in the array - */ -template -int countUnique(const T* arr, int size, T& minLabel, T& maxLabel, cudaStream_t stream) -{ - auto ptr = thrust::device_pointer_cast(arr); - auto minmax = thrust::minmax_element(thrust::cuda::par.on(stream), ptr, ptr + size); - minLabel = *minmax.first; - maxLabel = *minmax.second; - auto totalLabels = int(maxLabel - minLabel + 1); - rmm::device_uvector labelCounts(totalLabels, stream); - rmm::device_scalar nUniq(stream); - raft::stats::histogram( - raft::stats::HistTypeAuto, - labelCounts.data(), - totalLabels, - arr, - size, - 1, - stream, - [minLabel] __device__(T val, int row, int col) { return int(val - minLabel); }); - raft::linalg::mapThenSumReduce( - nUniq.data(), - totalLabels, - [] __device__(const T& val) { return val != 0; }, - stream, - labelCounts.data()); - auto numUniques = nUniq.value(stream); - return numUniques; -} - -/** - * @brief Function to calculate Adjusted RandIndex as described - * here - * @tparam T data-type for input label arrays - * @tparam MathT integral data-type used for computing n-choose-r - * @param firstClusterArray: the array of classes - * @param secondClusterArray: the array of classes - * @param size: the size of the data points of type int - * @param stream: the cudaStream object - */ -template -double compute_adjusted_rand_index(const T* firstClusterArray, - const T* secondClusterArray, - int size, - cudaStream_t stream) -{ - ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); - T minFirst, maxFirst, minSecond, maxSecond; - auto nUniqFirst = countUnique(firstClusterArray, size, minFirst, maxFirst, stream); - auto nUniqSecond = countUnique(secondClusterArray, size, minSecond, maxSecond, stream); - auto lowerLabelRange = std::min(minFirst, minSecond); - auto upperLabelRange = std::max(maxFirst, maxSecond); - auto nClasses = upperLabelRange - lowerLabelRange + 1; - // degenerate case of single cluster or clusters each with just one element - if (nUniqFirst == nUniqSecond) { - if (nUniqFirst == 1 || nUniqFirst == size) return 1.0; - } - auto nUniqClasses = MathT(nClasses); - rmm::device_uvector dContingencyMatrix(nUniqClasses * nUniqClasses, stream); - RAFT_CUDA_TRY(cudaMemsetAsync( - dContingencyMatrix.data(), 0, nUniqClasses * nUniqClasses * sizeof(MathT), stream)); - auto workspaceSz = getContingencyMatrixWorkspaceSize( - size, firstClusterArray, stream, lowerLabelRange, upperLabelRange); - rmm::device_uvector workspaceBuff(workspaceSz, stream); - contingencyMatrix(firstClusterArray, - secondClusterArray, - size, - dContingencyMatrix.data(), - stream, - workspaceBuff.data(), - workspaceSz, - lowerLabelRange, - upperLabelRange); - rmm::device_uvector a(nUniqClasses, stream); - rmm::device_uvector b(nUniqClasses, stream); - rmm::device_scalar d_aCTwoSum(stream); - rmm::device_scalar d_bCTwoSum(stream); - rmm::device_scalar d_nChooseTwoSum(stream); - MathT h_aCTwoSum, h_bCTwoSum, h_nChooseTwoSum; - RAFT_CUDA_TRY(cudaMemsetAsync(a.data(), 0, nUniqClasses * sizeof(MathT), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(b.data(), 0, nUniqClasses * sizeof(MathT), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(d_aCTwoSum.data(), 0, sizeof(MathT), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(d_bCTwoSum.data(), 0, sizeof(MathT), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(d_nChooseTwoSum.data(), 0, sizeof(MathT), stream)); - // calculating the sum of NijC2 - raft::linalg::mapThenSumReduce>(d_nChooseTwoSum.data(), - nUniqClasses * nUniqClasses, - nCTwo(), - stream, - dContingencyMatrix.data(), - dContingencyMatrix.data()); - // calculating the row-wise sums - raft::linalg::reduce( - a.data(), dContingencyMatrix.data(), nUniqClasses, nUniqClasses, 0, true, true, stream); - // calculating the column-wise sums - raft::linalg::reduce( - b.data(), dContingencyMatrix.data(), nUniqClasses, nUniqClasses, 0, true, false, stream); - // calculating the sum of number of unordered pairs for every element in a - raft::linalg::mapThenSumReduce>( - d_aCTwoSum.data(), nUniqClasses, nCTwo(), stream, a.data(), a.data()); - // calculating the sum of number of unordered pairs for every element of b - raft::linalg::mapThenSumReduce>( - d_bCTwoSum.data(), nUniqClasses, nCTwo(), stream, b.data(), b.data()); - // updating in the host memory - raft::update_host(&h_nChooseTwoSum, d_nChooseTwoSum.data(), 1, stream); - raft::update_host(&h_aCTwoSum, d_aCTwoSum.data(), 1, stream); - raft::update_host(&h_bCTwoSum, d_bCTwoSum.data(), 1, stream); - // calculating the ARI - auto nChooseTwo = double(size) * double(size - 1) / 2.0; - auto expectedIndex = double(h_aCTwoSum) * double(h_bCTwoSum) / double(nChooseTwo); - auto maxIndex = (double(h_bCTwoSum) + double(h_aCTwoSum)) / 2.0; - auto index = double(h_nChooseTwoSum); - if (maxIndex - expectedIndex) - return (index - expectedIndex) / (maxIndex - expectedIndex); - else - return 0; -} - -}; // end namespace detail -}; // end namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/batched/information_criterion.cuh b/cpp/include/cuvs/stats/detail/batched/information_criterion.cuh deleted file mode 100644 index 50853d601..000000000 --- a/cpp/include/cuvs/stats/detail/batched/information_criterion.cuh +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include -#include - -#include - -namespace cuvs { -namespace stats { -namespace batched { -namespace detail { - -/** - * Compute the given type of information criterion - * - * @note: it is safe to do the computation in-place (i.e give same pointer - * as input and output) - * - * @param[out] d_ic Information criterion to be returned for each - * series (device) - * @param[in] d_loglikelihood Log-likelihood for each series (device) - * @param[in] ic_type Type of criterion to compute. See IC_Type - * @param[in] n_params Number of parameters in the model - * @param[in] batch_size Number of series in the batch - * @param[in] n_samples Number of samples in each series - * @param[in] stream CUDA stream - */ -template -void information_criterion(ScalarT* d_ic, - const ScalarT* d_loglikelihood, - IC_Type ic_type, - IdxT n_params, - IdxT batch_size, - IdxT n_samples, - cudaStream_t stream) -{ - ScalarT ic_base{}; - ScalarT N = static_cast(n_params); - ScalarT T = static_cast(n_samples); - switch (ic_type) { - case AIC: ic_base = (ScalarT)2.0 * N; break; - case AICc: - ic_base = (ScalarT)2.0 * (N + (N * (N + (ScalarT)1.0)) / (T - N - (ScalarT)1.0)); - break; - case BIC: ic_base = std::log(T) * N; break; - } - /* Compute information criterion from log-likelihood and base term */ - raft::linalg::unaryOp( - d_ic, - d_loglikelihood, - batch_size, - [=] __device__(ScalarT loglike) { return ic_base - (ScalarT)2.0 * loglike; }, - stream); -} - -} // namespace detail -} // namespace batched -} // namespace stats -} // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/batched/silhouette_score.cuh b/cpp/include/cuvs/stats/detail/batched/silhouette_score.cuh deleted file mode 100644 index 241c47986..000000000 --- a/cpp/include/cuvs/stats/detail/batched/silhouette_score.cuh +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "../silhouette_score.cuh" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cuvs { -namespace stats { -namespace batched { -namespace detail { - -/** - * This kernel initializes matrix b (n_rows * n_labels) - * For each label that the corresponding row is not a part of is initialized as 0 - * If the corresponding row is the only sample in its label, again 0 - * Only if the there are > 1 samples in the label, row is initialized to max - */ -template -RAFT_KERNEL fill_b_kernel(value_t* b, - const label_idx* y, - value_idx n_rows, - label_idx n_labels, - const value_idx* cluster_counts) -{ - value_idx idx = threadIdx.x + blockIdx.x * blockDim.x; - label_idx idy = threadIdx.y + blockIdx.y * blockDim.y; - - if (idx >= n_rows || idy >= n_labels) { return; } - - auto row_cluster = y[idx]; - - auto col_cluster_count = cluster_counts[idy]; - - // b for own cluster should be max value - // so that it does not interfere with min operator - // b is also max if col cluster count is 0 - // however, b is 0 if self cluster count is 1 - if (row_cluster == idy || col_cluster_count == 0) { - if (cluster_counts[row_cluster] == 1) { - b[idx * n_labels + idy] = 0; - } else { - b[idx * n_labels + idy] = std::numeric_limits::max(); - } - } else { - b[idx * n_labels + idy] = 0; - } -} - -/** - * This kernel does an elementwise sweep of chunked pairwise distance matrix - * By knowing the offsets of the chunked pairwise distance matrix in the - * global pairwise distance matrix, we are able to calculate - * intermediate values of a and b for the rows and columns present in the - * current chunked pairwise distance matrix. - */ -template -RAFT_KERNEL compute_chunked_a_b_kernel(value_t* a, - value_t* b, - value_idx row_offset, - value_idx col_offset, - const label_idx* y, - label_idx n_labels, - const value_idx* cluster_counts, - const value_t* distances, - value_idx dist_rows, - value_idx dist_cols) -{ - value_idx row_id = threadIdx.x + blockIdx.x * blockDim.x; - value_idx col_id = threadIdx.y + blockIdx.y * blockDim.y; - - // these are global offsets of current element - // in the full pairwise distance matrix - value_idx pw_row_id = row_id + row_offset; - value_idx pw_col_id = col_id + col_offset; - - if (row_id >= dist_rows || col_id >= dist_cols || pw_row_id == pw_col_id) { return; } - - auto row_cluster = y[pw_row_id]; - if (cluster_counts[row_cluster] == 1) { return; } - - auto col_cluster = y[pw_col_id]; - auto col_cluster_counts = cluster_counts[col_cluster]; - - if (col_cluster == row_cluster) { - atomicAdd(&a[pw_row_id], distances[row_id * dist_cols + col_id] / (col_cluster_counts - 1)); - } else { - atomicAdd(&b[pw_row_id * n_labels + col_cluster], - distances[row_id * dist_cols + col_id] / col_cluster_counts); - } -} - -template -rmm::device_uvector get_cluster_counts(raft::resources const& handle, - const label_idx* y, - value_idx& n_rows, - label_idx& n_labels) -{ - auto stream = resource::get_cuda_stream(handle); - - rmm::device_uvector cluster_counts(n_labels, stream); - - rmm::device_uvector workspace(1, stream); - - raft::stats::detail::countLabels(y, cluster_counts.data(), n_rows, n_labels, workspace, stream); - - return cluster_counts; -} - -template -rmm::device_uvector get_pairwise_distance(raft::resources const& handle, - const value_t* left_begin, - const value_t* right_begin, - value_idx& n_left_rows, - value_idx& n_right_rows, - value_idx& n_cols, - cuvs::distance::DistanceType metric, - cudaStream_t stream) -{ - rmm::device_uvector distances(n_left_rows * n_right_rows, stream); - - cuvs::distance::pairwise_distance( - handle, left_begin, right_begin, distances.data(), n_left_rows, n_right_rows, n_cols, metric); - - return distances; -} - -template -void compute_chunked_a_b(raft::resources const& handle, - value_t* a, - value_t* b, - value_idx& row_offset, - value_idx& col_offset, - const label_idx* y, - label_idx& n_labels, - const value_idx* cluster_counts, - const value_t* distances, - value_idx& dist_rows, - value_idx& dist_cols, - cudaStream_t stream) -{ - dim3 block_size(std::min(dist_rows, 32), std::min(dist_cols, 32)); - dim3 grid_size(raft::ceildiv(dist_rows, (value_idx)block_size.x), - raft::ceildiv(dist_cols, (value_idx)block_size.y)); - - detail::compute_chunked_a_b_kernel<<>>( - a, b, row_offset, col_offset, y, n_labels, cluster_counts, distances, dist_rows, dist_cols); -} - -template -value_t silhouette_score( - raft::resources const& handle, - const value_t* X, - value_idx n_rows, - value_idx n_cols, - const label_idx* y, - label_idx n_labels, - value_t* scores, - value_idx chunk, - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded) -{ - ASSERT(n_labels >= 2 && n_labels <= (n_rows - 1), - "silhouette Score not defined for the given number of labels!"); - - rmm::device_uvector cluster_counts = get_cluster_counts(handle, y, n_rows, n_labels); - - auto stream = resource::get_cuda_stream(handle); - auto policy = resource::get_thrust_policy(handle); - - auto b_size = n_rows * n_labels; - - value_t *a_ptr, *b_ptr; - rmm::device_uvector a(0, stream); - rmm::device_uvector b(b_size, stream); - - b_ptr = b.data(); - - // since a and silhouette score per sample are same size, reusing - if (scores == nullptr || scores == NULL) { - a.resize(n_rows, stream); - a_ptr = a.data(); - } else { - a_ptr = scores; - } - - thrust::fill(policy, a_ptr, a_ptr + n_rows, 0); - - dim3 block_size(std::min(n_rows, 32), std::min(n_labels, 32)); - dim3 grid_size(raft::ceildiv(n_rows, (value_idx)block_size.x), - raft::ceildiv(n_labels, (label_idx)block_size.y)); - detail::fill_b_kernel<<>>( - b_ptr, y, n_rows, n_labels, cluster_counts.data()); - - resource::wait_stream_pool_on_stream(handle); - - auto n_iters = 0; - - for (value_idx i = 0; i < n_rows; i += chunk) { - for (value_idx j = 0; j < n_rows; j += chunk) { - ++n_iters; - - auto chunk_stream = resource::get_next_usable_stream(handle, i + chunk * j); - - const auto* left_begin = X + (i * n_cols); - const auto* right_begin = X + (j * n_cols); - - auto n_left_rows = (i + chunk) < n_rows ? chunk : (n_rows - i); - auto n_right_rows = (j + chunk) < n_rows ? chunk : (n_rows - j); - - rmm::device_uvector distances = get_pairwise_distance( - handle, left_begin, right_begin, n_left_rows, n_right_rows, n_cols, metric, chunk_stream); - - compute_chunked_a_b(handle, - a_ptr, - b_ptr, - i, - j, - y, - n_labels, - cluster_counts.data(), - distances.data(), - n_left_rows, - n_right_rows, - chunk_stream); - } - } - - resource::sync_stream_pool(handle); - - // calculating row-wise minimum in b - // this prim only supports int indices for now - raft::linalg::reduce( - b_ptr, - b_ptr, - n_labels, - n_rows, - std::numeric_limits::max(), - true, - true, - stream, - false, - raft::identity_op(), - raft::min_op()); - - // calculating the silhouette score per sample - raft::linalg::binaryOp, value_t, value_idx>( - a_ptr, a_ptr, b_ptr, n_rows, raft::stats::detail::SilOp(), stream); - - return thrust::reduce(policy, a_ptr, a_ptr + n_rows, value_t(0)) / n_rows; -} - -} // namespace detail -} // namespace batched -} // namespace stats -} // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/contingencyMatrix.cuh b/cpp/include/cuvs/stats/detail/contingencyMatrix.cuh deleted file mode 100644 index 6aa5b6789..000000000 --- a/cpp/include/cuvs/stats/detail/contingencyMatrix.cuh +++ /dev/null @@ -1,316 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include -#include -#include -#include - -#include - -#include - -namespace cuvs { -namespace stats { -namespace detail { - -typedef enum { - IMPL_NONE, - SMEM_ATOMICS, - GLOBAL_ATOMICS, - SORT_AND_GATOMICS -} ContingencyMatrixImplType; - -template -RAFT_KERNEL devConstructContingencyMatrix(const T* groundTruth, - const T* predicted, - int nSamples, - OutT* outMat, - int outIdxOffset, - int outMatWidth) -{ - int elementId = threadIdx.x + blockDim.x * blockIdx.x; - if (elementId < nSamples) { - T gt = groundTruth[elementId]; - T pd = predicted[elementId]; - auto outputIdx = (gt - outIdxOffset) * outMatWidth + pd - outIdxOffset; - raft::myAtomicAdd(outMat + outputIdx, OutT(1)); - } -} - -template -void computeCMatWAtomics(const T* groundTruth, - const T* predictedLabel, - int nSamples, - OutT* outMat, - int outIdxOffset, - int outDimN, - cudaStream_t stream) -{ - RAFT_CUDA_TRY( - cudaFuncSetCacheConfig(devConstructContingencyMatrix, cudaFuncCachePreferL1)); - static const int block = 128; - auto grid = raft::ceildiv(nSamples, block); - devConstructContingencyMatrix<<>>( - groundTruth, predictedLabel, nSamples, outMat, outIdxOffset, outDimN); - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -RAFT_KERNEL devConstructContingencyMatrixSmem(const T* groundTruth, - const T* predicted, - int nSamples, - OutT* outMat, - int outIdxOffset, - int outMatWidth) -{ - extern __shared__ char smem[]; - auto* sMemMatrix = reinterpret_cast(smem); - for (int smemIdx = threadIdx.x; smemIdx < outMatWidth * outMatWidth; smemIdx += blockDim.x) { - sMemMatrix[smemIdx] = 0; - } - __syncthreads(); - int elementId = threadIdx.x + blockDim.x * blockIdx.x; - if (elementId < nSamples) { - T gt = groundTruth[elementId]; - T pd = predicted[elementId]; - auto outputIdx = (gt - outIdxOffset) * outMatWidth + pd - outIdxOffset; - raft::myAtomicAdd(sMemMatrix + outputIdx, OutT(1)); - } - __syncthreads(); - for (int smemIdx = threadIdx.x; smemIdx < outMatWidth * outMatWidth; smemIdx += blockDim.x) { - raft::myAtomicAdd(outMat + smemIdx, sMemMatrix[smemIdx]); - } -} - -template -void computeCMatWSmemAtomics(const T* groundTruth, - const T* predictedLabel, - int nSamples, - OutT* outMat, - int outIdxOffset, - int outDimN, - cudaStream_t stream) -{ - static const int block = 128; - auto grid = raft::ceildiv(nSamples, block); - size_t smemSizePerBlock = outDimN * outDimN * sizeof(OutT); - devConstructContingencyMatrixSmem<<>>( - groundTruth, predictedLabel, nSamples, outMat, outIdxOffset, outDimN); - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void contingencyMatrixWSort(const T* groundTruth, - const T* predictedLabel, - int nSamples, - OutT* outMat, - T minLabel, - T maxLabel, - void* workspace, - size_t workspaceSize, - cudaStream_t stream) -{ - T* outKeys = reinterpret_cast(workspace); - auto alignedBufferSz = raft::alignTo(nSamples * sizeof(T), 256); - T* outValue = reinterpret_cast((size_t)workspace + alignedBufferSz); - void* pWorkspaceCub = reinterpret_cast((size_t)workspace + 2 * alignedBufferSz); - auto bitsToSort = log2(maxLabel); - if (!raft::isPo2(maxLabel)) ++bitsToSort; - // we dont really need perfect sorting, should get by with some sort of - // binning-reordering operation - ///@todo: future work - explore "efficient" custom binning kernels vs cub sort - RAFT_CUDA_TRY(cub::DeviceRadixSort::SortPairs(pWorkspaceCub, - workspaceSize, - groundTruth, - outKeys, - predictedLabel, - outValue, - nSamples, - 0, - bitsToSort, - stream)); - auto outDimM_N = int(maxLabel - minLabel + 1); - computeCMatWAtomics(outKeys, outValue, nSamples, outMat, minLabel, outDimM_N, stream); -} - -template -ContingencyMatrixImplType getImplVersion(OutT outDimN) -{ - int currDevice = 0; - int l2CacheSize = 0; - // no way to query this from CUDA APIs, value for CC 7.0, 3.0 - int maxBlocksResidentPerSM = 16; - RAFT_CUDA_TRY(cudaGetDevice(&currDevice)); - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&l2CacheSize, cudaDevAttrL2CacheSize, currDevice)); - auto maxSmemPerBlock = raft::getSharedMemPerBlock(); - ContingencyMatrixImplType implVersion = IMPL_NONE; - // keeping 8 block per SM to get good utilization - // can go higher but reduced L1 size degrades perf - OutT upperLimitSmemAtomics = - std::floor(std::sqrt(maxSmemPerBlock / (sizeof(OutT) * (maxBlocksResidentPerSM / 2)))); - OutT upperLimitL2Atomics = std::floor(std::sqrt(l2CacheSize / sizeof(OutT))); - if (outDimN <= upperLimitSmemAtomics) - implVersion = SMEM_ATOMICS; - else if (outDimN <= upperLimitL2Atomics) - implVersion = GLOBAL_ATOMICS; - else - implVersion = SORT_AND_GATOMICS; - return implVersion; -} - -/** - * @brief use this to allocate output matrix size - * size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int) - * @param groundTruth: device 1-d array for ground truth (num of rows) - * @param nSamples: number of elements in input array - * @param stream: cuda stream for execution - * @param minLabel: [out] calculated min value in input array - * @param maxLabel: [out] calculated max value in input array - */ -template -void getInputClassCardinality( - const T* groundTruth, const int nSamples, cudaStream_t stream, T& minLabel, T& maxLabel) -{ - thrust::device_ptr dTrueLabel = thrust::device_pointer_cast(groundTruth); - auto min_max = - thrust::minmax_element(thrust::cuda::par.on(stream), dTrueLabel, dTrueLabel + nSamples); - minLabel = *min_max.first; - maxLabel = *min_max.second; -} - -/** - * @brief Calculate workspace size for running contingency matrix calculations - * @tparam T label type - * @tparam OutT output matrix type - * @param nSamples: number of elements in input array - * @param groundTruth: device 1-d array for ground truth (num of rows) - * @param stream: cuda stream for execution - * @param minLabel: Optional, min value in input array - * @param maxLabel: Optional, max value in input array - */ -template -size_t getContingencyMatrixWorkspaceSize(int nSamples, - const T* groundTruth, - cudaStream_t stream, - T minLabel = std::numeric_limits::max(), - T maxLabel = std::numeric_limits::max()) -{ - size_t workspaceSize = 0; - // below is a redundant computation - can be avoided - if (minLabel == std::numeric_limits::max() || maxLabel == std::numeric_limits::max()) { - getInputClassCardinality(groundTruth, nSamples, stream, minLabel, maxLabel); - } - auto outDimN = OutT(maxLabel - minLabel + 1); - ContingencyMatrixImplType implVersion = getImplVersion(outDimN); - if (implVersion == SORT_AND_GATOMICS) { - void* pWorkspaceCub{}; - size_t tmpStorageBytes = 0; - // no-op pointers to get workspace size - T* pTmpUnused{}; - RAFT_CUDA_TRY(cub::DeviceRadixSort::SortPairs( - pWorkspaceCub, tmpStorageBytes, pTmpUnused, pTmpUnused, pTmpUnused, pTmpUnused, nSamples)); - auto tmpStagingMemorySize = raft::alignTo(nSamples * sizeof(T), 256); - tmpStagingMemorySize *= 2; - workspaceSize = tmpStagingMemorySize + tmpStorageBytes; - } - return workspaceSize; -} - -/** - * @brief construct contingency matrix given input ground truth and prediction - * labels. Users should call function getInputClassCardinality to find - * and allocate memory for output. Similarly workspace requirements - * should be checked using function getContingencyMatrixWorkspaceSize - * @tparam T label type - * @tparam OutT output matrix type - * @param groundTruth: device 1-d array for ground truth (num of rows) - * @param predictedLabel: device 1-d array for prediction (num of columns) - * @param nSamples: number of elements in input array - * @param outMat: output buffer for contingecy matrix - * @param stream: cuda stream for execution - * @param workspace: Optional, workspace memory allocation - * @param workspaceSize: Optional, size of workspace memory - * @param minLabel: Optional, min value in input ground truth array - * @param maxLabel: Optional, max value in input ground truth array - */ -template -void contingencyMatrix(const T* groundTruth, - const T* predictedLabel, - int nSamples, - OutT* outMat, - cudaStream_t stream, - void* workspace = nullptr, - size_t workspaceSize = 0, - T minLabel = std::numeric_limits::max(), - T maxLabel = std::numeric_limits::max()) -{ - // assumptions: - // output is not at par with scikit learn - output will be square matrix - // always with numRows = numColumns = numOfClassesInTrueLabel - // it is also assumed that true labels are monotically increasing - // if for some reason groundTruth completely skips some labels - // eg: {0,1,2,5} instead of {0,1,2,3}. - // Output matrix will still have empty rows for label value {3,4} - // Users can use "make_monotonic" to convert their discontinuous input label - // range to a monotonically increasing one // - // this also serves as way to measure co-occurrence/joint counts for NLP tasks which - // can be used to then compute pointwise mutual information and mutual information - if (minLabel == std::numeric_limits::max() || maxLabel == std::numeric_limits::max()) { - getInputClassCardinality(groundTruth, nSamples, stream, minLabel, maxLabel); - } - auto outDimM_N = OutT(maxLabel - minLabel + 1); - RAFT_CUDA_TRY(cudaMemsetAsync(outMat, 0, sizeof(OutT) * outDimM_N * outDimM_N, stream)); - ContingencyMatrixImplType implVersion = getImplVersion(outDimM_N); - switch (implVersion) { - case SMEM_ATOMICS: - // smem atomics and then single global mem atomics only works - // when all label count can fit in smem for a block - // helps when GLOBAL_ATOMICS performance blocked by atomic update - // serialization -when very less labels ~10 labels - computeCMatWSmemAtomics( - groundTruth, predictedLabel, nSamples, outMat, minLabel, outDimM_N, stream); - break; - case GLOBAL_ATOMICS: - // launch kernel - global atomic ops per (groundTruth,predictedValue) pair - computeCMatWAtomics( - groundTruth, predictedLabel, nSamples, outMat, minLabel, outDimM_N, stream); - break; - // more L2 thrashing if atomic OPs land in completely different mem - // segment - when more labels - case SORT_AND_GATOMICS: - contingencyMatrixWSort(groundTruth, - predictedLabel, - nSamples, - outMat, - minLabel, - maxLabel, - workspace, - workspaceSize, - stream); - break; - case IMPL_NONE: break; - } -} - -}; // namespace detail -}; // namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/cov.cuh b/cpp/include/cuvs/stats/detail/cov.cuh deleted file mode 100644 index 2a76b103d..000000000 --- a/cpp/include/cuvs/stats/detail/cov.cuh +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace cuvs { -namespace stats { -namespace detail { -/** - * @brief Compute covariance of the input matrix - * - * Mean operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @param covar the output covariance matrix - * @param data the input matrix (this will get mean-centered at the end!) - * @param mu mean vector of the input matrix - * @param D number of columns of data - * @param N number of rows of data - * @param sample whether to evaluate sample covariance or not. In other words, - * whether to normalize the output using N-1 or N, for true or false, - * respectively - * @param rowMajor whether the input data is row or col major - * @param stable whether to run the slower-but-numerically-stable version or not - * @param handle cublas handle - * @param stream cuda stream - * @note if stable=true, then the input data will be mean centered after this - * function returns! - */ -template -void cov(raft::resources const& handle, - Type* covar, - Type* data, - const Type* mu, - std::size_t D, - std::size_t N, - bool sample, - bool rowMajor, - bool stable, - cudaStream_t stream) -{ - if (stable) { - cublasHandle_t cublas_h = resource::get_cublas_handle(handle); - - // since mean operation is assumed to be along a given column, broadcast - // must be along rows! - raft::stats::meanCenter(data, data, mu, D, N, rowMajor, true, stream); - Type alpha = Type(1) / (sample ? Type(N - 1) : Type(N)); - Type beta = Type(0); - if (rowMajor) { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h, - CUBLAS_OP_N, - CUBLAS_OP_T, - D, - D, - N, - &alpha, - data, - D, - data, - D, - &beta, - covar, - D, - stream)); - } else { - raft::linalg::gemm( - handle, data, N, D, data, covar, D, D, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream); - } - } else { - ///@todo: implement this using cutlass + customized epilogue! - ASSERT(false, "cov: Implement stable=false case!"); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} -}; // end namespace detail -}; // end namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/dispersion.cuh b/cpp/include/cuvs/stats/detail/dispersion.cuh deleted file mode 100644 index 221fe5467..000000000 --- a/cpp/include/cuvs/stats/detail/dispersion.cuh +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace cuvs { -namespace stats { -namespace detail { - -///@todo: ColsPerBlk has been tested only for 32! -template -RAFT_KERNEL weightedMeanKernel(DataT* mu, const DataT* data, const IdxT* counts, IdxT D, IdxT N) -{ - constexpr int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxT thisColId = threadIdx.x % ColsPerBlk; - IdxT thisRowId = threadIdx.x / ColsPerBlk; - IdxT colId = thisColId + ((IdxT)blockIdx.y * ColsPerBlk); - IdxT rowId = thisRowId + ((IdxT)blockIdx.x * RowsPerBlkPerIter); - DataT thread_data = DataT(0); - const IdxT stride = RowsPerBlkPerIter * gridDim.x; - __shared__ DataT smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = DataT(0); - for (IdxT i = rowId; i < N; i += stride) { - thread_data += (colId < D) ? data[i * D + colId] * (DataT)counts[i] : DataT(0); - } - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL dispersionKernel(DataT* result, - const DataT* clusters, - const IdxT* clusterSizes, - const DataT* mu, - IdxT dim, - IdxT nClusters) -{ - IdxT tid = threadIdx.x + blockIdx.x * blockDim.x; - IdxT len = dim * nClusters; - IdxT stride = blockDim.x * gridDim.x; - DataT sum = DataT(0); - for (; tid < len; tid += stride) { - IdxT col = tid % dim; - IdxT row = tid / dim; - DataT diff = clusters[tid] - mu[col]; - sum += diff * diff * DataT(clusterSizes[row]); - } - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - __syncthreads(); - auto acc = BlockReduce(temp_storage).Sum(sum); - __syncthreads(); - if (threadIdx.x == 0) raft::myAtomicAdd(result, acc); -} - -/** - * @brief Compute cluster dispersion metric. This is very useful for - * automatically finding the 'k' (in kmeans) that improves this metric. - * @tparam DataT data type - * @tparam IdxT index type - * @tparam TPB threads block for kernels launched - * @param centroids the cluster centroids. This is assumed to be row-major - * and of dimension (nClusters x dim) - * @param clusterSizes number of points in the dataset which belong to each - * cluster. This is of length nClusters - * @param globalCentroid compute the global weighted centroid of all cluster - * centroids. This is of length dim. Pass a nullptr if this is not needed - * @param nClusters number of clusters - * @param nPoints number of points in the dataset - * @param dim dataset dimensionality - * @param stream cuda stream - * @return the cluster dispersion value - */ -template -DataT dispersion(const DataT* centroids, - const IdxT* clusterSizes, - DataT* globalCentroid, - IdxT nClusters, - IdxT nPoints, - IdxT dim, - cudaStream_t stream) -{ - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(nPoints, (IdxT)RowsPerBlk), raft::ceildiv(dim, (IdxT)ColsPerBlk)); - rmm::device_uvector mean(0, stream); - rmm::device_uvector result(1, stream); - DataT* mu = globalCentroid; - if (globalCentroid == nullptr) { - mean.resize(dim, stream); - mu = mean.data(); - } - RAFT_CUDA_TRY(cudaMemsetAsync(mu, 0, sizeof(DataT) * dim, stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(result.data(), 0, sizeof(DataT), stream)); - weightedMeanKernel - <<>>(mu, centroids, clusterSizes, dim, nClusters); - RAFT_CUDA_TRY(cudaGetLastError()); - DataT ratio = DataT(1) / DataT(nPoints); - raft::linalg::scalarMultiply(mu, mu, ratio, dim, stream); - // finally, compute the dispersion - constexpr int ItemsPerThread = 4; - int nblks = raft::ceildiv(dim * nClusters, TPB * ItemsPerThread); - dispersionKernel - <<>>(result.data(), centroids, clusterSizes, mu, dim, nClusters); - RAFT_CUDA_TRY(cudaGetLastError()); - DataT h_result; - raft::update_host(&h_result, result.data(), 1, stream); - raft::interruptible::synchronize(stream); - return sqrt(h_result); -} - -} // end namespace detail -} // end namespace stats -} // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/entropy.cuh b/cpp/include/cuvs/stats/detail/entropy.cuh deleted file mode 100644 index cae676171..000000000 --- a/cpp/include/cuvs/stats/detail/entropy.cuh +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file entropy.cuh - * @brief Calculates the entropy for a labeling in nats.(ie, uses natural logarithm for the - * calculations) - */ - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cuvs { -namespace stats { -namespace detail { - -/** - * @brief Lambda to calculate the entropy of a sample given its probability value - * - * @param p: the input to the functional mapping - * @param q: dummy param - */ -struct entropyOp { - HDI double operator()(double p, double q) - { - if (p) - return -1 * (p) * (log(p)); - else - return 0.0; - } -}; - -/** - * @brief function to calculate the bincounts of number of samples in every label - * - * @tparam LabelT: type of the labels - * @param labels: the pointer to the array containing labels for every data sample - * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster - * @param nRows: number of data samples - * @param lowerLabelRange - * @param upperLabelRange - * @param workspace: device buffer containing workspace memory - * @param stream: the cuda stream where to launch this kernel - */ -template -void countLabels(const LabelT* labels, - double* binCountArray, - int nRows, - LabelT lowerLabelRange, - LabelT upperLabelRange, - rmm::device_uvector& workspace, - cudaStream_t stream) -{ - int num_levels = upperLabelRange - lowerLabelRange + 2; - LabelT lower_level = lowerLabelRange; - LabelT upper_level = upperLabelRange + 1; - size_t temp_storage_bytes = 0; - - RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(nullptr, - temp_storage_bytes, - labels, - binCountArray, - num_levels, - lower_level, - upper_level, - nRows, - stream)); - - workspace.resize(temp_storage_bytes, stream); - - RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace.data(), - temp_storage_bytes, - labels, - binCountArray, - num_levels, - lower_level, - upper_level, - nRows, - stream)); -} - -/** - * @brief Function to calculate entropy - * more info on entropy - * - * @param clusterArray: the array of classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - * @return the entropy score - */ -template -double entropy(const T* clusterArray, - const int size, - const T lowerLabelRange, - const T upperLabelRange, - cudaStream_t stream) -{ - if (!size) return 1.0; - - T numUniqueClasses = upperLabelRange - lowerLabelRange + 1; - - // declaring, allocating and initializing memory for bincount array and entropy values - rmm::device_uvector prob(numUniqueClasses, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(prob.data(), 0, numUniqueClasses * sizeof(double), stream)); - rmm::device_scalar d_entropy(stream); - RAFT_CUDA_TRY(cudaMemsetAsync(d_entropy.data(), 0, sizeof(double), stream)); - - // workspace allocation - rmm::device_uvector workspace(1, stream); - - // calculating the bincounts and populating the prob array - countLabels(clusterArray, prob.data(), size, lowerLabelRange, upperLabelRange, workspace, stream); - - // scalar dividing by size - raft::linalg::divideScalar( - prob.data(), prob.data(), (double)size, numUniqueClasses, stream); - - // calculating the aggregate entropy - raft::linalg::mapThenSumReduce( - d_entropy.data(), numUniqueClasses, entropyOp(), stream, prob.data(), prob.data()); - - // updating in the host memory - double h_entropy; - raft::update_host(&h_entropy, d_entropy.data(), 1, stream); - - raft::interruptible::synchronize(stream); - - return h_entropy; -} - -}; // end namespace detail -}; // end namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/histogram.cuh b/cpp/include/cuvs/stats/detail/histogram.cuh deleted file mode 100644 index c68fc045f..000000000 --- a/cpp/include/cuvs/stats/detail/histogram.cuh +++ /dev/null @@ -1,496 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -// This file is a shameless amalgamation of independent works done by -// Lars Nyland and Andy Adinets - -///@todo: add cub's histogram as another option - -namespace cuvs { -namespace stats { -namespace detail { - -/** Default mapper which just returns the value of the data itself */ -template -struct IdentityBinner { - DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); } -}; - -static const int ThreadsPerBlock = 256; - -template -dim3 computeGridDim(IdxT nrows, IdxT ncols, const void* kernel) -{ - int occupancy; - RAFT_CUDA_TRY( - cudaOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, ThreadsPerBlock, 0)); - const auto maxBlks = occupancy * raft::getMultiProcessorCount(); - int nblksx = raft::ceildiv(VecLen ? nrows / VecLen : nrows, ThreadsPerBlock); - // for cases when there aren't a lot of blocks for computing one histogram - nblksx = std::min(nblksx, maxBlks); - return dim3(nblksx, ncols); -} - -template -DI void histCoreOp(const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner, CoreOp op, IdxT col) -{ - IdxT offset = col * nrows; - auto bdim = IdxT(blockDim.x); - IdxT tid = threadIdx.x + bdim * blockIdx.x; - tid *= VecLen; - IdxT stride = bdim * gridDim.x * VecLen; - int nCeil = raft::alignTo(nrows, stride); - typedef raft::TxN_t VecType; - VecType a; - for (auto i = tid; i < nCeil; i += stride) { - if (i < nrows) { a.load(data, offset + i); } -#pragma unroll - for (int j = 0; j < VecLen; ++j) { - int binId = binner(a.val.data[j], i + j, col); - op(binId, i + j, col); - } - } -} - -template -RAFT_KERNEL gmemHistKernel(int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner) -{ - auto op = [=] __device__(int binId, IdxT row, IdxT col) { - if (row >= nrows) return; - auto binOffset = col * nbins; -#if __CUDA_ARCH__ < 700 - raft::myAtomicAdd(bins + binOffset + binId, 1); -#else - auto amask = __activemask(); - auto mask = __match_any_sync(amask, binId); - auto leader = __ffs(mask) - 1; - if (raft::laneId() == leader) { raft::myAtomicAdd(bins + binOffset + binId, __popc(mask)); } -#endif // __CUDA_ARCH__ - }; - histCoreOp(data, nrows, nbins, binner, op, blockIdx.y); -} - -template -void gmemHist(int* bins, - IdxT nbins, - const DataT* data, - IdxT nrows, - IdxT ncols, - BinnerOp binner, - cudaStream_t stream) -{ - auto blks = computeGridDim( - nrows, ncols, (const void*)gmemHistKernel); - gmemHistKernel - <<>>(bins, data, nrows, nbins, binner); -} - -template -RAFT_KERNEL smemHistKernel(int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner) -{ - extern __shared__ unsigned sbins[]; - for (auto i = threadIdx.x; i < nbins; i += blockDim.x) { - sbins[i] = 0; - } - __syncthreads(); - auto op = [=] __device__(int binId, IdxT row, IdxT col) { - if (row >= nrows) return; -#if __CUDA_ARCH__ < 700 - raft::myAtomicAdd(sbins + binId, 1); -#else - if (UseMatchAny) { - auto amask = __activemask(); - auto mask = __match_any_sync(amask, binId); - auto leader = __ffs(mask) - 1; - if (raft::laneId() == leader) { - raft::myAtomicAdd(sbins + binId, __popc(mask)); - } - } else { - raft::myAtomicAdd(sbins + binId, 1); - } -#endif // __CUDA_ARCH__ - }; - IdxT col = blockIdx.y; - histCoreOp(data, nrows, nbins, binner, op, col); - __syncthreads(); - auto binOffset = col * nbins; - for (auto i = threadIdx.x; i < nbins; i += blockDim.x) { - auto val = sbins[i]; - if (val > 0) { raft::myAtomicAdd((unsigned int*)bins + binOffset + i, val); } - } -} - -template -void smemHist(int* bins, - IdxT nbins, - const DataT* data, - IdxT nrows, - IdxT ncols, - BinnerOp binner, - cudaStream_t stream) -{ - auto blks = computeGridDim( - nrows, ncols, (const void*)smemHistKernel); - size_t smemSize = nbins * sizeof(unsigned); - smemHistKernel - <<>>(bins, data, nrows, nbins, binner); -} - -template -struct BitsInfo { - static unsigned const BIN_BITS = _BIN_BITS; - static unsigned const WORD_BITS = sizeof(unsigned) * 8; - static unsigned const WORD_BINS = WORD_BITS / BIN_BITS; - static unsigned const BIN_MASK = (1 << BIN_BITS) - 1; -}; - -template -DI void incrementBin(unsigned* sbins, int* bins, int nbins, int binId) -{ - typedef BitsInfo Bits; - auto iword = binId / Bits::WORD_BINS; - auto ibin = binId % Bits::WORD_BINS; - auto sh = ibin * Bits::BIN_BITS; - auto old_word = atomicAdd(sbins + iword, unsigned(1 << sh)); - auto new_word = old_word + unsigned(1 << sh); - if ((new_word >> sh & Bits::BIN_MASK) != 0) return; - // overflow - raft::myAtomicAdd((unsigned int*)bins + binId, Bits::BIN_MASK + 1); - for (int dbin = 1; ibin + dbin < Bits::WORD_BINS && binId + dbin < nbins; ++dbin) { - auto sh1 = (ibin + dbin) * Bits::BIN_BITS; - if ((new_word >> sh1 & Bits::BIN_MASK) == 0) { - // overflow - raft::myAtomicAdd((unsigned int*)bins + binId + dbin, Bits::BIN_MASK); - } else { - // correction - raft::myAtomicAdd(bins + binId + dbin, -1); - break; - } - } -} - -template <> -DI void incrementBin<1>(unsigned* sbins, int* bins, int nbins, int binId) -{ - typedef BitsInfo<1> Bits; - auto iword = binId / Bits::WORD_BITS; - auto sh = binId % Bits::WORD_BITS; - auto old_word = atomicXor(sbins + iword, unsigned(1 << sh)); - if ((old_word >> sh & 1) != 0) raft::myAtomicAdd(bins + binId, 2); -} - -template -RAFT_KERNEL smemBitsHistKernel( - int* bins, const DataT* data, IdxT nrows, IdxT nbins, BinnerOp binner) -{ - extern __shared__ unsigned sbins[]; - typedef BitsInfo Bits; - auto nwords = raft::ceildiv(nbins, Bits::WORD_BINS); - for (auto j = threadIdx.x; j < nwords; j += blockDim.x) { - sbins[j] = 0; - } - __syncthreads(); - IdxT col = blockIdx.y; - IdxT binOffset = col * nbins; - auto op = [=] __device__(int binId, IdxT row, IdxT col) { - if (row >= nrows) return; - incrementBin(sbins, bins + binOffset, (int)nbins, binId); - }; - histCoreOp(data, nrows, nbins, binner, op, col); - __syncthreads(); - for (auto j = threadIdx.x; j < (int)nbins; j += blockDim.x) { - auto shift = j % Bits::WORD_BINS * Bits::BIN_BITS; - int count = sbins[j / Bits::WORD_BINS] >> shift & Bits::BIN_MASK; - if (count > 0) raft::myAtomicAdd(bins + binOffset + j, count); - } -} - -template -void smemBitsHist(int* bins, - IdxT nbins, - const DataT* data, - IdxT nrows, - IdxT ncols, - BinnerOp binner, - cudaStream_t stream) -{ - typedef BitsInfo Bits; - auto blks = computeGridDim( - nrows, ncols, (const void*)smemBitsHistKernel); - size_t smemSize = raft::ceildiv(nbins, Bits::WORD_BITS / Bits::BIN_BITS) * sizeof(int); - smemBitsHistKernel - <<>>(bins, data, nrows, nbins, binner); -} - -#define INVALID_KEY -1 - -DI void clearHashTable(int2* ht, int hashSize) -{ - for (auto i = threadIdx.x; i < hashSize; i += blockDim.x) { - ht[i] = {INVALID_KEY, 0}; - } -} - -DI int findEntry(int2* ht, int hashSize, int binId, int threshold) -{ - int idx = binId % hashSize; - int t; - int count = 0; - while ((t = atomicCAS(&(ht[idx].x), INVALID_KEY, binId)) != INVALID_KEY && t != binId) { - ++count; - if (count >= threshold) { - idx = INVALID_KEY; - break; - } - ++idx; - if (idx >= hashSize) { idx = 0; } - } - return idx; -} - -DI void flushHashTable(int2* ht, int hashSize, int* bins, int nbins, int col) -{ - int binOffset = col * nbins; - for (auto i = threadIdx.x; i < hashSize; i += blockDim.x) { - if (ht[i].x != INVALID_KEY && ht[i].y > 0) { - raft::myAtomicAdd(bins + binOffset + ht[i].x, ht[i].y); - } - ht[i] = {INVALID_KEY, 0}; - } -} - -#undef INVALID_KEY - -///@todo: honor VecLen template param -template -RAFT_KERNEL smemHashHistKernel(int* bins, - const DataT* data, - IdxT nrows, - IdxT nbins, - BinnerOp binner, - int hashSize, - int threshold) -{ - extern __shared__ int2 ht[]; - int* needFlush = (int*)&(ht[hashSize]); - if (threadIdx.x == 0) { needFlush[0] = 0; } - clearHashTable(ht, hashSize); - __syncthreads(); - auto op = [=] __device__(int binId, IdxT row, IdxT col) { - bool iNeedFlush = false; - if (row < nrows) { - int hidx = findEntry(ht, hashSize, binId, threshold); - if (hidx >= 0) { - raft::myAtomicAdd(&(ht[hidx].y), 1); - } else { - needFlush[0] = 1; - iNeedFlush = true; - } - } - __syncthreads(); - if (needFlush[0]) { - flushHashTable(ht, hashSize, bins, nbins, col); - __syncthreads(); - if (threadIdx.x == 0) { needFlush[0] = 0; } - __syncthreads(); - } - if (iNeedFlush) { - int hidx = findEntry(ht, hashSize, binId, threshold); - // all threads are bound to get one valid entry as all threads in this - // block will make forward progress due to the __syncthreads call in the - // subsequent iteration - raft::myAtomicAdd(&(ht[hidx].y), 1); - } - }; - IdxT col = blockIdx.y; - histCoreOp(data, nrows, nbins, binner, op, col); - __syncthreads(); - flushHashTable(ht, hashSize, bins, nbins, col); -} - -inline int computeHashTableSize() -{ - // we shouldn't have this much of shared memory available anytime soon! - static const unsigned maxBinsEverPossible = 256 * 1024; - static raft::common::Seive primes(maxBinsEverPossible); - unsigned smem = raft::getSharedMemPerBlock(); - // divide-by-2 because hash table entry stores 2 elements: idx and count - auto binsPossible = smem / sizeof(unsigned) / 2; - for (; binsPossible > 1; --binsPossible) { - if (primes.isPrime(binsPossible)) return (int)binsPossible; - } - return 1; // should not happen! -} - -template -void smemHashHist(int* bins, - IdxT nbins, - const DataT* data, - IdxT nrows, - IdxT ncols, - BinnerOp binner, - cudaStream_t stream) -{ - static const int flushThreshold = 10; - auto blks = computeGridDim( - nrows, ncols, (const void*)smemHashHistKernel); - int hashSize = computeHashTableSize(); - size_t smemSize = hashSize * sizeof(int2) + sizeof(int); - smemHashHistKernel<<>>( - bins, data, nrows, nbins, binner, hashSize, flushThreshold); -} - -template -void histogramVecLen(HistType type, - int* bins, - IdxT nbins, - const DataT* data, - IdxT nrows, - IdxT ncols, - cudaStream_t stream, - BinnerOp binner) -{ - RAFT_CUDA_TRY(cudaMemsetAsync(bins, 0, ncols * nbins * sizeof(int), stream)); - switch (type) { - case HistTypeGmem: - gmemHist(bins, nbins, data, nrows, ncols, binner, stream); - break; - case HistTypeSmem: - smemHist( - bins, nbins, data, nrows, ncols, binner, stream); - break; - case HistTypeSmemMatchAny: - smemHist( - bins, nbins, data, nrows, ncols, binner, stream); - break; - case HistTypeSmemBits16: - smemBitsHist( - bins, nbins, data, nrows, ncols, binner, stream); - break; - case HistTypeSmemBits8: - smemBitsHist( - bins, nbins, data, nrows, ncols, binner, stream); - break; - case HistTypeSmemBits4: - smemBitsHist( - bins, nbins, data, nrows, ncols, binner, stream); - break; - case HistTypeSmemBits2: - smemBitsHist( - bins, nbins, data, nrows, ncols, binner, stream); - break; - case HistTypeSmemBits1: - smemBitsHist( - bins, nbins, data, nrows, ncols, binner, stream); - break; - case HistTypeSmemHash: - smemHashHist(bins, nbins, data, nrows, ncols, binner, stream); - break; - default: ASSERT(false, "histogram: Invalid type passed '%d'!", type); - }; - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void histogramImpl(HistType type, - int* bins, - IdxT nbins, - const DataT* data, - IdxT nrows, - IdxT ncols, - cudaStream_t stream, - BinnerOp binner) -{ - size_t bytes = nrows * sizeof(DataT); - if (nrows <= 0) return; - if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { - histogramVecLen( - type, bins, nbins, data, nrows, ncols, stream, binner); - } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { - histogramVecLen( - type, bins, nbins, data, nrows, ncols, stream, binner); - } else if (4 % sizeof(DataT) == 0 && bytes % 4 == 0) { - histogramVecLen( - type, bins, nbins, data, nrows, ncols, stream, binner); - } else if (2 % sizeof(DataT) == 0 && bytes % 2 == 0) { - histogramVecLen( - type, bins, nbins, data, nrows, ncols, stream, binner); - } else { - histogramVecLen( - type, bins, nbins, data, nrows, ncols, stream, binner); - } -} - -template -HistType selectBestHistAlgo(IdxT nbins) -{ - size_t smem = raft::getSharedMemPerBlock(); - size_t requiredSize = nbins * sizeof(unsigned); - if (requiredSize <= smem) { return HistTypeSmem; } - for (int bits = 16; bits >= 1; bits >>= 1) { - auto nBytesForBins = raft::ceildiv(bits * nbins, 8); - requiredSize = raft::alignTo(nBytesForBins, sizeof(unsigned)); - if (requiredSize <= smem) { return static_cast(bits); } - } - return HistTypeGmem; -} - -/** - * @brief Perform histogram on the input data. It chooses the right load size - * based on the input data vector length. It also supports large-bin cases - * using a specialized smem-based hashing technique. - * @tparam DataT input data type - * @tparam IdxT data type used to compute indices - * @tparam BinnerOp takes the input data and computes its bin index - * @param type histogram implementation type to choose - * @param bins the output bins (length = ncols * nbins) - * @param nbins number of bins - * @param data input data (length = ncols * nrows) - * @param nrows data array length in each column (or batch) - * @param ncols number of columns (or batch size) - * @param stream cuda stream - * @param binner the operation that computes the bin index of the input data - * - * @note signature of BinnerOp is `int func(DataT, IdxT);` - */ -template > -void histogram(HistType type, - int* bins, - IdxT nbins, - const DataT* data, - IdxT nrows, - IdxT ncols, - cudaStream_t stream, - BinnerOp binner = IdentityBinner()) -{ - HistType computedType = type; - if (type == HistTypeAuto) { computedType = selectBestHistAlgo(nbins); } - histogramImpl( - computedType, bins, nbins, data, nrows, ncols, stream, binner); -} - -}; // end namespace detail -}; // end namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/homogeneity_score.cuh b/cpp/include/cuvs/stats/detail/homogeneity_score.cuh deleted file mode 100644 index f63873df6..000000000 --- a/cpp/include/cuvs/stats/detail/homogeneity_score.cuh +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file homogeneity_score.cuh - * - * @brief A clustering result satisfies homogeneity if all of its clusters - * contain only data points which are members of a single class. - */ - -#pragma once - -#include -#include - -namespace cuvs { -namespace stats { -namespace detail { -/** - * @brief Function to calculate the homogeneity score between two clusters - * more info on mutual - * information - * @param truthClusterArray: the array of truth classes of type T - * @param predClusterArray: the array of predicted classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - */ -template -double homogeneity_score(const T* truthClusterArray, - const T* predClusterArray, - int size, - T lowerLabelRange, - T upperLabelRange, - cudaStream_t stream) -{ - if (size == 0) return 1.0; - - double computedMI, computedEntropy; - - computedMI = raft::stats::mutual_info_score( - truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream); - computedEntropy = - raft::stats::entropy(truthClusterArray, size, lowerLabelRange, upperLabelRange, stream); - - double homogeneity; - - if (computedEntropy) { - homogeneity = computedMI / computedEntropy; - } else - homogeneity = 1.0; - - return homogeneity; -} - -}; // end namespace detail -}; // end namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/kl_divergence.cuh b/cpp/include/cuvs/stats/detail/kl_divergence.cuh deleted file mode 100644 index 83f1b64b0..000000000 --- a/cpp/include/cuvs/stats/detail/kl_divergence.cuh +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file kl_divergence.cuh - * @brief The KL divergence tells us how well the probability distribution Q AKA candidatePDF - * approximates the probability distribution P AKA modelPDF. - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace cuvs { -namespace stats { -namespace detail { - -/** - * @brief the KL Diverence mapping function - * - * @tparam Type: Data type of the input - * @param modelPDF: the model probability density function of type DataT - * @param candidatePDF: the candidate probability density function of type DataT - */ -template -struct KLDOp { - HDI Type operator()(Type modelPDF, Type candidatePDF) - { - if (modelPDF == 0.0) - return 0; - - else - return modelPDF * (log(modelPDF) - log(candidatePDF)); - } -}; - -/** - * @brief Function to calculate KL Divergence - * more info on KL - * Divergence - * - * @tparam DataT: Data type of the input array - * @param modelPDF: the model array of probability density functions of type DataT - * @param candidatePDF: the candidate array of probability density functions of type DataT - * @param size: the size of the data points of type int - * @param stream: the cudaStream object - */ -template -DataT kl_divergence(const DataT* modelPDF, const DataT* candidatePDF, int size, cudaStream_t stream) -{ - rmm::device_scalar d_KLDVal(stream); - RAFT_CUDA_TRY(cudaMemsetAsync(d_KLDVal.data(), 0, sizeof(DataT), stream)); - - raft::linalg::mapThenSumReduce, size_t, 256, const DataT*>( - d_KLDVal.data(), (size_t)size, KLDOp(), stream, modelPDF, candidatePDF); - - DataT h_KLDVal; - - raft::update_host(&h_KLDVal, d_KLDVal.data(), 1, stream); - - raft::interruptible::synchronize(stream); - - return h_KLDVal; -} - -}; // end namespace detail -}; // end namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/mean.cuh b/cpp/include/cuvs/stats/detail/mean.cuh deleted file mode 100644 index 092fa2de0..000000000 --- a/cpp/include/cuvs/stats/detail/mean.cuh +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -namespace cuvs { -namespace stats { -namespace detail { - -///@todo: ColsPerBlk has been tested only for 32! -template -RAFT_KERNEL meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) - thread_data += (colId < D) ? data[i * D + colId] : Type(0); - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - thread_data += data[idx]; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; } -} - -template -void mean( - Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) -{ - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); - meanKernelRowMajor<<>>(mu, data, D, N); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); - raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); - } else { - meanKernelColMajor<<>>(mu, data, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -} // namespace detail -} // namespace stats -} // namespace cuvs \ No newline at end of file diff --git a/cpp/include/cuvs/stats/detail/mean_center.cuh b/cpp/include/cuvs/stats/detail/mean_center.cuh deleted file mode 100644 index 2f281addb..000000000 --- a/cpp/include/cuvs/stats/detail/mean_center.cuh +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace cuvs { -namespace stats { -namespace detail { - -/** - * @brief Center the input matrix wrt its mean - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads per block of the cuda kernel launched - * @param out the output mean-centered matrix - * @param data input matrix - * @param mu the mean vector - * @param D number of columns of data - * @param N number of rows of data - * @param rowMajor whether input is row or col major - * @param bcastAlongRows whether to broadcast vector along rows or columns - * @param stream cuda stream where to launch work - */ -template -void meanCenter(Type* out, - const Type* data, - const Type* mu, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - raft::linalg::matrixVectorOp( - out, data, mu, D, N, rowMajor, bcastAlongRows, raft::sub_op{}, stream); -} - -/** - * @brief Add the input matrix wrt its mean - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads per block of the cuda kernel launched - * @param out the output mean-added matrix - * @param data input matrix - * @param mu the mean vector - * @param D number of columns of data - * @param N number of rows of data - * @param rowMajor whether input is row or col major - * @param bcastAlongRows whether to broadcast vector along rows or columns - * @param stream cuda stream where to launch work - */ -template -void meanAdd(Type* out, - const Type* data, - const Type* mu, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - raft::linalg::matrixVectorOp( - out, data, mu, D, N, rowMajor, bcastAlongRows, raft::add_op{}, stream); -} - -}; // end namespace detail -}; // end namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/meanvar.cuh b/cpp/include/cuvs/stats/detail/meanvar.cuh deleted file mode 100644 index c286d5ed9..000000000 --- a/cpp/include/cuvs/stats/detail/meanvar.cuh +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace raft::stats::detail { - -template -class mean_var { - private: - T w; - T m; - T s; - - public: - /** Monoidal neutral. */ - HDI mean_var() : w(0.0), m(0.0), s(0.0) {} - /** Lift a single value. */ - HDI explicit mean_var(T x) : w(1.0), m(x), s(0.0) {} - - /** - * Monoidal binary op: combine means and vars of two sets. - * (associative and commutative) - */ - friend HDI auto operator+(mean_var a, mean_var const& b) -> mean_var - { - a += b; - return a; - } - - /** - * Combine means and vars of two sets. - * - * Similar to: - * https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm - */ - HDI auto operator+=(mean_var const& b) & -> mean_var& - { - mean_var& a(*this); - T cw = a.w + b.w; - if (cw == 0) return a; - T aw_frac = a.w / cw; - T bw_frac = b.w / cw; - a.w = cw; - T d = a.m - b.m; - a.s += b.s + cw * (d * aw_frac) * (d * bw_frac); - a.m = a.m * aw_frac + b.m * bw_frac; - return a; - } - - /** Get the computed mean. */ - HDI auto mean() const -> T { return m; } - - /** - * @brief Get the computed variance. - * - * @param [in] sample whether to produce sample variance (divide by `N - 1` instead of `N`). - * @return variance - */ - HDI auto var(bool sample) const -> T { return s / max(T(1.0), sample ? w - T(1.0) : w); } - - HDI void load(volatile mean_var* address) - { - this->m = address->m; - this->s = address->s; - this->w = address->w; - } - - HDI void store(volatile mean_var* address) - { - address->m = this->m; - address->s = this->s; - address->w = this->w; - } -}; - -/* -NB: current implementation here is not optimal, especially the rowmajor version; - leaving this for further work (perhaps, as a more generic "linewiseReduce"). - Vectorized loads/stores could speed things up a lot. - */ -/** - * meanvar kernel - row-major version - * - * Assumptions: - * - * 1. blockDim.x == raft::WarpSize - * 2. Dimension X goes along columns (D) - * 3. Dimension Y goes along rows (N) - * - * - * @tparam T element type - * @tparam I indexing type - * @tparam BlockSize must be equal to blockDim.x * blockDim.y * blockDim.z - * @param data input data - * @param mvs meanvars -- output - * @param locks guards for updating meanvars - * @param len total length of input data (N * D) - * @param D number of columns in the input data. - */ -template -RAFT_KERNEL __launch_bounds__(BlockSize) - meanvar_kernel_rowmajor(const T* data, volatile mean_var* mvs, int* locks, I len, I D) -{ - // read the data - const I col = threadIdx.x + blockDim.x * blockIdx.x; - mean_var thread_data; - if (col < D) { - const I step = D * blockDim.y * gridDim.y; - for (I i = col + D * (threadIdx.y + blockDim.y * blockIdx.y); i < len; i += step) { - thread_data += mean_var(data[i]); - } - } - - // aggregate within block - if (blockDim.y > 1) { - __shared__ uint8_t shm_bytes[BlockSize * sizeof(mean_var)]; - auto shm = (mean_var*)shm_bytes; - int tid = threadIdx.x + threadIdx.y * blockDim.x; - shm[tid] = thread_data; - for (int bs = BlockSize >> 1; bs >= blockDim.x; bs = bs >> 1) { - __syncthreads(); - if (tid < bs) { shm[tid] += shm[tid + bs]; } - } - thread_data = shm[tid]; - } - - // aggregate across blocks - if (threadIdx.y == 0) { - int* lock = locks + blockIdx.x; - if (threadIdx.x == 0 && col < D) { - while (atomicCAS(lock, 0, 1) == 1) { - __threadfence(); - } - } - __syncthreads(); - if (col < D) { - __threadfence(); - mean_var global_data; - global_data.load(mvs + col); - global_data += thread_data; - global_data.store(mvs + col); - __threadfence(); - } - __syncthreads(); - if (threadIdx.x == 0 && col < D) { __stwt(lock, 0); } - } -} - -template -RAFT_KERNEL __launch_bounds__(BlockSize) - meanvar_kernel_colmajor(T* mean, T* var, const T* data, I D, I N, bool sample) -{ - using BlockReduce = cub::BlockReduce, BlockSize>; - __shared__ typename BlockReduce::TempStorage shm; - - const T* block_data = data + N * blockIdx.x; - mean_var thread_data; - for (I i = threadIdx.x; i < N; i += BlockSize) { - thread_data += mean_var(block_data[i]); - } - mean_var acc = BlockReduce(shm).Sum(thread_data); - if (threadIdx.x == 0) { - mean[blockIdx.x] = acc.mean(); - var[blockIdx.x] = acc.var(sample); - } -} - -template -RAFT_KERNEL meanvar_kernel_fill(T* mean, T* var, const mean_var* aggr, I D, bool sample) -{ - I i = threadIdx.x + blockDim.x * blockIdx.x; - if (i >= D) return; - auto x = aggr[i]; - mean[i] = x.mean(); - var[i] = x.var(sample); -} - -template -void meanvar( - T* mean, T* var, const T* data, I D, I N, bool sample, bool rowMajor, cudaStream_t stream) -{ - if (rowMajor) { - static_assert(BlockSize >= raft::WarpSize, - "Block size must be not smaller than the warp size."); - const dim3 bs(WarpSize, BlockSize / raft::WarpSize, 1); - dim3 gs(raft::ceildiv(D, bs.x), raft::ceildiv(N, bs.y), 1); - - // Don't create more blocks than necessary to occupy the GPU - int occupancy; - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &occupancy, meanvar_kernel_rowmajor, BlockSize, 0)); - gs.y = - std::min(gs.y, raft::ceildiv(occupancy * getMultiProcessorCount(), gs.x)); - - // Global memory: one mean_var for each column - // one lock per all blocks working on the same set of columns - rmm::device_buffer buf(sizeof(mean_var) * D + sizeof(int) * gs.x, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(buf.data(), 0, buf.size(), stream)); - mean_var* mvs = static_cast*>(buf.data()); - int* locks = static_cast(static_cast(mvs + D)); - - const uint64_t len = uint64_t(D) * uint64_t(N); - ASSERT(len <= uint64_t(std::numeric_limits::max()), "N * D does not fit the indexing type"); - meanvar_kernel_rowmajor<<>>(data, mvs, locks, len, D); - meanvar_kernel_fill - <<(D, BlockSize), BlockSize, 0, stream>>>(mean, var, mvs, D, sample); - } else { - meanvar_kernel_colmajor - <<>>(mean, var, data, D, N, sample); - } - RAFT_CHECK_CUDA(stream); -} - -}; // namespace raft::stats::detail diff --git a/cpp/include/cuvs/stats/detail/minmax.cuh b/cpp/include/cuvs/stats/detail/minmax.cuh deleted file mode 100644 index 6867984b6..000000000 --- a/cpp/include/cuvs/stats/detail/minmax.cuh +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -namespace cuvs { -namespace stats { -namespace detail { - -// TODO: replace with `std::bitcast` once we adopt C++20 or libcu++ adds it -template -constexpr To bit_cast(const From& from) noexcept -{ - To to{}; - static_assert(sizeof(To) == sizeof(From)); - memcpy(&to, &from, sizeof(To)); - return to; -} - -template -struct encode_traits {}; - -template <> -struct encode_traits { - using E = int; -}; - -template <> -struct encode_traits { - using E = long long; -}; - -HDI int encode(float val) -{ - int i = detail::bit_cast(val); - return i >= 0 ? i : (1 << 31) | ~i; -} - -HDI long long encode(double val) -{ - std::int64_t i = detail::bit_cast(val); - return i >= 0 ? i : (1ULL << 63) | ~i; -} - -HDI float decode(int val) -{ - if (val < 0) val = (1 << 31) | ~val; - return detail::bit_cast(val); -} - -HDI double decode(long long val) -{ - if (val < 0) val = (1ULL << 63) | ~val; - return detail::bit_cast(val); -} - -template -DI T atomicMaxBits(T* address, T val) -{ - E old = atomicMax((E*)address, encode(val)); - return decode(old); -} - -template -DI T atomicMinBits(T* address, T val) -{ - E old = atomicMin((E*)address, encode(val)); - return decode(old); -} - -template -RAFT_KERNEL decodeKernel(T* globalmin, T* globalmax, int ncols) -{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid < ncols) { - globalmin[tid] = decode(*(E*)&globalmin[tid]); - globalmax[tid] = decode(*(E*)&globalmax[tid]); - } -} - -///@todo: implement a proper "fill" kernel -template -RAFT_KERNEL minmaxInitKernel(int ncols, T* globalmin, T* globalmax, T init_val) -{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid >= ncols) return; - *(E*)&globalmin[tid] = encode(init_val); - *(E*)&globalmax[tid] = encode(-init_val); -} - -template -RAFT_KERNEL minmaxKernel(const T* data, - const unsigned int* rowids, - const unsigned int* colids, - int nrows, - int ncols, - int row_stride, - T* g_min, - T* g_max, - T* sampledcols, - T init_min_val, - int batch_ncols, - int num_batches) -{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ char shmem[]; - T* s_min = (T*)shmem; - T* s_max = (T*)(shmem + sizeof(T) * batch_ncols); - - int last_batch_ncols = ncols % batch_ncols; - if (last_batch_ncols == 0) { last_batch_ncols = batch_ncols; } - int orig_batch_ncols = batch_ncols; - - for (int batch_id = 0; batch_id < num_batches; batch_id++) { - if (batch_id == num_batches - 1) { batch_ncols = last_batch_ncols; } - - for (int i = threadIdx.x; i < batch_ncols; i += blockDim.x) { - *(E*)&s_min[i] = encode(init_min_val); - *(E*)&s_max[i] = encode(-init_min_val); - } - __syncthreads(); - - for (int i = tid; i < nrows * batch_ncols; i += blockDim.x * gridDim.x) { - int col = (batch_id * orig_batch_ncols) + (i / nrows); - int row = i % nrows; - if (colids != nullptr) { col = colids[col]; } - if (rowids != nullptr) { row = rowids[row]; } - int index = row + col * row_stride; - T coldata = data[index]; - if (!isnan(coldata)) { - // Min max values are saved in shared memory and global memory as per the shuffled colids. - atomicMinBits(&s_min[(int)(i / nrows)], coldata); - atomicMaxBits(&s_max[(int)(i / nrows)], coldata); - } - if (sampledcols != nullptr) { sampledcols[batch_id * orig_batch_ncols + i] = coldata; } - } - __syncthreads(); - - // finally, perform global mem atomics - for (int j = threadIdx.x; j < batch_ncols; j += blockDim.x) { - atomicMinBits(&g_min[batch_id * orig_batch_ncols + j], decode(*(E*)&s_min[j])); - atomicMaxBits(&g_max[batch_id * orig_batch_ncols + j], decode(*(E*)&s_max[j])); - } - __syncthreads(); - } -} - -/** - * @brief Computes min/max across every column of the input matrix, as well as - * optionally allow to subsample based on the given row/col ID mapping vectors - * - * @tparam T the data type - * @tparam TPB number of threads per block - * @param data input data - * @param rowids actual row ID mappings. It is of length nrows. If you want to - * skip this index lookup entirely, pass nullptr - * @param colids actual col ID mappings. It is of length ncols. If you want to - * skip this index lookup entirely, pass nullptr - * @param nrows number of rows of data to be worked upon. The actual rows of the - * input "data" can be bigger than this! - * @param ncols number of cols of data to be worked upon. The actual cols of the - * input "data" can be bigger than this! - * @param row_stride stride (in number of elements) between 2 adjacent columns - * @param globalmin final col-wise global minimum (size = ncols) - * @param globalmax final col-wise global maximum (size = ncols) - * @param sampledcols output sampled data. Pass nullptr if you don't need this - * @param stream cuda stream - * @note This method makes the following assumptions: - * 1. input and output matrices are assumed to be col-major - * 2. ncols is small enough to fit the whole of min/max values across all cols - * in shared memory - */ -template -void minmax(const T* data, - const unsigned* rowids, - const unsigned* colids, - int nrows, - int ncols, - int row_stride, - T* globalmin, - T* globalmax, - T* sampledcols, - cudaStream_t stream) -{ - using E = typename encode_traits::E; - int nblks = raft::ceildiv(ncols, TPB); - T init_val = std::numeric_limits::max(); - minmaxInitKernel<<>>(ncols, globalmin, globalmax, init_val); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - nblks = raft::ceildiv(nrows * ncols, TPB); - nblks = min(nblks, 65536); - size_t smemSize = sizeof(T) * 2 * ncols; - - // Compute the batch_ncols, in [1, ncols] range, that meet the available - // shared memory constraints. - auto smemPerBlk = raft::getSharedMemPerBlock(); - int batch_ncols = min(ncols, (int)(smemPerBlk / (sizeof(T) * 2))); - int num_batches = raft::ceildiv(ncols, batch_ncols); - smemSize = sizeof(T) * 2 * batch_ncols; - - minmaxKernel<<>>(data, - rowids, - colids, - nrows, - ncols, - row_stride, - globalmin, - globalmax, - sampledcols, - init_val, - batch_ncols, - num_batches); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - decodeKernel<<>>(globalmin, globalmax, ncols); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -}; // end namespace detail -}; // end namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/mutual_info_score.cuh b/cpp/include/cuvs/stats/detail/mutual_info_score.cuh deleted file mode 100644 index 0d8da56bd..000000000 --- a/cpp/include/cuvs/stats/detail/mutual_info_score.cuh +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file mutual_info_score.cuh - * @brief The Mutual Information is a measure of the similarity between two labels of - * the same data.This metric is independent of the absolute values of the labels: - * a permutation of the class or cluster label values won't change the - * score value in any way. - * This metric is furthermore symmetric.This can be useful to - * measure the agreement of two independent label assignments strategies - * on the same dataset when the real ground truth is not known. - */ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cuvs { -namespace stats { -namespace detail { - -/** - * @brief kernel to calculate the mutual info score - * @param dContingencyMatrix: the contingency matrix corresponding to the two clusters - * @param a: the row wise sum of the contingency matrix, which is also the bin counts of first - * cluster array - * @param b: the column wise sum of the contingency matrix, which is also the bin counts of second - * cluster array - * @param numUniqueClasses: number of unique classes - * @param size: the size of array a and b (size of the contingency matrix is (size x size)) - * @param d_MI: pointer to the device memory that stores the aggregate mutual information - */ -template -RAFT_KERNEL mutual_info_kernel(const int* dContingencyMatrix, - const int* a, - const int* b, - int numUniqueClasses, - int size, - double* d_MI) -{ - // calculating the indices of pairs of datapoints compared by the current thread - int j = threadIdx.x + blockIdx.x * blockDim.x; - int i = threadIdx.y + blockIdx.y * blockDim.y; - - // thread-local variable to count the mutual info - double localMI = 0.0; - - if (i < numUniqueClasses && j < numUniqueClasses && a[i] * b[j] != 0 && - dContingencyMatrix[i * numUniqueClasses + j] != 0) { - localMI += (double(dContingencyMatrix[i * numUniqueClasses + j])) * - (log(double(size) * double(dContingencyMatrix[i * numUniqueClasses + j])) - - log(double(a[i] * b[j]))); - } - - // specialize blockReduce for a 2D block of 1024 threads of type uint64_t - typedef cub::BlockReduce - BlockReduce; - - // Allocate shared memory for blockReduce - __shared__ typename BlockReduce::TempStorage temp_storage; - - // summing up thread-local counts specific to a block - localMI = BlockReduce(temp_storage).Sum(localMI); - __syncthreads(); - - // executed once per block - if (threadIdx.x == 0 && threadIdx.y == 0) { raft::myAtomicAdd(d_MI, localMI); } -} - -/** - * @brief Function to calculate the mutual information between two clusters - * more info on mutual information - * @param firstClusterArray: the array of classes of type T - * @param secondClusterArray: the array of classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - */ -template -double mutual_info_score(const T* firstClusterArray, - const T* secondClusterArray, - int size, - T lowerLabelRange, - T upperLabelRange, - cudaStream_t stream) -{ - int numUniqueClasses = upperLabelRange - lowerLabelRange + 1; - - // declaring, allocating and initializing memory for the contingency marix - rmm::device_uvector dContingencyMatrix(numUniqueClasses * numUniqueClasses, stream); - RAFT_CUDA_TRY(cudaMemsetAsync( - dContingencyMatrix.data(), 0, numUniqueClasses * numUniqueClasses * sizeof(int), stream)); - - // workspace allocation - size_t workspaceSz = raft::stats::getContingencyMatrixWorkspaceSize( - size, firstClusterArray, stream, lowerLabelRange, upperLabelRange); - rmm::device_uvector pWorkspace(workspaceSz, stream); - - // calculating the contingency matrix - raft::stats::contingencyMatrix(firstClusterArray, - secondClusterArray, - (int)size, - (int*)dContingencyMatrix.data(), - stream, - (void*)pWorkspace.data(), - workspaceSz, - lowerLabelRange, - upperLabelRange); - - // creating device buffers for all the parameters involved in ARI calculation - // device variables - rmm::device_uvector a(numUniqueClasses, stream); - rmm::device_uvector b(numUniqueClasses, stream); - rmm::device_scalar d_MI(stream); - - // host variables - double h_MI; - - // initializing device memory - RAFT_CUDA_TRY(cudaMemsetAsync(a.data(), 0, numUniqueClasses * sizeof(int), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(b.data(), 0, numUniqueClasses * sizeof(int), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(d_MI.data(), 0, sizeof(double), stream)); - - // calculating the row-wise sums - raft::linalg::reduce( - a.data(), dContingencyMatrix.data(), numUniqueClasses, numUniqueClasses, 0, true, true, stream); - - // calculating the column-wise sums - raft::linalg::reduce(b.data(), - dContingencyMatrix.data(), - numUniqueClasses, - numUniqueClasses, - 0, - true, - false, - stream); - - // kernel configuration - static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16; - dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y); - dim3 numBlocks(raft::ceildiv(numUniqueClasses, numThreadsPerBlock.x), - raft::ceildiv(numUniqueClasses, numThreadsPerBlock.y)); - - // calling the kernel - mutual_info_kernel<<>>( - dContingencyMatrix.data(), a.data(), b.data(), numUniqueClasses, size, d_MI.data()); - - // updating in the host memory - h_MI = d_MI.value(stream); - - raft::interruptible::synchronize(stream); - - return h_MI / size; -} - -}; // end namespace detail -}; // end namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/neighborhood_recall.cuh b/cpp/include/cuvs/stats/detail/neighborhood_recall.cuh deleted file mode 100644 index 11d044816..000000000 --- a/cpp/include/cuvs/stats/detail/neighborhood_recall.cuh +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#include - -namespace raft::stats::detail { - -template -RAFT_KERNEL neighborhood_recall( - raft::device_matrix_view indices, - raft::device_matrix_view ref_indices, - std::optional> - distances, - std::optional> - ref_distances, - raft::device_scalar_view recall_score, - DistanceValueType const eps) -{ - auto constexpr kThreadsPerBlock = 32; - IndexType const row_idx = blockIdx.x; - auto const lane_idx = threadIdx.x % kThreadsPerBlock; - - // Each warp stores a recall score computed across the columns per row - IndexType thread_recall_score = 0; - for (IndexType col_idx = lane_idx; col_idx < indices.extent(1); col_idx += kThreadsPerBlock) { - for (IndexType ref_col_idx = 0; ref_col_idx < ref_indices.extent(1); ref_col_idx++) { - if (indices(row_idx, col_idx) == ref_indices(row_idx, ref_col_idx)) { - thread_recall_score += 1; - break; - } else if (distances.has_value()) { - auto dist = distances.value()(row_idx, col_idx); - auto ref_dist = ref_distances.value()(row_idx, ref_col_idx); - DistanceValueType diff = raft::abs(dist - ref_dist); - DistanceValueType m = std::max(raft::abs(dist), raft::abs(ref_dist)); - DistanceValueType ratio = diff > eps ? diff / m : diff; - - if (ratio <= eps) { - thread_recall_score += 1; - break; - } - } - } - } - - // Reduce across a warp for row score - typedef cub::BlockReduce BlockReduce; - - __shared__ typename BlockReduce::TempStorage temp_storage; - - ScalarType row_recall_score = BlockReduce(temp_storage).Sum(thread_recall_score); - - // Reduce across all rows for global score - if (lane_idx == 0) { - cuda::atomic_ref device_recall_score{ - *recall_score.data_handle()}; - std::size_t const total_count = indices.extent(0) * indices.extent(1); - device_recall_score.fetch_add(row_recall_score / total_count); - } -} - -template -void neighborhood_recall( - raft::resources const& res, - raft::device_matrix_view indices, - raft::device_matrix_view ref_indices, - std::optional> - distances, - std::optional> - ref_distances, - raft::device_scalar_view recall_score, - DistanceValueType const eps) -{ - // One warp per row, launch a warp-width block per-row kernel - auto constexpr kThreadsPerBlock = 32; - auto const num_blocks = indices.extent(0); - - neighborhood_recall<<>>( - indices, ref_indices, distances, ref_distances, recall_score, eps); -} - -} // end namespace raft::stats::detail diff --git a/cpp/include/cuvs/stats/detail/rand_index.cuh b/cpp/include/cuvs/stats/detail/rand_index.cuh deleted file mode 100644 index f87ee66fa..000000000 --- a/cpp/include/cuvs/stats/detail/rand_index.cuh +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file rand_index.cuh - * @todo TODO(Ganesh Venkataramana): - *
- * The below rand_index calculation implementation is a Brute force one that uses
- (nElements*nElements) threads (2 dimensional grids and blocks)
- * For small datasets, this will suffice; but for larger ones, work done by the threads increase
- dramatically.
- * A more mathematically intensive implementation that uses half the above threads can be done,
- which will prove to be more efficient for larger datasets
- * the idea is as follows:
-  * instead of 2D block and grid configuration with a total of (nElements*nElements) threads (where
- each (i,j) through these threads represent an ordered pair selection of 2 data points), a 1D block
- and grid configuration with a total of (nElements*(nElements))/2 threads (each thread index
- represents an element part of the set of unordered pairwise selections from the dataset (nChoose2))
-  * In this setup, one has to generate a one-to-one mapping between this 1D thread index (for each
- kernel) and the unordered pair of chosen datapoints.
-  * More specifically, thread0-> {dataPoint1, dataPoint0}, thread1-> {dataPoint2, dataPoint0},
- thread2-> {dataPoint2, dataPoint1} ... thread((nElements*(nElements))/2 - 1)->
- {dataPoint(nElements-1),dataPoint(nElements-2)}
-  * say ,
-     * threadNum: thread index | threadNum = threadIdx.x + BlockIdx.x*BlockDim.x,
-     * i : index of dataPoint i
-     * j : index of dataPoint j
-  * then the mapping is as follows:
-     * i = ceil((-1 + sqrt(1 + 8*(1 + threadNum)))/2) = floor((1 + sqrt(1 + 8*threadNum))/2)
-     * j = threadNum - i(i-1)/2
-  * after obtaining the the pair of datapoints, calculation of rand index is the same as done in
- this implementation
- * Caveat: since the kernel implementation involves use of emulated sqrt() operations:
-  * the number of instructions executed per kernel is ~40-50 times
-  * as the O(nElements*nElements) increase beyond the floating point limit, floating point
- inaccuracies occur, and hence the above floor(...) !=  ceil(...)
- * 
- */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace cuvs { -namespace stats { -namespace detail { - -/** - * @brief kernel to calculate the values of a and b - * @param firstClusterArray: the array of classes of type T - * @param secondClusterArray: the array of classes of type T - * @param size: the size of the data points - * @param a: number of pairs of points that both the clusters have classified the same - * @param b: number of pairs of points that both the clusters have classified differently - */ -template -RAFT_KERNEL computeTheNumerator( - const T* firstClusterArray, const T* secondClusterArray, uint64_t size, uint64_t* a, uint64_t* b) -{ - // calculating the indices of pairs of datapoints compared by the current thread - uint64_t j = threadIdx.x + blockIdx.x * blockDim.x; - uint64_t i = threadIdx.y + blockIdx.y * blockDim.y; - - // thread-local variables to count a and b - uint64_t myA = 0, myB = 0; - - if (i < size && j < size && j < i) { - // checking if the pair have been classified the same by both the clusters - if (firstClusterArray[i] == firstClusterArray[j] && - secondClusterArray[i] == secondClusterArray[j]) { - ++myA; - } - - // checking if the pair have been classified differently by both the clusters - else if (firstClusterArray[i] != firstClusterArray[j] && - secondClusterArray[i] != secondClusterArray[j]) { - ++myB; - } - } - - // specialize blockReduce for a 2D block of 1024 threads of type uint64_t - typedef cub::BlockReduce - BlockReduce; - - // Allocate shared memory for blockReduce - __shared__ typename BlockReduce::TempStorage temp_storage; - - // summing up thread-local counts specific to a block - myA = BlockReduce(temp_storage).Sum(myA); - __syncthreads(); - myB = BlockReduce(temp_storage).Sum(myB); - __syncthreads(); - - // executed once per block - if (threadIdx.x == 0 && threadIdx.y == 0) { - raft::myAtomicAdd((unsigned long long int*)a, myA); - raft::myAtomicAdd((unsigned long long int*)b, myB); - } -} - -/** - * @brief Function to calculate RandIndex - * more info on rand index - * @param firstClusterArray: the array of classes of type T - * @param secondClusterArray: the array of classes of type T - * @param size: the size of the data points of type uint64_t - * @param stream: the cudaStream object - */ -template -double compute_rand_index(const T* firstClusterArray, - const T* secondClusterArray, - uint64_t size, - cudaStream_t stream) -{ - // rand index for size less than 2 is not defined - ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); - - // allocating and initializing memory for a and b in the GPU - rmm::device_uvector arr_buf(2, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); - - // kernel configuration - static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16; - dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y); - dim3 numBlocks(raft::ceildiv(size, numThreadsPerBlock.x), - raft::ceildiv(size, numThreadsPerBlock.y)); - - // calling the kernel - computeTheNumerator<<>>( - firstClusterArray, secondClusterArray, size, arr_buf.data(), arr_buf.data() + 1); - - // synchronizing and updating the calculated values of a and b from device to host - uint64_t ab_host[2] = {0}; - raft::update_host(ab_host, arr_buf.data(), 2, stream); - raft::interruptible::synchronize(stream); - - // error handling - RAFT_CUDA_TRY(cudaGetLastError()); - - // denominator - uint64_t nChooseTwo = size * (size - 1) / 2; - - // calculating the rand_index - return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo); -} - -}; // end namespace detail -}; // end namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/scores.cuh b/cpp/include/cuvs/stats/detail/scores.cuh deleted file mode 100644 index f2e21ea2b..000000000 --- a/cpp/include/cuvs/stats/detail/scores.cuh +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define N_THREADS 512 - -namespace cuvs { -namespace stats { -namespace detail { -/** - * Calculates the "Coefficient of Determination" (R-Squared) score - * normalizing the sum of squared errors by the total sum of squares. - * - * This score indicates the proportionate amount of variation in an - * expected response variable is explained by the independent variables - * in a linear regression model. The larger the R-squared value, the - * more variability is explained by the linear regression model. - * - * @param y: Array of ground-truth response variables - * @param y_hat: Array of predicted response variables - * @param n: Number of elements in y and y_hat - * @param stream: cuda stream - * @return: The R-squared value. - */ -template -math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream) -{ - rmm::device_scalar y_bar(stream); - - raft::stats::mean(y_bar.data(), y, 1, n, false, false, stream); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - rmm::device_uvector sse_arr(n, stream); - - raft::linalg::eltwiseSub(sse_arr.data(), y, y_hat, n, stream); - raft::linalg::powerScalar(sse_arr.data(), sse_arr.data(), math_t(2.0), n, stream); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - rmm::device_uvector ssto_arr(n, stream); - - raft::linalg::subtractDevScalar(ssto_arr.data(), y, y_bar.data(), n, stream); - raft::linalg::powerScalar(ssto_arr.data(), ssto_arr.data(), math_t(2.0), n, stream); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - thrust::device_ptr d_sse = thrust::device_pointer_cast(sse_arr.data()); - thrust::device_ptr d_ssto = thrust::device_pointer_cast(ssto_arr.data()); - - math_t sse = thrust::reduce(thrust::cuda::par.on(stream), d_sse, d_sse + n); - math_t ssto = thrust::reduce(thrust::cuda::par.on(stream), d_ssto, d_ssto + n); - - return 1.0 - sse / ssto; -} - -/** - * @brief Compute accuracy of predictions. Useful for classification. - * @tparam math_t: data type for predictions (e.g., int for classification) - * @param[in] predictions: array of predictions (GPU pointer). - * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer). - * @param[in] n: number of elements in each of predictions, ref_predictions. - * @param[in] stream: cuda stream. - * @return: Accuracy score in [0, 1]; higher is better. - */ -template -float accuracy_score(const math_t* predictions, - const math_t* ref_predictions, - int n, - cudaStream_t stream) -{ - unsigned long long correctly_predicted = 0ULL; - rmm::device_uvector diffs_array(n, stream); - - // TODO could write a kernel instead - raft::linalg::eltwiseSub(diffs_array.data(), predictions, ref_predictions, n, stream); - RAFT_CUDA_TRY(cudaGetLastError()); - correctly_predicted = - thrust::count(thrust::cuda::par.on(stream), diffs_array.data(), diffs_array.data() + n, 0); - - float accuracy = correctly_predicted * 1.0f / n; - return accuracy; -} - -template -RAFT_KERNEL reg_metrics_kernel( - const T* predictions, const T* ref_predictions, int n, double* abs_diffs, double* tmp_sums) -{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - __shared__ double shmem[2]; // {abs_difference_sum, squared difference sum} - - for (int i = threadIdx.x; i < 2; i += blockDim.x) { - shmem[i] = 0; - } - __syncthreads(); - - for (int i = tid; i < n; i += blockDim.x * gridDim.x) { - double diff = predictions[i] - ref_predictions[i]; - double abs_diff = abs(diff); - raft::myAtomicAdd(&shmem[0], abs_diff); - raft::myAtomicAdd(&shmem[1], diff * diff); - - // update absolute difference in global memory for subsequent abs. median computation - abs_diffs[i] = abs_diff; - } - __syncthreads(); - - // Update tmp_sum w/ total abs_difference_sum and squared difference sum. - for (int i = threadIdx.x; i < 2; i += blockDim.x) { - raft::myAtomicAdd(&tmp_sums[i], shmem[i]); - } -} - -/** - * @brief Compute regression metrics mean absolute error, mean squared error, median absolute error - * @tparam T: data type for predictions (e.g., float or double for regression). - * @param[in] predictions: array of predictions (GPU pointer). - * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer). - * @param[in] n: number of elements in each of predictions, ref_predictions. Should be > 0. - * @param[in] stream: cuda stream. - * @param[out] mean_abs_error: Mean Absolute Error. Sum over n of (|predictions[i] - - * ref_predictions[i]|) / n. - * @param[out] mean_squared_error: Mean Squared Error. Sum over n of ((predictions[i] - - * ref_predictions[i])^2) / n. - * @param[out] median_abs_error: Median Absolute Error. Median of |predictions[i] - - * ref_predictions[i]| for i in [0, n). - */ -template -void regression_metrics(const T* predictions, - const T* ref_predictions, - int n, - cudaStream_t stream, - double& mean_abs_error, - double& mean_squared_error, - double& median_abs_error) -{ - std::vector mean_errors(2); - std::vector h_sorted_abs_diffs(n); - int thread_cnt = 256; - int block_cnt = raft::ceildiv(n, thread_cnt); - - int array_size = n * sizeof(double); - rmm::device_uvector abs_diffs_array(array_size, stream); - rmm::device_uvector sorted_abs_diffs(array_size, stream); - rmm::device_uvector tmp_sums(2 * sizeof(double), stream); - RAFT_CUDA_TRY(cudaMemsetAsync(tmp_sums.data(), 0, 2 * sizeof(double), stream)); - - reg_metrics_kernel<<>>( - predictions, ref_predictions, n, abs_diffs_array.data(), tmp_sums.data()); - RAFT_CUDA_TRY(cudaGetLastError()); - raft::update_host(&mean_errors[0], tmp_sums.data(), 2, stream); - raft::interruptible::synchronize(stream); - - mean_abs_error = mean_errors[0] / n; - mean_squared_error = mean_errors[1] / n; - - // Compute median error. Sort diffs_array and pick median value - char* temp_storage = nullptr; - size_t temp_storage_bytes; - RAFT_CUDA_TRY(cub::DeviceRadixSort::SortKeys((void*)temp_storage, - temp_storage_bytes, - abs_diffs_array.data(), - sorted_abs_diffs.data(), - n, - 0, - 8 * sizeof(double), - stream)); - rmm::device_uvector temp_storage_v(temp_storage_bytes, stream); - temp_storage = temp_storage_v.data(); - RAFT_CUDA_TRY(cub::DeviceRadixSort::SortKeys((void*)temp_storage, - temp_storage_bytes, - abs_diffs_array.data(), - sorted_abs_diffs.data(), - n, - 0, - 8 * sizeof(double), - stream)); - - raft::update_host(h_sorted_abs_diffs.data(), sorted_abs_diffs.data(), n, stream); - raft::interruptible::synchronize(stream); - - int middle = n / 2; - if (n % 2 == 1) { - median_abs_error = h_sorted_abs_diffs[middle]; - } else { - median_abs_error = (h_sorted_abs_diffs[middle] + h_sorted_abs_diffs[middle - 1]) / 2; - } -} -} // namespace detail -} // namespace stats -} // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/silhouette_score.cuh b/cpp/include/cuvs/stats/detail/silhouette_score.cuh deleted file mode 100644 index ac5243e74..000000000 --- a/cpp/include/cuvs/stats/detail/silhouette_score.cuh +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cuvs { -namespace stats { -namespace detail { - -/** - * @brief kernel that calculates the average intra-cluster distance for every sample data point and - * updates the cluster distance to max value - * @tparam DataT: type of the data samples - * @tparam LabelT: type of the labels - * @param sampleToClusterSumOfDistances: the pointer to the 2D array that contains the sum of - * distances from every sample to every cluster (nRows x nLabels) - * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster (1 x - * nLabels) - * @param d_aArray: the pointer to the array of average intra-cluster distances for every sample in - * device memory (1 x nRows) - * @param labels: the pointer to the array containing labels for every data sample (1 x nRows) - * @param nRows: number of data samples - * @param nLabels: number of Labels - * @param MAX_VAL: DataT specific upper limit - */ -template -RAFT_KERNEL populateAKernel(DataT* sampleToClusterSumOfDistances, - DataT* binCountArray, - DataT* d_aArray, - const LabelT* labels, - int nRows, - int nLabels, - const DataT MAX_VAL) -{ - // getting the current index - int sampleIndex = threadIdx.x + blockIdx.x * blockDim.x; - - if (sampleIndex >= nRows) return; - - // sampleDistanceVector is an array that stores that particular row of the distanceMatrix - DataT* sampleToClusterSumOfDistancesVector = - &sampleToClusterSumOfDistances[sampleIndex * nLabels]; - - LabelT sampleCluster = labels[sampleIndex]; - - int sampleClusterIndex = (int)sampleCluster; - - if (binCountArray[sampleClusterIndex] - 1 <= 0) { - d_aArray[sampleIndex] = -1; - return; - - } - - else { - d_aArray[sampleIndex] = (sampleToClusterSumOfDistancesVector[sampleClusterIndex]) / - (binCountArray[sampleClusterIndex] - 1); - - // modifying the sampleDistanceVector to give sample average distance - sampleToClusterSumOfDistancesVector[sampleClusterIndex] = MAX_VAL; - } -} - -/** - * @brief function to calculate the bincounts of number of samples in every label - * @tparam DataT: type of the data samples - * @tparam LabelT: type of the labels - * @param labels: the pointer to the array containing labels for every data sample (1 x nRows) - * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster (1 x - * nLabels) - * @param nRows: number of data samples - * @param nUniqueLabels: number of Labels - * @param workspace: device buffer containing workspace memory - * @param stream: the cuda stream where to launch this kernel - */ -template -void countLabels(const LabelT* labels, - DataT* binCountArray, - int nRows, - int nUniqueLabels, - rmm::device_uvector& workspace, - cudaStream_t stream) -{ - int num_levels = nUniqueLabels + 1; - LabelT lower_level = 0; - LabelT upper_level = nUniqueLabels; - size_t temp_storage_bytes = 0; - - rmm::device_uvector countArray(nUniqueLabels, stream); - - RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(nullptr, - temp_storage_bytes, - labels, - binCountArray, - num_levels, - lower_level, - upper_level, - nRows, - stream)); - - workspace.resize(temp_storage_bytes, stream); - - RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace.data(), - temp_storage_bytes, - labels, - binCountArray, - num_levels, - lower_level, - upper_level, - nRows, - stream)); -} - -/** - * @brief structure that defines the division Lambda for elementwise op - */ -template -struct DivOp { - HDI DataT operator()(DataT a, int b, int c) - { - if (b == 0) - return ULLONG_MAX; - else - return a / b; - } -}; - -/** - * @brief structure that defines the elementwise operation to calculate silhouette score using - * params 'a' and 'b' - */ -template -struct SilOp { - HDI DataT operator()(DataT a, DataT b) - { - if (a == 0 && b == 0 || a == b) - return 0; - else if (a == -1) - return 0; - else if (a > b) - return (b - a) / a; - else - return (b - a) / b; - } -}; - -/** - * @brief main function that returns the average silhouette score for a given set of data and its - * clusterings - * @tparam DataT: type of the data samples - * @tparam LabelT: type of the labels - * @param X_in: pointer to the input Data samples array (nRows x nCols) - * @param nRows: number of data samples - * @param nCols: number of features - * @param labels: the pointer to the array containing labels for every data sample (1 x nRows) - * @param nLabels: number of Labels - * @param silhouette_scorePerSample: pointer to the array that is optionally taken in as input and - * is populated with the silhouette score for every sample (1 x nRows) - * @param stream: the cuda stream where to launch this kernel - * @param metric: the numerical value that maps to the type of distance metric to be used in the - * calculations - */ -template -DataT silhouette_score( - raft::resources const& handle, - const DataT* X_in, - int nRows, - int nCols, - const LabelT* labels, - int nLabels, - DataT* silhouette_scorePerSample, - cudaStream_t stream, - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded) -{ - ASSERT(nLabels >= 2 && nLabels <= (nRows - 1), - "silhouette Score not defined for the given number of labels!"); - - // compute the distance matrix - rmm::device_uvector distanceMatrix(nRows * nRows, stream); - rmm::device_uvector workspace(1, stream); - - cuvs::distance::pairwise_distance( - handle, X_in, X_in, distanceMatrix.data(), nRows, nRows, nCols, metric); - - // deciding on the array of silhouette scores for each dataPoint - rmm::device_uvector silhouette_scoreSamples(0, stream); - DataT* perSampleSilScore = nullptr; - if (silhouette_scorePerSample == nullptr) { - silhouette_scoreSamples.resize(nRows, stream); - perSampleSilScore = silhouette_scoreSamples.data(); - } else { - perSampleSilScore = silhouette_scorePerSample; - } - RAFT_CUDA_TRY(cudaMemsetAsync(perSampleSilScore, 0, nRows * sizeof(DataT), stream)); - - // getting the sample count per cluster - rmm::device_uvector binCountArray(nLabels, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(binCountArray.data(), 0, nLabels * sizeof(DataT), stream)); - countLabels(labels, binCountArray.data(), nRows, nLabels, workspace, stream); - - // calculating the sample-cluster-distance-sum-array - rmm::device_uvector sampleToClusterSumOfDistances(nRows * nLabels, stream); - RAFT_CUDA_TRY(cudaMemsetAsync( - sampleToClusterSumOfDistances.data(), 0, nRows * nLabels * sizeof(DataT), stream)); - raft::linalg::reduce_cols_by_key(distanceMatrix.data(), - labels, - sampleToClusterSumOfDistances.data(), - nRows, - nRows, - nLabels, - stream); - - // creating the a array and b array - rmm::device_uvector d_aArray(nRows, stream); - rmm::device_uvector d_bArray(nRows, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(d_aArray.data(), 0, nRows * sizeof(DataT), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(d_bArray.data(), 0, nRows * sizeof(DataT), stream)); - - // kernel that populates the d_aArray - // kernel configuration - dim3 numThreadsPerBlock(32, 1, 1); - dim3 numBlocks(raft::ceildiv(nRows, numThreadsPerBlock.x), 1, 1); - - // calling the kernel - populateAKernel<<>>( - sampleToClusterSumOfDistances.data(), - binCountArray.data(), - d_aArray.data(), - labels, - nRows, - nLabels, - std::numeric_limits::max()); - - // elementwise dividing by bincounts - rmm::device_uvector averageDistanceBetweenSampleAndCluster(nRows * nLabels, stream); - RAFT_CUDA_TRY(cudaMemsetAsync( - averageDistanceBetweenSampleAndCluster.data(), 0, nRows * nLabels * sizeof(DataT), stream)); - - raft::linalg::matrixVectorOp(averageDistanceBetweenSampleAndCluster.data(), - sampleToClusterSumOfDistances.data(), - binCountArray.data(), - binCountArray.data(), - nLabels, - nRows, - true, - true, - DivOp(), - stream); - - // calculating row-wise minimum - raft::linalg::reduce( - d_bArray.data(), - averageDistanceBetweenSampleAndCluster.data(), - nLabels, - nRows, - std::numeric_limits::max(), - true, - true, - stream, - false, - raft::identity_op{}, - raft::min_op{}); - - // calculating the silhouette score per sample using the d_aArray and d_bArray - raft::linalg::binaryOp>( - perSampleSilScore, d_aArray.data(), d_bArray.data(), nRows, SilOp(), stream); - - // calculating the sum of all the silhouette score - rmm::device_scalar d_avgSilhouetteScore(stream); - RAFT_CUDA_TRY(cudaMemsetAsync(d_avgSilhouetteScore.data(), 0, sizeof(DataT), stream)); - - raft::linalg::mapThenSumReduce(d_avgSilhouetteScore.data(), - nRows, - raft::identity_op(), - stream, - perSampleSilScore, - perSampleSilScore); - - DataT avgSilhouetteScore = d_avgSilhouetteScore.value(stream); - - resource::sync_stream(handle, stream); - - avgSilhouetteScore /= nRows; - - return avgSilhouetteScore; -} - -}; // namespace detail -}; // namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/stddev.cuh b/cpp/include/cuvs/stats/detail/stddev.cuh deleted file mode 100644 index c5a725872..000000000 --- a/cpp/include/cuvs/stats/detail/stddev.cuh +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -namespace cuvs { -namespace stats { -namespace detail { - -///@todo: ColPerBlk has been tested only for 32! -template -RAFT_KERNEL stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - Type val = (colId < D) ? data[i * D + colId] : Type(0); - thread_data += val * val; - } - __shared__ Type sstd[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) sstd[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(sstd + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(std + colId, sstd[thisColId]); -} - -template -RAFT_KERNEL stddevKernelColMajor(Type* std, const Type* data, const Type* mu, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - Type diff = data[idx] - m; - thread_data += diff * diff; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { std[blockIdx.x] = raft::sqrt(acc / N); } -} - -template -RAFT_KERNEL varsKernelColMajor(Type* var, const Type* data, const Type* mu, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - Type diff = data[idx] - m; - thread_data += diff * diff; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; } -} - -/** - * @brief Compute stddev of the input matrix - * - * Stddev operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param std the output stddev vector - * @param data the input matrix - * @param mu the mean vector - * @param D number of columns of data - * @param N number of rows of data - * @param sample whether to evaluate sample stddev or not. In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - * @param rowMajor whether the input data is row or col major - * @param stream cuda stream where to launch work - */ -template -void stddev(Type* std, - const Type* data, - const Type* mu, - IdxType D, - IdxType N, - bool sample, - bool rowMajor, - cudaStream_t stream) -{ - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemset(std, 0, sizeof(Type) * D)); - stddevKernelRowMajor<<>>(std, data, D, N); - Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - std, - std, - mu, - D, - [ratio] __device__(Type a, Type b) { return raft::sqrt(a * ratio - b * b); }, - stream); - } else { - stddevKernelColMajor<<>>(std, data, mu, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -/** - * @brief Compute variance of the input matrix - * - * Variance operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param var the output stddev vector - * @param data the input matrix - * @param mu the mean vector - * @param D number of columns of data - * @param N number of rows of data - * @param sample whether to evaluate sample stddev or not. In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - * @param rowMajor whether the input data is row or col major - * @param stream cuda stream where to launch work - */ -template -void vars(Type* var, - const Type* data, - const Type* mu, - IdxType D, - IdxType N, - bool sample, - bool rowMajor, - cudaStream_t stream) -{ - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemset(var, 0, sizeof(Type) * D)); - stddevKernelRowMajor<<>>(var, data, D, N); - Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); - } else { - varsKernelColMajor<<>>(var, data, mu, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -} // namespace detail -} // namespace stats -} // namespace cuvs \ No newline at end of file diff --git a/cpp/include/cuvs/stats/detail/sum.cuh b/cpp/include/cuvs/stats/detail/sum.cuh deleted file mode 100644 index 6014c56f7..000000000 --- a/cpp/include/cuvs/stats/detail/sum.cuh +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -namespace cuvs { -namespace stats { -namespace detail { - -///@todo: ColsPerBlk has been tested only for 32! -template -RAFT_KERNEL sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) - thread_data += (colId < D) ? data[i * D + colId] : Type(0); - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - thread_data += data[idx]; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { mu[blockIdx.x] = acc; } -} - -template -void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) -{ - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemset(output, 0, sizeof(Type) * D)); - sumKernelRowMajor - <<>>(output, input, D, N); - } else { - sumKernelColMajor<<>>(output, input, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -} // namespace detail -} // namespace stats -} // namespace cuvs \ No newline at end of file diff --git a/cpp/include/cuvs/stats/detail/trustworthiness_score.cuh b/cpp/include/cuvs/stats/detail/trustworthiness_score.cuh deleted file mode 100644 index 22608c527..000000000 --- a/cpp/include/cuvs/stats/detail/trustworthiness_score.cuh +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include - -#define N_THREADS 512 - -namespace cuvs { -namespace stats { -namespace detail { - -/** - * @brief Build the lookup table - * @param[out] lookup_table: Lookup table giving nearest neighbor order - * of pairwise distance calculations given sample index - * @param[in] X_ind: Sorted indexes of pairwise distance calculations of X - * @param n: Number of samples - * @param work: Number of elements to consider - */ -RAFT_KERNEL build_lookup_table(int* lookup_table, const int* X_ind, int n, int work) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= work) return; - - int sample_idx = i / n; - int nn_idx = i % n; - - int idx = X_ind[i]; - lookup_table[(sample_idx * n) + idx] = nn_idx; -} - -/** - * @brief Compute a the rank of trustworthiness score - * @param[out] rank: Resulting rank - * @param[out] lookup_table: Lookup table giving nearest neighbor order - * of pairwise distance calculations given sample index - * @param[in] emb_ind: Indexes of KNN on embeddings - * @param n: Number of samples - * @param n_neighbors: Number of neighbors considered by trustworthiness score - * @param work: Batch to consider (to do it at once use n * n_neighbors) - */ -template -RAFT_KERNEL compute_rank(double* rank, - const int* lookup_table, - const knn_index_t* emb_ind, - int n, - int n_neighbors, - int work) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= work) return; - - int sample_idx = i / n_neighbors; - - knn_index_t emb_nn_ind = emb_ind[i]; - - int r = lookup_table[(sample_idx * n) + emb_nn_ind]; - int tmp = r - n_neighbors + 1; - if (tmp > 0) raft::myAtomicAdd(rank, tmp); -} - -/** - * @brief Compute a kNN and returns the indices of the nearest neighbors - * @param h Raft handle - * @param[in] input Input matrix containing the dataset - * @param n Number of samples - * @param d Number of features - * @param n_neighbors number of neighbors - * @param[out] indices KNN indexes - * @param[out] distances KNN distances - */ -template -void run_knn(const raft::resources& h, - math_t* input, - int n, - int d, - int n_neighbors, - int64_t* indices, - math_t* distances) -{ - std::vector ptrs(1); - std::vector sizes(1); - ptrs[0] = input; - sizes[0] = n; - - cuvs::spatial::knn::brute_force_knn(h, - ptrs, - sizes, - d, - input, - n, - indices, - distances, - n_neighbors, - true, - true, - nullptr, - distance_type); -} - -/** - * @brief Compute the trustworthiness score - * @param h Raft handle - * @param X[in]: Data in original dimension - * @param X_embedded[in]: Data in target dimension (embedding) - * @param n: Number of samples - * @param m: Number of features in high/original dimension - * @param d: Number of features in low/embedded dimension - * @param n_neighbors Number of neighbors considered by trustworthiness score - * @param batchSize Batch size - * @return Trustworthiness score - */ -template -double trustworthiness_score(const raft::resources& h, - const math_t* X, - math_t* X_embedded, - int n, - int m, - int d, - int n_neighbors, - int batchSize = 512) -{ - cudaStream_t stream = resource::get_cuda_stream(h); - - const int KNN_ALLOC = n * (n_neighbors + 1); - rmm::device_uvector emb_ind(KNN_ALLOC, stream); - rmm::device_uvector emb_dist(KNN_ALLOC, stream); - - run_knn(h, X_embedded, n, d, n_neighbors + 1, emb_ind.data(), emb_dist.data()); - - const int PAIRWISE_ALLOC = batchSize * n; - rmm::device_uvector X_ind(PAIRWISE_ALLOC, stream); - rmm::device_uvector X_dist(PAIRWISE_ALLOC, stream); - rmm::device_uvector lookup_table(PAIRWISE_ALLOC, stream); - - double t = 0.0; - rmm::device_scalar t_dbuf(stream); - - int toDo = n; - while (toDo > 0) { - int curBatchSize = min(toDo, batchSize); - - // Takes at most batchSize vectors at a time - cuvs::distance::pairwise_distance( - h, &X[(n - toDo) * m], X, X_dist.data(), curBatchSize, n, m, distance_type); - - size_t colSortWorkspaceSize = 0; - bool bAllocWorkspace = false; - - raft::matrix::sort_cols_per_row(X_dist.data(), - X_ind.data(), - curBatchSize, - n, - bAllocWorkspace, - nullptr, - colSortWorkspaceSize, - stream); - - if (bAllocWorkspace) { - rmm::device_uvector sortColsWorkspace(colSortWorkspaceSize, stream); - - raft::matrix::sort_cols_per_row(X_dist.data(), - X_ind.data(), - curBatchSize, - n, - bAllocWorkspace, - sortColsWorkspace.data(), - colSortWorkspaceSize, - stream); - } - - int work = curBatchSize * n; - int n_blocks = raft::ceildiv(work, N_THREADS); - build_lookup_table<<>>( - lookup_table.data(), X_ind.data(), n, work); - - RAFT_CUDA_TRY(cudaMemsetAsync(t_dbuf.data(), 0, sizeof(double), stream)); - - work = curBatchSize * (n_neighbors + 1); - n_blocks = raft::ceildiv(work, N_THREADS); - compute_rank<<>>( - t_dbuf.data(), - lookup_table.data(), - &emb_ind.data()[(n - toDo) * (n_neighbors + 1)], - n, - n_neighbors + 1, - work); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - t += t_dbuf.value(stream); - - toDo -= curBatchSize; - } - - t = 1.0 - ((2.0 / ((n * n_neighbors) * ((2.0 * n) - (3.0 * n_neighbors) - 1.0))) * t); - - return t; -} - -} // namespace detail -} // namespace stats -} // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/v_measure.cuh b/cpp/include/cuvs/stats/detail/v_measure.cuh deleted file mode 100644 index 3a0e5c396..000000000 --- a/cpp/include/cuvs/stats/detail/v_measure.cuh +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file v_measure.cuh - */ - -#include - -namespace cuvs { -namespace stats { -namespace detail { - -/** - * @brief Function to calculate the v-measure between two clusters - * - * @param truthClusterArray: the array of truth classes of type T - * @param predClusterArray: the array of predicted classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - * @param beta: v_measure parameter - */ -template -double v_measure(const T* truthClusterArray, - const T* predClusterArray, - int size, - T lowerLabelRange, - T upperLabelRange, - cudaStream_t stream, - double beta = 1.0) -{ - double computedHomogeity, computedCompleteness, computedVMeasure; - - computedHomogeity = raft::stats::homogeneity_score( - truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream); - computedCompleteness = raft::stats::homogeneity_score( - predClusterArray, truthClusterArray, size, lowerLabelRange, upperLabelRange, stream); - - if (computedCompleteness + computedHomogeity == 0.0) - computedVMeasure = 0.0; - else - computedVMeasure = ((1 + beta) * computedHomogeity * computedCompleteness / - (beta * computedHomogeity + computedCompleteness)); - - return computedVMeasure; -} - -}; // end namespace detail -}; // end namespace stats -}; // namespace cuvs diff --git a/cpp/include/cuvs/stats/detail/weighted_mean.cuh b/cpp/include/cuvs/stats/detail/weighted_mean.cuh deleted file mode 100644 index 803c45fae..000000000 --- a/cpp/include/cuvs/stats/detail/weighted_mean.cuh +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace cuvs { -namespace stats { -namespace detail { - -/** - * @brief Compute the row-wise weighted mean of the input matrix with a - * vector of weights - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param mu the output mean vector - * @param data the input matrix - * @param weights weight of size D if along_row is true, else of size N - * @param D number of columns of data - * @param N number of rows of data - * @param row_major data input matrix is row-major or not - * @param along_rows whether to reduce along rows or columns - * @param stream cuda stream to launch work on - */ -template -void weightedMean(Type* mu, - const Type* data, - const Type* weights, - IdxType D, - IdxType N, - bool row_major, - bool along_rows, - cudaStream_t stream) -{ - // sum the weights & copy back to CPU - auto weight_size = along_rows ? D : N; - Type WS = 0; - raft::stats::sum(mu, weights, (IdxType)1, weight_size, false, stream); - raft::update_host(&WS, mu, 1, stream); - - raft::linalg::reduce( - mu, - data, - D, - N, - (Type)0, - row_major, - along_rows, - stream, - false, - [weights] __device__(Type v, IdxType i) { return v * weights[i]; }, - raft::add_op{}, - raft::div_const_op(WS)); -} -}; // end namespace detail -}; // end namespace stats -}; // namespace cuvs \ No newline at end of file diff --git a/cpp/include/cuvs/stats/dispersion.cuh b/cpp/include/cuvs/stats/dispersion.cuh deleted file mode 100644 index 7cddd679a..000000000 --- a/cpp/include/cuvs/stats/dispersion.cuh +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DISPERSION_H -#define __DISPERSION_H - -#pragma once - -#include -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Compute cluster dispersion metric. This is very useful for - * automatically finding the 'k' (in kmeans) that improves this metric. - * @tparam DataT data type - * @tparam IdxT index type - * @tparam TPB threads block for kernels launched - * @param centroids the cluster centroids. This is assumed to be row-major - * and of dimension (nClusters x dim) - * @param clusterSizes number of points in the dataset which belong to each - * cluster. This is of length nClusters - * @param globalCentroid compute the global weighted centroid of all cluster - * centroids. This is of length dim. Pass a nullptr if this is not needed - * @param nClusters number of clusters - * @param nPoints number of points in the dataset - * @param dim dataset dimensionality - * @param stream cuda stream - * @return the cluster dispersion value - */ -template -DataT dispersion(const DataT* centroids, - const IdxT* clusterSizes, - DataT* globalCentroid, - IdxT nClusters, - IdxT nPoints, - IdxT dim, - cudaStream_t stream) -{ - return detail::dispersion( - centroids, clusterSizes, globalCentroid, nClusters, nPoints, dim, stream); -} - -/** - * @defgroup stats_cluster_dispersion Cluster Dispersion Metric - * @{ - */ - -/** - * @brief Compute cluster dispersion metric. This is very useful for - * automatically finding the 'k' (in kmeans) that improves this metric. - * The cluster dispersion metric is defined as the square root of the sum of the - * squared distances between the cluster centroids and the global centroid - * @tparam value_t data type - * @tparam idx_t index type - * @param[in] handle the raft handle - * @param[in] centroids the cluster centroids. This is assumed to be row-major - * and of dimension (n_clusters x dim) - * @param[in] cluster_sizes number of points in the dataset which belong to each - * cluster. This is of length n_clusters - * @param[out] global_centroid compute the global weighted centroid of all cluster - * centroids. This is of length dim. Use std::nullopt to not return it. - * @param[in] n_points number of points in the dataset - * @return the cluster dispersion value - */ -template -value_t cluster_dispersion( - raft::resources const& handle, - raft::device_matrix_view centroids, - raft::device_vector_view cluster_sizes, - std::optional> global_centroid, - const idx_t n_points) -{ - RAFT_EXPECTS(cluster_sizes.extent(0) == centroids.extent(0), "Size mismatch"); - RAFT_EXPECTS(cluster_sizes.is_exhaustive(), "cluster_sizes must be contiguous"); - - value_t* global_centroid_ptr = nullptr; - if (global_centroid.has_value()) { - RAFT_EXPECTS(global_centroid.value().extent(0) == centroids.extent(1), - "Size mismatch between global_centroid and centroids"); - RAFT_EXPECTS(global_centroid.value().is_exhaustive(), "global_centroid must be contiguous"); - global_centroid_ptr = global_centroid.value().data_handle(); - } - return detail::dispersion(centroids.data_handle(), - cluster_sizes.data_handle(), - global_centroid_ptr, - centroids.extent(0), - n_points, - centroids.extent(1), - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_cluster_dispersion - -/** - * @brief Overload of `cluster_dispersion` to help the - * compiler find the above overload, in case users pass in - * `std::nullopt` for the optional arguments. - * - * Please see above for documentation of `cluster_dispersion`. - */ -template -value_t cluster_dispersion( - raft::resources const& handle, - raft::device_matrix_view centroids, - raft::device_vector_view cluster_sizes, - std::nullopt_t global_centroid, - const idx_t n_points) -{ - std::optional> opt_centroid = global_centroid; - return cluster_dispersion(handle, centroids, cluster_sizes, opt_centroid, n_points); -} -} // end namespace stats -} // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/entropy.cuh b/cpp/include/cuvs/stats/entropy.cuh deleted file mode 100644 index 01e188c0d..000000000 --- a/cpp/include/cuvs/stats/entropy.cuh +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ENTROPY_H -#define __ENTROPY_H - -#pragma once -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Function to calculate entropy - * more info on entropy - * - * @tparam T data type - * @param clusterArray: the array of classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - * @return the entropy score - */ -template -double entropy(const T* clusterArray, - const int size, - const T lowerLabelRange, - const T upperLabelRange, - cudaStream_t stream) -{ - return detail::entropy(clusterArray, size, lowerLabelRange, upperLabelRange, stream); -} - -/** - * @defgroup stats_entropy Entropy - * @{ - */ - -/** - * @brief Function to calculate entropy - * more info on entropy - * - * @tparam value_t data type - * @tparam idx_t index type - * @param[in] handle the raft handle - * @param[in] cluster_array: the array of classes of type value_t - * @param[in] lower_label_range: the lower bound of the range of labels - * @param[in] upper_label_range: the upper bound of the range of labels - * @return the entropy score - */ -template -double entropy(raft::resources const& handle, - raft::device_vector_view cluster_array, - const value_t lower_label_range, - const value_t upper_label_range) -{ - RAFT_EXPECTS(cluster_array.is_exhaustive(), "cluster_array must be contiguous"); - return detail::entropy(cluster_array.data_handle(), - cluster_array.extent(0), - lower_label_range, - upper_label_range, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_entropy - -}; // end namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/histogram.cuh b/cpp/include/cuvs/stats/histogram.cuh deleted file mode 100644 index 97127f45f..000000000 --- a/cpp/include/cuvs/stats/histogram.cuh +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __HISTOGRAM_H -#define __HISTOGRAM_H - -#pragma once - -#include -#include -#include -#include - -// This file is a shameless amalgamation of independent works done by -// Lars Nyland and Andy Adinets - -///@todo: add cub's histogram as another option - -namespace cuvs { -namespace stats { - -/** - * Default mapper which just returns the value of the data itself - */ -template -struct IdentityBinner : public detail::IdentityBinner { - IdentityBinner() : detail::IdentityBinner() {} -}; - -/** - * @brief Perform histogram on the input data. It chooses the right load size - * based on the input data vector length. It also supports large-bin cases - * using a specialized smem-based hashing technique. - * @tparam DataT input data type - * @tparam IdxT data type used to compute indices - * @tparam BinnerOp takes the input data and computes its bin index - * @param type histogram implementation type to choose - * @param bins the output bins (length = ncols * nbins) - * @param nbins number of bins - * @param data input data (length = ncols * nrows) - * @param nrows data array length in each column (or batch) - * @param ncols number of columns (or batch size) - * @param stream cuda stream - * @param binner the operation that computes the bin index of the input data - * - * @note signature of BinnerOp is `int func(DataT, IdxT);` - */ -template > -void histogram(HistType type, - int* bins, - IdxT nbins, - const DataT* data, - IdxT nrows, - IdxT ncols, - cudaStream_t stream, - BinnerOp binner = IdentityBinner()) -{ - detail::histogram(type, bins, nbins, data, nrows, ncols, stream, binner); -} - -/** - * @defgroup stats_histogram Histogram - * @{ - */ - -/** - * @brief Perform histogram on the input data. It chooses the right load size - * based on the input data vector length. It also supports large-bin cases - * using a specialized smem-based hashing technique. - * @tparam value_t input data type - * @tparam idx_t data type used to compute indices - * @tparam binner_op takes the input data and computes its bin index - * @param[in] handle the raft handle - * @param[in] type histogram implementation type to choose - * @param[in] data input data col-major (length = nrows * ncols) - * @param[out] bins the output bins col-major (length = nbins * ncols) - * @param[in] binner the operation that computes the bin index of the input data - * - * @note signature of binner_op is `int func(value_t, IdxT);` - */ -template > -void histogram(raft::resources const& handle, - HistType type, - raft::device_matrix_view data, - raft::device_matrix_view bins, - binner_op binner = IdentityBinner()) -{ - RAFT_EXPECTS(std::is_integral_v && data.extent(0) <= std::numeric_limits::max(), - "Index type not supported"); - RAFT_EXPECTS(bins.extent(1) == data.extent(1), "Size mismatch"); - RAFT_EXPECTS(bins.is_exhaustive(), "bins must be contiguous"); - RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous"); - detail::histogram(type, - bins.data_handle(), - bins.extent(0), - data.data_handle(), - data.extent(0), - data.extent(1), - resource::get_cuda_stream(handle), - binner); -} - -/** @} */ // end group stats_histogram - -}; // end namespace stats -}; // namespace cuvs - -#endif diff --git a/cpp/include/cuvs/stats/homogeneity_score.cuh b/cpp/include/cuvs/stats/homogeneity_score.cuh deleted file mode 100644 index 5ae419de0..000000000 --- a/cpp/include/cuvs/stats/homogeneity_score.cuh +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __HOMOGENEITY_SCORE_H -#define __HOMOGENEITY_SCORE_H - -#pragma once - -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Function to calculate the homogeneity score between two clusters - * more info on mutual - * information - * @param truthClusterArray: the array of truth classes of type T - * @param predClusterArray: the array of predicted classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - */ -template -double homogeneity_score(const T* truthClusterArray, - const T* predClusterArray, - int size, - T lowerLabelRange, - T upperLabelRange, - cudaStream_t stream) -{ - return detail::homogeneity_score( - truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream); -} - -/** - * @defgroup stats_homogeneity_score Homogeneity Score - * @{ - */ - -/** - * @brief Function to calculate the homogeneity score between two clusters - * more info on mutual - * information - * - * @tparam value_t data type - * @tparam idx_t index type - * @param[in] handle the raft handle - * @param[in] truth_cluster_array: the array of truth classes of type value_t - * @param[in] pred_cluster_array: the array of predicted classes of type value_t - * @param[in] lower_label_range: the lower bound of the range of labels - * @param[in] upper_label_range: the upper bound of the range of labels - * @return the homogeneity score - */ -template -double homogeneity_score(raft::resources const& handle, - raft::device_vector_view truth_cluster_array, - raft::device_vector_view pred_cluster_array, - value_t lower_label_range, - value_t upper_label_range) -{ - RAFT_EXPECTS(truth_cluster_array.size() == pred_cluster_array.size(), "Size mismatch"); - RAFT_EXPECTS(truth_cluster_array.is_exhaustive(), "truth_cluster_array must be contiguous"); - RAFT_EXPECTS(pred_cluster_array.is_exhaustive(), "pred_cluster_array must be contiguous"); - return detail::homogeneity_score(truth_cluster_array.data_handle(), - pred_cluster_array.data_handle(), - truth_cluster_array.extent(0), - lower_label_range, - upper_label_range, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_homogeneity_score - -}; // end namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/information_criterion.cuh b/cpp/include/cuvs/stats/information_criterion.cuh deleted file mode 100644 index 682a68f3f..000000000 --- a/cpp/include/cuvs/stats/information_criterion.cuh +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file information_criterion.cuh - * @brief These information criteria are used to evaluate the quality of models - * by balancing the quality of the fit and the number of parameters. - * - * See: - * - AIC: https://en.wikipedia.org/wiki/Akaike_information_criterion - * - AICc: https://en.wikipedia.org/wiki/Akaike_information_criterion#AICc - * - BIC: https://en.wikipedia.org/wiki/Bayesian_information_criterion - */ - -#ifndef __INFORMATION_CRIT_H -#define __INFORMATION_CRIT_H - -#pragma once - -#include -#include -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * Compute the given type of information criterion - * - * @note: it is safe to do the computation in-place (i.e give same pointer - * as input and output) - * - * @param[out] d_ic Information criterion to be returned for each - * series (device) - * @param[in] d_loglikelihood Log-likelihood for each series (device) - * @param[in] ic_type Type of criterion to compute. See IC_Type - * @param[in] n_params Number of parameters in the model - * @param[in] batch_size Number of series in the batch - * @param[in] n_samples Number of samples in each series - * @param[in] stream CUDA stream - */ -template -void information_criterion_batched(ScalarT* d_ic, - const ScalarT* d_loglikelihood, - IC_Type ic_type, - IdxT n_params, - IdxT batch_size, - IdxT n_samples, - cudaStream_t stream) -{ - batched::detail::information_criterion( - d_ic, d_loglikelihood, ic_type, n_params, batch_size, n_samples, stream); -} - -/** - * @defgroup stats_information_criterion Information Criterion - * @{ - */ - -/** - * Compute the given type of information criterion - * - * @note: it is safe to do the computation in-place (i.e give same pointer - * as input and output) - * See: - * - AIC: https://en.wikipedia.org/wiki/Akaike_information_criterion - * - AICc: https://en.wikipedia.org/wiki/Akaike_information_criterion#AICc - * - BIC: https://en.wikipedia.org/wiki/Bayesian_information_criterion - * - * @tparam value_t data type - * @tparam idx_t index type - * @param[in] handle the raft handle - * @param[in] d_loglikelihood Log-likelihood for each series (device) length: batch_size - * @param[out] d_ic Information criterion to be returned for each - * series (device) length: batch_size - * @param[in] ic_type Type of criterion to compute. See IC_Type - * @param[in] n_params Number of parameters in the model - * @param[in] n_samples Number of samples in each series - */ -template -void information_criterion_batched(raft::resources const& handle, - raft::device_vector_view d_loglikelihood, - raft::device_vector_view d_ic, - IC_Type ic_type, - idx_t n_params, - idx_t n_samples) -{ - RAFT_EXPECTS(d_ic.size() == d_loglikelihood.size(), "Size mismatch"); - RAFT_EXPECTS(d_ic.is_exhaustive(), "d_ic must be contiguous"); - RAFT_EXPECTS(d_loglikelihood.is_exhaustive(), "d_loglikelihood must be contiguous"); - batched::detail::information_criterion(d_ic.data_handle(), - d_loglikelihood.data_handle(), - ic_type, - n_params, - d_ic.extent(0), - n_samples, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_information_criterion - -} // namespace stats -} // namespace cuvs -#endif diff --git a/cpp/include/cuvs/stats/kl_divergence.cuh b/cpp/include/cuvs/stats/kl_divergence.cuh deleted file mode 100644 index 1aae77eaf..000000000 --- a/cpp/include/cuvs/stats/kl_divergence.cuh +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __KL_DIVERGENCE_H -#define __KL_DIVERGENCE_H - -#pragma once - -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Function to calculate KL Divergence - * more info on KL - * Divergence - * - * @tparam DataT: Data type of the input array - * @param modelPDF: the model array of probability density functions of type DataT - * @param candidatePDF: the candidate array of probability density functions of type DataT - * @param size: the size of the data points of type int - * @param stream: the cudaStream object - */ -template -DataT kl_divergence(const DataT* modelPDF, const DataT* candidatePDF, int size, cudaStream_t stream) -{ - return detail::kl_divergence(modelPDF, candidatePDF, size, stream); -} - -/** - * @defgroup kl_divergence Kullback-Leibler Divergence - * @{ - */ - -/** - * @brief Function to calculate KL Divergence - * more info on KL - * Divergence - * - * @tparam value_t: Data type of the input array - * @tparam idx_t index type - * @param[in] handle the raft handle - * @param[in] modelPDF: the model array of probability density functions of type value_t - * @param[in] candidatePDF: the candidate array of probability density functions of type value_t - * @return the KL Divergence value - */ -template -value_t kl_divergence(raft::resources const& handle, - raft::device_vector_view modelPDF, - raft::device_vector_view candidatePDF) -{ - RAFT_EXPECTS(modelPDF.size() == candidatePDF.size(), "Size mismatch"); - RAFT_EXPECTS(modelPDF.is_exhaustive(), "modelPDF must be contiguous"); - RAFT_EXPECTS(candidatePDF.is_exhaustive(), "candidatePDF must be contiguous"); - return detail::kl_divergence(modelPDF.data_handle(), - candidatePDF.data_handle(), - modelPDF.extent(0), - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group kl_divergence - -}; // end namespace stats -}; // namespace cuvs - -#endif diff --git a/cpp/include/cuvs/stats/mean.cuh b/cpp/include/cuvs/stats/mean.cuh deleted file mode 100644 index 4b66e85dc..000000000 --- a/cpp/include/cuvs/stats/mean.cuh +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __MEAN_H -#define __MEAN_H - -#pragma once - -#include -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Compute mean of the input matrix - * - * Mean operation is assumed to be performed on a given column. - * - * @tparam Type: the data type - * @tparam IdxType Integer type used to for addressing - * @param mu: the output mean vector - * @param data: the input matrix - * @param D: number of columns of data - * @param N: number of rows of data - * @param sample: whether to evaluate sample mean or not. In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - * @param rowMajor: whether the input data is row or col major - * @param stream: cuda stream - */ -template -void mean( - Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) -{ - detail::mean(mu, data, D, N, sample, rowMajor, stream); -} - -/** - * @defgroup stats_mean Mean - * @{ - */ - -/** - * @brief Compute mean of the input matrix - * - * Mean operation is assumed to be performed on a given column. - * - * @tparam value_t the data type - * @tparam idx_t index type - * @tparam layout_t Layout type of the input matrix. - * @param[in] handle the raft handle - * @param[in] data: the input matrix - * @param[out] mu: the output mean vector - * @param[in] sample: whether to evaluate sample mean or not. In other words, whether - * to normalize the output using N-1 or N, for true or false, respectively - */ -template -void mean(raft::resources const& handle, - raft::device_matrix_view data, - raft::device_vector_view mu, - bool sample) -{ - static_assert( - std::is_same_v || std::is_same_v, - "Data layout not supported"); - RAFT_EXPECTS(data.extent(1) == mu.extent(0), "Size mismatch between data and mu"); - RAFT_EXPECTS(mu.is_exhaustive(), "mu must be contiguous"); - RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous"); - detail::mean(mu.data_handle(), - data.data_handle(), - data.extent(1), - data.extent(0), - sample, - std::is_same_v, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_mean - -}; // namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/mean_center.cuh b/cpp/include/cuvs/stats/mean_center.cuh deleted file mode 100644 index d4ddb9cf0..000000000 --- a/cpp/include/cuvs/stats/mean_center.cuh +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __MEAN_CENTER_H -#define __MEAN_CENTER_H - -#pragma once - -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Center the input matrix wrt its mean - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads per block of the cuda kernel launched - * @param out the output mean-centered matrix - * @param data input matrix - * @param mu the mean vector - * @param D number of columns of data - * @param N number of rows of data - * @param rowMajor whether input is row or col major - * @param bcastAlongRows whether to broadcast vector along rows or columns - * @param stream cuda stream where to launch work - */ -template -void meanCenter(Type* out, - const Type* data, - const Type* mu, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - detail::meanCenter(out, data, mu, D, N, rowMajor, bcastAlongRows, stream); -} - -/** - * @brief Add the input matrix wrt its mean - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads per block of the cuda kernel launched - * @param out the output mean-added matrix - * @param data input matrix - * @param mu the mean vector - * @param D number of columns of data - * @param N number of rows of data - * @param rowMajor whether input is row or col major - * @param bcastAlongRows whether to broadcast vector along rows or columns - * @param stream cuda stream where to launch work - */ -template -void meanAdd(Type* out, - const Type* data, - const Type* mu, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - detail::meanAdd(out, data, mu, D, N, rowMajor, bcastAlongRows, stream); -} - -/** - * @defgroup stats_mean_center Mean Center - * @{ - */ - -/** - * @brief Center the input matrix wrt its mean - * @tparam value_t the data type - * @tparam idx_t index type - * @tparam layout_t Layout type of the input matrix. - * @param[in] handle the raft handle - * @param[in] data input matrix of size nrows * ncols - * @param[in] mu the mean vector of size ncols if bcast_along_rows else nrows - * @param[out] out the output mean-centered matrix - * @param[in] bcast_along_rows whether to broadcast vector along rows or columns - */ -template -void mean_center(raft::resources const& handle, - raft::device_matrix_view data, - raft::device_vector_view mu, - raft::device_matrix_view out, - bool bcast_along_rows) -{ - static_assert( - std::is_same_v || std::is_same_v, - "Data layout not supported"); - auto mean_vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); - RAFT_EXPECTS(out.extents() == data.extents(), "Size mismatch"); - RAFT_EXPECTS(mean_vec_size == mu.extent(0), "Size mismatch between data and mu"); - RAFT_EXPECTS(out.is_exhaustive(), "out must be contiguous"); - RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous"); - detail::meanCenter(out.data_handle(), - data.data_handle(), - mu.data_handle(), - data.extent(1), - data.extent(0), - std::is_same_v, - bcast_along_rows, - resource::get_cuda_stream(handle)); -} - -/** - * @brief Add the input matrix wrt its mean - * @tparam Type the data type - * @tparam idx_t index type - * @tparam layout_t Layout type of the input matrix. - * @tparam TPB threads per block of the cuda kernel launched - * @param[in] handle the raft handle - * @param[in] data input matrix of size nrows * ncols - * @param[in] mu the mean vector of size ncols if bcast_along_rows else nrows - * @param[out] out the output mean-centered matrix - * @param[in] bcast_along_rows whether to broadcast vector along rows or columns - */ -template -void mean_add(raft::resources const& handle, - raft::device_matrix_view data, - raft::device_vector_view mu, - raft::device_matrix_view out, - bool bcast_along_rows) -{ - static_assert( - std::is_same_v || std::is_same_v, - "Data layout not supported"); - auto mean_vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); - RAFT_EXPECTS(out.extents() == data.extents(), "Size mismatch"); - RAFT_EXPECTS(mean_vec_size == mu.extent(0), "Size mismatch between data and mu"); - RAFT_EXPECTS(out.is_exhaustive(), "out must be contiguous"); - RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous"); - detail::meanAdd(out.data_handle(), - data.data_handle(), - mu.data_handle(), - data.extent(1), - data.extent(0), - std::is_same_v, - bcast_along_rows, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_mean_center - -}; // end namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/meanvar.cuh b/cpp/include/cuvs/stats/meanvar.cuh deleted file mode 100644 index 5c27a6caf..000000000 --- a/cpp/include/cuvs/stats/meanvar.cuh +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __MEANVAR_H -#define __MEANVAR_H - -#pragma once - -#include -#include -#include - -namespace raft::stats { - -/** - * @brief Compute mean and variance for each column of a given matrix. - * - * The operation is performed in a single sweep. Consider using it when you need to compute - * both mean and variance, or when you need to compute variance but don't have the mean. - * It's almost twice faster than running `mean` and `vars` sequentially, because all three - * kernels are memory-bound. - * - * @tparam Type the data type - * @tparam IdxType Integer type used for addressing - * @param [out] mean the output mean vector of size D - * @param [out] var the output variance vector of size D - * @param [in] data the input matrix of size [N, D] - * @param [in] D number of columns of data - * @param [in] N number of rows of data - * @param [in] sample whether to evaluate sample variance or not. In other words, whether to - * normalize the variance using N-1 or N, for true or false respectively. - * @param [in] rowMajor whether the input data is row- or col-major, for true or false respectively. - * @param [in] stream - */ -template -void meanvar(Type* mean, - Type* var, - const Type* data, - IdxType D, - IdxType N, - bool sample, - bool rowMajor, - cudaStream_t stream) -{ - detail::meanvar(mean, var, data, D, N, sample, rowMajor, stream); -} - -/** - * @defgroup stats_mean_var Mean and Variance - * @{ - */ - -/** - * @brief Compute mean and variance for each column of a given matrix. - * - * The operation is performed in a single sweep. Consider using it when you need to compute - * both mean and variance, or when you need to compute variance but don't have the mean. - * It's almost twice faster than running `mean` and `vars` sequentially, because all three - * kernels are memory-bound. - * - * @tparam value_t the data type - * @tparam idx_t Integer type used for addressing - * @tparam layout_t Layout type of the input matrix. - * @param[in] handle the raft handle - * @param[in] data the input matrix of size [N, D] - * @param[out] mean the output mean vector of size D - * @param[out] var the output variance vector of size D - * @param[in] sample whether to evaluate sample variance or not. In other words, whether to - * normalize the variance using N-1 or N, for true or false respectively. - */ -template -void meanvar(raft::resources const& handle, - raft::device_matrix_view data, - raft::device_vector_view mean, - raft::device_vector_view var, - bool sample) -{ - static_assert( - std::is_same_v || std::is_same_v, - "Data layout not supported"); - RAFT_EXPECTS(data.extent(1) == var.extent(0), "Size mismatch between data and var"); - RAFT_EXPECTS(mean.size() == var.size(), "Size mismatch between mean and var"); - RAFT_EXPECTS(mean.is_exhaustive(), "mean must be contiguous"); - RAFT_EXPECTS(var.is_exhaustive(), "var must be contiguous"); - RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous"); - detail::meanvar(mean.data_handle(), - var.data_handle(), - data.data_handle(), - data.extent(1), - data.extent(0), - sample, - std::is_same_v, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_mean_var - -}; // namespace raft::stats - -#endif diff --git a/cpp/include/cuvs/stats/minmax.cuh b/cpp/include/cuvs/stats/minmax.cuh deleted file mode 100644 index 9b63954e4..000000000 --- a/cpp/include/cuvs/stats/minmax.cuh +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __MINMAX_H -#define __MINMAX_H - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Computes min/max across every column of the input matrix, as well as - * optionally allow to subsample based on the given row/col ID mapping vectors - * - * @tparam T the data type - * @tparam TPB number of threads per block - * @param data input data - * @param rowids actual row ID mappings. It is of length nrows. If you want to - * skip this index lookup entirely, pass nullptr - * @param colids actual col ID mappings. It is of length ncols. If you want to - * skip this index lookup entirely, pass nullptr - * @param nrows number of rows of data to be worked upon. The actual rows of the - * input "data" can be bigger than this! - * @param ncols number of cols of data to be worked upon. The actual cols of the - * input "data" can be bigger than this! - * @param row_stride stride (in number of elements) between 2 adjacent columns - * @param globalmin final col-wise global minimum (size = ncols) - * @param globalmax final col-wise global maximum (size = ncols) - * @param sampledcols output sampled data. Pass nullptr if you don't need this - * @param stream cuda stream - * @note This method makes the following assumptions: - * 1. input and output matrices are assumed to be col-major - * 2. ncols is small enough to fit the whole of min/max values across all cols - * in shared memory - */ -template -void minmax(const T* data, - const unsigned* rowids, - const unsigned* colids, - int nrows, - int ncols, - int row_stride, - T* globalmin, - T* globalmax, - T* sampledcols, - cudaStream_t stream) -{ - detail::minmax( - data, rowids, colids, nrows, ncols, row_stride, globalmin, globalmax, sampledcols, stream); -} - -/** - * @defgroup stats_minmax Min/Max - * @{ - */ - -/** - * @brief Computes min/max across every column of the input matrix, as well as - * optionally allow to subsample based on the given row/col ID mapping vectors - * - * @tparam value_t Data type of input matrix element. - * @tparam idx_t Index type of matrix extent. - * @param[in] handle the raft handle - * @param[in] data input data col-major of size [nrows, ncols], unless rowids or - * colids length is smaller - * @param[in] rowids optional row ID mappings of length nrows. If you want to - * skip this index lookup entirely, pass std::nullopt - * @param[in] colids optional col ID mappings of length ncols. If you want to - * skip this index lookup entirely, pass std::nullopt - * @param[out] globalmin final col-wise global minimum (size = ncols) - * @param[out] globalmax final col-wise global maximum (size = ncols) - * @param[out] sampledcols output sampled data. Pass std::nullopt if you don't need this - * @note This method makes the following assumptions: - * 1. input and output matrices are assumed to be col-major - * 2. ncols is small enough to fit the whole of min/max values across all cols - * in shared memory - */ -template -void minmax(raft::resources const& handle, - raft::device_matrix_view data, - std::optional> rowids, - std::optional> colids, - raft::device_vector_view globalmin, - raft::device_vector_view globalmax, - std::optional> sampledcols) -{ - const unsigned* rowids_ptr = nullptr; - const unsigned* colids_ptr = nullptr; - value_t* sampledcols_ptr = nullptr; - auto nrows = data.extent(0); - auto ncols = data.extent(1); - auto row_stride = data.stride(1); - if (rowids.has_value()) { - rowids_ptr = rowids.value().data_handle(); - RAFT_EXPECTS(rowids.value().extent(0) <= nrows, "Rowids size is greater than nrows"); - nrows = rowids.value().extent(0); - } - if (colids.has_value()) { - colids_ptr = colids.value().data_handle(); - RAFT_EXPECTS(colids.value().extent(0) <= ncols, "Colids size is greater than ncols"); - ncols = colids.value().extent(0); - } - if (sampledcols.has_value()) { sampledcols_ptr = sampledcols.value().data_handle(); } - RAFT_EXPECTS(globalmin.extent(0) == ncols, "Size mismatch between globalmin and ncols"); - RAFT_EXPECTS(globalmax.extent(0) == ncols, "Size mismatch between globalmax and ncols"); - detail::minmax(data.data_handle(), - rowids_ptr, - colids_ptr, - nrows, - ncols, - row_stride, - globalmin.data_handle(), - globalmax.data_handle(), - sampledcols_ptr, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_minmax - -}; // namespace stats -}; // namespace cuvs -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/mutual_info_score.cuh b/cpp/include/cuvs/stats/mutual_info_score.cuh deleted file mode 100644 index 8573857b9..000000000 --- a/cpp/include/cuvs/stats/mutual_info_score.cuh +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __MUTUAL_INFO_SCORE_H -#define __MUTUAL_INFO_SCORE_H - -#pragma once - -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Function to calculate the mutual information between two clusters - * more info on mutual information - * @param firstClusterArray: the array of classes of type T - * @param secondClusterArray: the array of classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - */ -template -double mutual_info_score(const T* firstClusterArray, - const T* secondClusterArray, - int size, - T lowerLabelRange, - T upperLabelRange, - cudaStream_t stream) -{ - return detail::mutual_info_score( - firstClusterArray, secondClusterArray, size, lowerLabelRange, upperLabelRange, stream); -} - -/** - * @defgroup stats_mutual_info Mutual Information - * @{ - */ - -/** - * @brief Function to calculate the mutual information between two clusters - * more info on mutual information - * @tparam value_t the data type - * @tparam idx_t index type - * @param[in] handle the raft handle - * @param[in] first_cluster_array: the array of classes of type value_t - * @param[in] second_cluster_array: the array of classes of type value_t - * @param[in] lower_label_range: the lower bound of the range of labels - * @param[in] upper_label_range: the upper bound of the range of labels - * @return the mutual information score - */ -template -double mutual_info_score(raft::resources const& handle, - raft::device_vector_view first_cluster_array, - raft::device_vector_view second_cluster_array, - value_t lower_label_range, - value_t upper_label_range) -{ - RAFT_EXPECTS(first_cluster_array.extent(0) == second_cluster_array.extent(0), - "Size mismatch between first_cluster_array and second_cluster_array"); - RAFT_EXPECTS(first_cluster_array.is_exhaustive(), "first_cluster_array must be contiguous"); - RAFT_EXPECTS(second_cluster_array.is_exhaustive(), "second_cluster_array must be contiguous"); - return detail::mutual_info_score(first_cluster_array.data_handle(), - second_cluster_array.data_handle(), - first_cluster_array.extent(0), - lower_label_range, - upper_label_range, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_mutual_info - -}; // end namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/neighborhood_recall.cuh b/cpp/include/cuvs/stats/neighborhood_recall.cuh deleted file mode 100644 index e082bc87b..000000000 --- a/cpp/include/cuvs/stats/neighborhood_recall.cuh +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "detail/neighborhood_recall.cuh" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace raft::stats { - -/** - * @defgroup stats_neighborhood_recall Neighborhood Recall Score - * @{ - */ - -/** - * @brief Calculate Neighborhood Recall score on the device for indices, distances computed by any - * Nearest Neighbors Algorithm against reference indices, distances. Recall score is calculated by - * comparing the total number of matching indices and dividing that value by the total size of the - * indices matrix of dimensions (D, k). If distance matrices are provided, then non-matching indices - * could be considered a match if abs(dist, ref_dist) < eps. - * - * Usage example: - * @code{.cpp} - * raft::device_resources res; - * // assume D rows and N column dataset - * auto k = 64; - * auto indices = raft::make_device_matrix(res, D, k); - * auto distances = raft::make_device_matrix(res, D, k); - * // run ANN algorithm of choice - * - * auto ref_indices = raft::make_device_matrix(res, D, k); - * auto ref_distances = raft::make_device_matrix(res, D, k); - * // run brute-force KNN for reference - * - * auto scalar = 0.0f; - * auto recall_score = raft::make_device_scalar(res, scalar); - * - * raft::stats::neighborhood_recall(res, - raft::make_const_mdspan(indices.view()), - raft::make_const_mdspan(ref_indices.view()), - recall_score.view(), - raft::make_const_mdspan(distances.view()), - raft::make_const_mdspan(ref_distances.view())); - * @endcode - * - * @tparam IndicesValueType data-type of the indices - * @tparam IndexType data-type to index all matrices - * @tparam ScalarType data-type to store recall score - * @tparam DistanceValueType data-type of the distances - * @param res raft::resources object to manage resources - * @param[in] indices raft::device_matrix_view indices of neighbors - * @param[in] ref_indices raft::device_matrix_view reference indices of neighbors - * @param[out] recall_score raft::device_scalar_view output recall score - * @param[in] distances (optional) raft::device_matrix_view distances of neighbors - * @param[in] ref_distances (optional) raft::device_matrix_view reference distances of neighbors - * @param[in] eps (optional, default = 0.001) value within which distances are considered matching - */ -template -void neighborhood_recall( - raft::resources const& res, - raft::device_matrix_view indices, - raft::device_matrix_view ref_indices, - raft::device_scalar_view recall_score, - std::optional> - distances = std::nullopt, - std::optional> - ref_distances = std::nullopt, - std::optional> eps = std::nullopt) -{ - RAFT_EXPECTS(indices.extent(0) == ref_indices.extent(0), - "The number of rows in indices and reference indices should be equal"); - RAFT_EXPECTS(indices.extent(1) == ref_indices.extent(1), - "The number of columns in indices and reference indices should be equal"); - - if (distances.has_value() or ref_distances.has_value()) { - RAFT_EXPECTS(distances.has_value() and ref_distances.has_value(), - "Both distances and reference distances should have values"); - - RAFT_EXPECTS(distances.value().extent(0) == ref_distances.value().extent(0), - "The number of rows in distances and reference distances should be equal"); - RAFT_EXPECTS(distances.value().extent(1) == ref_distances.value().extent(1), - "The number of columns in indices and reference indices should be equal"); - - RAFT_EXPECTS(indices.extent(0) == distances.value().extent(0), - "The number of rows in indices and distances should be equal"); - RAFT_EXPECTS(indices.extent(1) == distances.value().extent(1), - "The number of columns in indices and distances should be equal"); - } - - DistanceValueType eps_val = 0.001; - if (eps.has_value()) { eps_val = *eps.value().data_handle(); } - - detail::neighborhood_recall( - res, indices, ref_indices, distances, ref_distances, recall_score, eps_val); -} - -/** - * @brief Calculate Neighborhood Recall score on the host for indices, distances computed by any - * Nearest Neighbors Algorithm against reference indices, distances. Recall score is calculated by - * comparing the total number of matching indices and dividing that value by the total size of the - * indices matrix of dimensions (D, k). If distance matrices are provided, then non-matching indices - * could be considered a match if abs(dist, ref_dist) < eps. - * - * Usage example: - * @code{.cpp} - * raft::device_resources res; - * // assume D rows and N column dataset - * auto k = 64; - * auto indices = raft::make_device_matrix(res, D, k); - * auto distances = raft::make_device_matrix(res, D, k); - * // run ANN algorithm of choice - * - * auto ref_indices = raft::make_device_matrix(res, D, k); - * auto ref_distances = raft::make_device_matrix(res, D, k); - * // run brute-force KNN for reference - * - * auto scalar = 0.0f; - * auto recall_score = raft::make_host_scalar(scalar); - * - * raft::stats::neighborhood_recall(res, - raft::make_const_mdspan(indices.view()), - raft::make_const_mdspan(ref_indices.view()), - recall_score.view(), - raft::make_const_mdspan(distances.view()), - raft::make_const_mdspan(ref_distances.view())); - * @endcode - * - * @tparam IndicesValueType data-type of the indices - * @tparam IndexType data-type to index all matrices - * @tparam ScalarType data-type to store recall score - * @tparam DistanceValueType data-type of the distances - * @param res raft::resources object to manage resources - * @param[in] indices raft::device_matrix_view indices of neighbors - * @param[in] ref_indices raft::device_matrix_view reference indices of neighbors - * @param[out] recall_score raft::host_scalar_view output recall score - * @param[in] distances (optional) raft::device_matrix_view distances of neighbors - * @param[in] ref_distances (optional) raft::device_matrix_view reference distances of neighbors - * @param[in] eps (optional, default = 0.001) value within which distances are considered matching - */ -template -void neighborhood_recall( - raft::resources const& res, - raft::device_matrix_view indices, - raft::device_matrix_view ref_indices, - raft::host_scalar_view recall_score, - std::optional> - distances = std::nullopt, - std::optional> - ref_distances = std::nullopt, - std::optional> eps = std::nullopt) -{ - auto recall_score_d = raft::make_device_scalar(res, *recall_score.data_handle()); - neighborhood_recall( - res, indices, ref_indices, recall_score_d.view(), distances, ref_distances, eps); - raft::update_host(recall_score.data_handle(), - recall_score_d.data_handle(), - 1, - raft::resource::get_cuda_stream(res)); - raft::resource::sync_stream(res); -} - -/** @} */ // end group stats_recall - -} // end namespace raft::stats diff --git a/cpp/include/cuvs/stats/r2_score.cuh b/cpp/include/cuvs/stats/r2_score.cuh deleted file mode 100644 index 109443cab..000000000 --- a/cpp/include/cuvs/stats/r2_score.cuh +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __R2_SCORE_H -#define __R2_SCORE_H - -#pragma once - -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * Calculates the "Coefficient of Determination" (R-Squared) score - * normalizing the sum of squared errors by the total sum of squares. - * - * This score indicates the proportionate amount of variation in an - * expected response variable is explained by the independent variables - * in a linear regression model. The larger the R-squared value, the - * more variability is explained by the linear regression model. - * - * @param y: Array of ground-truth response variables - * @param y_hat: Array of predicted response variables - * @param n: Number of elements in y and y_hat - * @param stream: cuda stream - * @return: The R-squared value. - */ -template -math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream) -{ - return detail::r2_score(y, y_hat, n, stream); -} - -/** - * @defgroup stats_r2_score Regression R2 Score - * @{ - */ - -/** - * Calculates the "Coefficient of Determination" (R-Squared) score - * normalizing the sum of squared errors by the total sum of squares. - * - * This score indicates the proportionate amount of variation in an - * expected response variable is explained by the independent variables - * in a linear regression model. The larger the R-squared value, the - * more variability is explained by the linear regression model. - * - * @tparam value_t the data type - * @tparam idx_t index type - * @param[in] handle the raft handle - * @param[in] y: Array of ground-truth response variables - * @param[in] y_hat: Array of predicted response variables - * @return: The R-squared value. - * @note The constness of y and y_hat is currently casted away. - */ -template -value_t r2_score(raft::resources const& handle, - raft::device_vector_view y, - raft::device_vector_view y_hat) -{ - RAFT_EXPECTS(y.extent(0) == y_hat.extent(0), "Size mismatch between y and y_hat"); - RAFT_EXPECTS(y.is_exhaustive(), "y must be contiguous"); - RAFT_EXPECTS(y_hat.is_exhaustive(), "y_hat must be contiguous"); - - // TODO: Change the underlying implementation to remove the need to const_cast - return detail::r2_score(const_cast(y.data_handle()), - const_cast(y_hat.data_handle()), - y.extent(0), - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_r2_score - -} // namespace stats -} // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/rand_index.cuh b/cpp/include/cuvs/stats/rand_index.cuh deleted file mode 100644 index c99f636cd..000000000 --- a/cpp/include/cuvs/stats/rand_index.cuh +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __RAND_INDEX_H -#define __RAND_INDEX_H - -#pragma once - -#include -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Function to calculate RandIndex - * more info on rand index - * @param firstClusterArray: the array of classes of type T - * @param secondClusterArray: the array of classes of type T - * @param size: the size of the data points of type uint64_t - * @param stream: the cudaStream object - */ -template -double rand_index(T* firstClusterArray, T* secondClusterArray, uint64_t size, cudaStream_t stream) -{ - return detail::compute_rand_index(firstClusterArray, secondClusterArray, size, stream); -} - -/** - * @defgroup stats_rand_index Rand Index - * @{ - */ - -/** - * @brief Function to calculate RandIndex - * more info on rand index - * @tparam value_t the data type - * @tparam idx_t index type - * @param[in] handle the raft handle - * @param[in] first_cluster_array: the array of classes of type value_t - * @param[in] second_cluster_array: the array of classes of type value_t - * @return: The RandIndex value. - */ -template -double rand_index(raft::resources const& handle, - raft::device_vector_view first_cluster_array, - raft::device_vector_view second_cluster_array) -{ - RAFT_EXPECTS(first_cluster_array.extent(0) == second_cluster_array.extent(0), - "Size mismatch between first_cluster_array and second_cluster_array"); - RAFT_EXPECTS(first_cluster_array.is_exhaustive(), "first_cluster_array must be contiguous"); - RAFT_EXPECTS(second_cluster_array.is_exhaustive(), "second_cluster_array must be contiguous"); - return detail::compute_rand_index(first_cluster_array.data_handle(), - second_cluster_array.data_handle(), - second_cluster_array.extent(0), - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_rand_index - -}; // end namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/regression_metrics.cuh b/cpp/include/cuvs/stats/regression_metrics.cuh deleted file mode 100644 index 5c477424e..000000000 --- a/cpp/include/cuvs/stats/regression_metrics.cuh +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __REGRESSION_METRICS_H -#define __REGRESSION_METRICS_H - -#pragma once - -#include -#include -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Compute regression metrics mean absolute error, mean squared error, median absolute error - * @tparam T: data type for predictions (e.g., float or double for regression). - * @param[in] predictions: array of predictions (GPU pointer). - * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer). - * @param[in] n: number of elements in each of predictions, ref_predictions. Should be > 0. - * @param[in] stream: cuda stream. - * @param[out] mean_abs_error: Mean Absolute Error. Sum over n of (|predictions[i] - - * ref_predictions[i]|) / n. - * @param[out] mean_squared_error: Mean Squared Error. Sum over n of ((predictions[i] - - * ref_predictions[i])^2) / n. - * @param[out] median_abs_error: Median Absolute Error. Median of |predictions[i] - - * ref_predictions[i]| for i in [0, n). - */ -template -void regression_metrics(const T* predictions, - const T* ref_predictions, - int n, - cudaStream_t stream, - double& mean_abs_error, - double& mean_squared_error, - double& median_abs_error) -{ - detail::regression_metrics( - predictions, ref_predictions, n, stream, mean_abs_error, mean_squared_error, median_abs_error); -} - -/** - * @defgroup stats_regression_metrics Regression Metrics - * @{ - */ - -/** - * @brief Compute regression metrics mean absolute error, mean squared error, median absolute error - * @tparam value_t the data type for predictions (e.g., float or double for regression). - * @tparam idx_t index type - * @param[in] handle the raft handle - * @param[in] predictions: array of predictions. - * @param[in] ref_predictions: array of reference (ground-truth) predictions. - * @param[out] mean_abs_error: Mean Absolute Error. Sum over n of (|predictions[i] - - * ref_predictions[i]|) / n. - * @param[out] mean_squared_error: Mean Squared Error. Sum over n of ((predictions[i] - - * ref_predictions[i])^2) / n. - * @param[out] median_abs_error: Median Absolute Error. Median of |predictions[i] - - * ref_predictions[i]| for i in [0, n). - */ -template -void regression_metrics(raft::resources const& handle, - raft::device_vector_view predictions, - raft::device_vector_view ref_predictions, - raft::host_scalar_view mean_abs_error, - raft::host_scalar_view mean_squared_error, - raft::host_scalar_view median_abs_error) -{ - RAFT_EXPECTS(predictions.extent(0) == ref_predictions.extent(0), - "Size mismatch between predictions and ref_predictions"); - RAFT_EXPECTS(predictions.is_exhaustive(), "predictions must be contiguous"); - RAFT_EXPECTS(ref_predictions.is_exhaustive(), "ref_predictions must be contiguous"); - RAFT_EXPECTS(mean_abs_error.data_handle() != nullptr, "mean_abs_error view must not be empty"); - RAFT_EXPECTS(mean_squared_error.data_handle() != nullptr, - "mean_squared_error view must not be empty"); - RAFT_EXPECTS(median_abs_error.data_handle() != nullptr, - "median_abs_error view must not be empty"); - detail::regression_metrics(predictions.data_handle(), - ref_predictions.data_handle(), - predictions.extent(0), - resource::get_cuda_stream(handle), - *mean_abs_error.data_handle(), - *mean_squared_error.data_handle(), - *median_abs_error.data_handle()); -} - -/** @} */ // end group stats_regression_metrics - -} // namespace stats -} // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/silhouette_score.cuh b/cpp/include/cuvs/stats/silhouette_score.cuh deleted file mode 100644 index 78cdf66d2..000000000 --- a/cpp/include/cuvs/stats/silhouette_score.cuh +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __SILHOUETTE_SCORE_H -#define __SILHOUETTE_SCORE_H - -#pragma once - -#include -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief main function that returns the average silhouette score for a given set of data and its - * clusterings - * @tparam DataT: type of the data samples - * @tparam LabelT: type of the labels - * @param handle: raft handle for managing expensive resources - * @param X_in: pointer to the input Data samples array (nRows x nCols) - * @param nRows: number of data samples - * @param nCols: number of features - * @param labels: the pointer to the array containing labels for every data sample (1 x nRows) - * @param nLabels: number of Labels - * @param silhouette_scorePerSample: pointer to the array that is optionally taken in as input and - * is populated with the silhouette score for every sample (1 x nRows) - * @param stream: the cuda stream where to launch this kernel - * @param metric: the numerical value that maps to the type of distance metric to be used in the - * calculations - */ -template -DataT silhouette_score( - raft::resources const& handle, - DataT* X_in, - int nRows, - int nCols, - LabelT* labels, - int nLabels, - DataT* silhouette_scorePerSample, - cudaStream_t stream, - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded) -{ - return detail::silhouette_score( - handle, X_in, nRows, nCols, labels, nLabels, silhouette_scorePerSample, stream, metric); -} - -template -value_t silhouette_score_batched( - raft::resources const& handle, - value_t* X, - value_idx n_rows, - value_idx n_cols, - label_idx* y, - label_idx n_labels, - value_t* scores, - value_idx chunk, - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded) -{ - return batched::detail::silhouette_score( - handle, X, n_rows, n_cols, y, n_labels, scores, chunk, metric); -} - -/** - * @defgroup stats_silhouette_score Silhouette Score - * @{ - */ - -/** - * @brief main function that returns the average silhouette score for a given set of data and its - * clusterings - * @tparam value_t: type of the data samples - * @tparam label_t: type of the labels - * @tparam idx_t index type - * @param[in] handle: raft handle for managing expensive resources - * @param[in] X_in: input matrix Data in row-major format (nRows x nCols) - * @param[in] labels: the pointer to the array containing labels for every data sample (length: - * nRows) - * @param[out] silhouette_score_per_sample: optional array populated with the silhouette score - * for every sample (length: nRows) - * @param[in] n_unique_labels: number of unique labels in the labels array - * @param[in] metric: the numerical value that maps to the type of distance metric to be used in - * the calculations - * @return: The silhouette score. - */ -template -value_t silhouette_score( - raft::resources const& handle, - raft::device_matrix_view X_in, - raft::device_vector_view labels, - std::optional> silhouette_score_per_sample, - idx_t n_unique_labels, - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded) -{ - RAFT_EXPECTS(labels.extent(0) == X_in.extent(0), "Size mismatch between labels and data"); - - value_t* silhouette_score_per_sample_ptr = nullptr; - if (silhouette_score_per_sample.has_value()) { - silhouette_score_per_sample_ptr = silhouette_score_per_sample.value().data_handle(); - RAFT_EXPECTS(silhouette_score_per_sample.value().extent(0) == X_in.extent(0), - "Size mismatch between silhouette_score_per_sample and data"); - } - return detail::silhouette_score(handle, - X_in.data_handle(), - X_in.extent(0), - X_in.extent(1), - labels.data_handle(), - n_unique_labels, - silhouette_score_per_sample_ptr, - resource::get_cuda_stream(handle), - metric); -} - -/** - * @brief function that returns the average silhouette score for a given set of data and its - * clusterings - * @tparam value_t: type of the data samples - * @tparam label_t: type of the labels - * @tparam idx_t index type - * @param[in] handle: raft handle for managing expensive resources - * @param[in] X: input matrix Data in row-major format (nRows x nCols) - * @param[in] labels: the pointer to the array containing labels for every data sample (length: - * nRows) - * @param[out] silhouette_score_per_sample: optional array populated with the silhouette score - * for every sample (length: nRows) - * @param[in] n_unique_labels: number of unique labels in the labels array - * @param[in] batch_size: number of samples per batch - * @param[in] metric: the numerical value that maps to the type of distance metric to be used in - * the calculations - * @return: The silhouette score. - */ -template -value_t silhouette_score_batched( - raft::resources const& handle, - raft::device_matrix_view X, - raft::device_vector_view labels, - std::optional> silhouette_score_per_sample, - idx_t n_unique_labels, - idx_t batch_size, - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded) -{ - static_assert(std::is_integral_v, - "silhouette_score_batched: The index type " - "of each mdspan argument must be an integral type."); - static_assert(std::is_integral_v, - "silhouette_score_batched: The label type must be an integral type."); - RAFT_EXPECTS(labels.extent(0) == X.extent(0), "Size mismatch between labels and data"); - - value_t* scores_ptr = nullptr; - if (silhouette_score_per_sample.has_value()) { - scores_ptr = silhouette_score_per_sample.value().data_handle(); - RAFT_EXPECTS(silhouette_score_per_sample.value().extent(0) == X.extent(0), - "Size mismatch between silhouette_score_per_sample and data"); - } - return batched::detail::silhouette_score(handle, - X.data_handle(), - X.extent(0), - X.extent(1), - labels.data_handle(), - n_unique_labels, - scores_ptr, - batch_size, - metric); -} - -/** @} */ // end group stats_silhouette_score - -/** - * @brief Overload of `silhouette_score` to help the - * compiler find the above overload, in case users pass in - * `std::nullopt` for the optional arguments. - * - * Please see above for documentation of `silhouette_score`. - */ -template -value_t silhouette_score( - raft::resources const& handle, - raft::device_matrix_view X_in, - raft::device_vector_view labels, - std::nullopt_t silhouette_score_per_sample, - idx_t n_unique_labels, - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded) -{ - std::optional> opt_scores = silhouette_score_per_sample; - return silhouette_score(handle, X_in, labels, opt_scores, n_unique_labels, metric); -} - -/** - * @brief Overload of `silhouette_score_batched` to help the - * compiler find the above overload, in case users pass in - * `std::nullopt` for the optional arguments. - * - * Please see above for documentation of `silhouette_score_batched`. - */ -template -value_t silhouette_score_batched( - raft::resources const& handle, - raft::device_matrix_view X, - raft::device_vector_view labels, - std::nullopt_t silhouette_score_per_sample, - idx_t n_unique_labels, - idx_t batch_size, - cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded) -{ - std::optional> opt_scores = silhouette_score_per_sample; - return silhouette_score_batched( - handle, X, labels, opt_scores, n_unique_labels, batch_size, metric); -} -}; // namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/specializations.cuh b/cpp/include/cuvs/stats/specializations.cuh deleted file mode 100644 index 9588a7f32..000000000 --- a/cpp/include/cuvs/stats/specializations.cuh +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#pragma message( \ - __FILE__ \ - " is deprecated and will be removed." \ - " Including specializations is not necessary any more." \ - " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html") diff --git a/cpp/include/cuvs/stats/stats_types.hpp b/cpp/include/cuvs/stats/stats_types.hpp deleted file mode 100644 index 638ca75bc..000000000 --- a/cpp/include/cuvs/stats/stats_types.hpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace raft::stats { - -/** - * @ingroup stats_histogram - * @{ - */ - -/** - * @brief Types of support histogram implementations - */ -enum HistType { - /** shared mem atomics but with bins to be 1b int's */ - HistTypeSmemBits1 = 1, - /** shared mem atomics but with bins to be 2b int's */ - HistTypeSmemBits2 = 2, - /** shared mem atomics but with bins to be 4b int's */ - HistTypeSmemBits4 = 4, - /** shared mem atomics but with bins to ba 1B int's */ - HistTypeSmemBits8 = 8, - /** shared mem atomics but with bins to be 2B int's */ - HistTypeSmemBits16 = 16, - /** use only global atomics */ - HistTypeGmem, - /** uses shared mem atomics to reduce global traffic */ - HistTypeSmem, - /** - * uses shared mem atomics with match_any intrinsic to further reduce shared - * memory traffic. This can only be enabled on Volta and later architectures. - * If one tries to enable this for older arch's, it will fall back to - * `HistTypeSmem`. - * @note This is to be used only when the input dataset leads to a lot of - * repetitions in a given warp, else, this algo can be much slower than - * `HistTypeSmem`! - */ - HistTypeSmemMatchAny, - /** builds a hashmap of active bins in shared mem */ - HistTypeSmemHash, - /** decide at runtime the best algo for the given inputs */ - HistTypeAuto -}; - -/** @} */ - -/** - * @ingroup stats_information_criterion - * @{ - */ - -/** - * @brief Supported types of information criteria - */ -enum IC_Type { AIC, AICc, BIC }; - -/** @} */ - -}; // end namespace raft::stats diff --git a/cpp/include/cuvs/stats/stddev.cuh b/cpp/include/cuvs/stats/stddev.cuh deleted file mode 100644 index d67cc5775..000000000 --- a/cpp/include/cuvs/stats/stddev.cuh +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef __STDDEV_H -#define __STDDEV_H - -#pragma once - -#include -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Compute stddev of the input matrix - * - * Stddev operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param std the output stddev vector - * @param data the input matrix - * @param mu the mean vector - * @param D number of columns of data - * @param N number of rows of data - * @param sample whether to evaluate sample stddev or not. In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - * @param rowMajor whether the input data is row or col major - * @param stream cuda stream where to launch work - */ -template -void stddev(Type* std, - const Type* data, - const Type* mu, - IdxType D, - IdxType N, - bool sample, - bool rowMajor, - cudaStream_t stream) -{ - detail::stddev(std, data, mu, D, N, sample, rowMajor, stream); -} - -/** - * @brief Compute variance of the input matrix - * - * Variance operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param var the output stddev vector - * @param data the input matrix - * @param mu the mean vector - * @param D number of columns of data - * @param N number of rows of data - * @param sample whether to evaluate sample stddev or not. In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - * @param rowMajor whether the input data is row or col major - * @param stream cuda stream where to launch work - */ -template -void vars(Type* var, - const Type* data, - const Type* mu, - IdxType D, - IdxType N, - bool sample, - bool rowMajor, - cudaStream_t stream) -{ - detail::vars(var, data, mu, D, N, sample, rowMajor, stream); -} - -/** - * @defgroup stats_stddev Standard Deviation - * @{ - */ - -/** - * @brief Compute stddev of the input matrix - * - * Stddev operation is assumed to be performed on a given column. - * - * @tparam value_t the data type - * @tparam idx_t Integer type used to for addressing - * @tparam layout_t Layout type of the input matrix. - * @param[in] handle the raft handle - * @param[in] data the input matrix - * @param[in] mu the mean vector - * @param[out] std the output stddev vector - * @param[in] sample whether to evaluate sample stddev or not. In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - */ -template -void stddev(raft::resources const& handle, - raft::device_matrix_view data, - raft::device_vector_view mu, - raft::device_vector_view std, - bool sample) -{ - constexpr bool is_row_major = std::is_same_v; - constexpr bool is_col_major = std::is_same_v; - static_assert(is_row_major || is_col_major, - "stddev: Layout must be either " - "raft::row_major or raft::col_major (or one of their aliases)"); - RAFT_EXPECTS(mu.size() == std.size(), "Size mismatch between mu and std"); - RAFT_EXPECTS(mu.extent(0) == data.extent(1), "Size mismatch between data and mu"); - detail::stddev(std.data_handle(), - data.data_handle(), - mu.data_handle(), - data.extent(1), - data.extent(0), - sample, - is_row_major, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_stddev - -/** - * @defgroup stats_variance Variance - * @{ - */ - -/** - * @brief Compute variance of the input matrix - * - * Variance operation is assumed to be performed on a given column. - * - * @tparam value_t the data type - * @tparam idx_t Integer type used to for addressing - * @tparam layout_t Layout type of the input matrix. - * @param[in] handle the raft handle - * @param[in] data the input matrix - * @param[in] mu the mean vector - * @param[out] var the output stddev vector - * @param[in] sample whether to evaluate sample stddev or not. In other words, - * whether - * to normalize the output using N-1 or N, for true or false, respectively - */ -template -void vars(raft::resources const& handle, - raft::device_matrix_view data, - raft::device_vector_view mu, - raft::device_vector_view var, - bool sample) -{ - constexpr bool is_row_major = std::is_same_v; - constexpr bool is_col_major = std::is_same_v; - static_assert(is_row_major || is_col_major, - "vars: Layout must be either " - "raft::row_major or raft::col_major (or one of their aliases)"); - RAFT_EXPECTS(mu.size() == var.size(), "Size mismatch between mu and std"); - RAFT_EXPECTS(mu.extent(0) == data.extent(1), "Size mismatch between data and mu"); - detail::vars(var.data_handle(), - data.data_handle(), - mu.data_handle(), - data.extent(1), - data.extent(0), - sample, - is_row_major, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_variance - -}; // namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/sum.cuh b/cpp/include/cuvs/stats/sum.cuh deleted file mode 100644 index 6802da638..000000000 --- a/cpp/include/cuvs/stats/sum.cuh +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __SUM_H -#define __SUM_H - -#pragma once - -#include -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Compute sum of the input matrix - * - * Sum operation is assumed to be performed on a given column. - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param output the output mean vector - * @param input the input matrix - * @param D number of columns of data - * @param N number of rows of data - * @param rowMajor whether the input data is row or col major - * @param stream cuda stream where to launch work - */ -template -void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) -{ - detail::sum(output, input, D, N, rowMajor, stream); -} - -/** - * @defgroup stats_sum Sum - * @{ - */ - -/** - * @brief Compute sum of the input matrix - * - * Sum operation is assumed to be performed on a given column. - * - * @tparam value_t the data type - * @tparam idx_t Integer type used to for addressing - * @tparam layout_t Layout type of the input matrix. - * @param[in] handle the raft handle - * @param[in] input the input matrix - * @param[out] output the output mean vector - */ -template -void sum(raft::resources const& handle, - raft::device_matrix_view input, - raft::device_vector_view output) -{ - constexpr bool is_row_major = std::is_same_v; - constexpr bool is_col_major = std::is_same_v; - static_assert(is_row_major || is_col_major, - "sum: Layout must be either " - "raft::row_major or raft::col_major (or one of their aliases)"); - RAFT_EXPECTS(input.extent(1) == output.extent(0), "Size mismatch between input and output"); - detail::sum(output.data_handle(), - input.data_handle(), - input.extent(1), - input.extent(0), - is_row_major, - resource::get_cuda_stream(handle)); -} - -/** @} */ // end group stats_sum - -}; // end namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/trustworthiness_score.cuh b/cpp/include/cuvs/stats/trustworthiness_score.cuh deleted file mode 100644 index df427c256..000000000 --- a/cpp/include/cuvs/stats/trustworthiness_score.cuh +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __TRUSTWORTHINESS_SCORE_H -#define __TRUSTWORTHINESS_SCORE_H - -#pragma once -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Compute the trustworthiness score - * @param[in] h: raft handle - * @param[in] X: Data in original dimension - * @param[in] X_embedded: Data in target dimension (embedding) - * @param[in] n: Number of samples - * @param[in] m: Number of features in high/original dimension - * @param[in] d: Number of features in low/embedded dimension - * @param[in] n_neighbors Number of neighbors considered by trustworthiness score - * @param[in] batchSize Batch size - * @return[out] Trustworthiness score - */ -template -double trustworthiness_score(const raft::resources& h, - const math_t* X, - math_t* X_embedded, - int n, - int m, - int d, - int n_neighbors, - int batchSize = 512) -{ - return detail::trustworthiness_score( - h, X, X_embedded, n, m, d, n_neighbors, batchSize); -} - -/** - * @defgroup stats_trustworthiness Trustworthiness - * @{ - */ - -/** - * @brief Compute the trustworthiness score - * @tparam value_t the data type - * @tparam idx_t Integer type used to for addressing - * @param[in] handle the raft handle - * @param[in] X: Data in original dimension - * @param[in] X_embedded: Data in target dimension (embedding) - * @param[in] n_neighbors Number of neighbors considered by trustworthiness score - * @param[in] batch_size Batch size - * @return Trustworthiness score - * @note The constness of the data in X_embedded is currently casted away and the data is slightly - * modified. - */ -template -double trustworthiness_score( - raft::resources const& handle, - raft::device_matrix_view X, - raft::device_matrix_view X_embedded, - int n_neighbors, - int batch_size = 512) -{ - RAFT_EXPECTS(X.extent(0) == X_embedded.extent(0), "Size mismatch between X and X_embedded"); - RAFT_EXPECTS(std::is_integral_v && X.extent(0) <= std::numeric_limits::max(), - "Index type not supported"); - - // TODO: Change the underlying implementation to remove the need to const_cast X_embedded. - return detail::trustworthiness_score( - handle, - X.data_handle(), - const_cast(X_embedded.data_handle()), - X.extent(0), - X.extent(1), - X_embedded.extent(1), - n_neighbors, - batch_size); -} - -/** @} */ // end group stats_trustworthiness - -} // namespace stats -} // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/v_measure.cuh b/cpp/include/cuvs/stats/v_measure.cuh deleted file mode 100644 index f6b65989d..000000000 --- a/cpp/include/cuvs/stats/v_measure.cuh +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __V_MEASURE_H -#define __V_MEASURE_H - -#pragma once -#include -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Function to calculate the v-measure between two clusters - * - * @param truthClusterArray: the array of truth classes of type T - * @param predClusterArray: the array of predicted classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - * @param beta: v_measure parameter - */ -template -double v_measure(const T* truthClusterArray, - const T* predClusterArray, - int size, - T lowerLabelRange, - T upperLabelRange, - cudaStream_t stream, - double beta = 1.0) -{ - return detail::v_measure( - truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream, beta); -} - -/** - * @defgroup stats_vmeasure V-Measure - * @{ - */ - -/** - * @brief Function to calculate the v-measure between two clusters - * - * @tparam value_t the data type - * @tparam idx_t Integer type used to for addressing - * @param[in] handle the raft handle - * @param[in] truth_cluster_array: the array of truth classes of type T - * @param[in] pred_cluster_array: the array of predicted classes of type T - * @param[in] lower_label_range: the lower bound of the range of labels - * @param[in] upper_label_range: the upper bound of the range of labels - * @param[in] beta: v_measure parameter - * @return the v-measure between the two clusters - */ -template -double v_measure(raft::resources const& handle, - raft::device_vector_view truth_cluster_array, - raft::device_vector_view pred_cluster_array, - value_t lower_label_range, - value_t upper_label_range, - double beta = 1.0) -{ - RAFT_EXPECTS(truth_cluster_array.extent(0) == pred_cluster_array.extent(0), - "Size mismatch between truth_cluster_array and pred_cluster_array"); - RAFT_EXPECTS(truth_cluster_array.is_exhaustive(), "truth_cluster_array must be contiguous"); - RAFT_EXPECTS(pred_cluster_array.is_exhaustive(), "pred_cluster_array must be contiguous"); - - return detail::v_measure(truth_cluster_array.data_handle(), - pred_cluster_array.data_handle(), - truth_cluster_array.extent(0), - lower_label_range, - upper_label_range, - resource::get_cuda_stream(handle), - beta); -} - -/** @} */ // end group stats_vmeasure - -}; // end namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/include/cuvs/stats/weighted_mean.cuh b/cpp/include/cuvs/stats/weighted_mean.cuh deleted file mode 100644 index 64b8ade38..000000000 --- a/cpp/include/cuvs/stats/weighted_mean.cuh +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __WEIGHTED_MEAN_H -#define __WEIGHTED_MEAN_H - -#pragma once - -#include -#include -#include - -namespace cuvs { -namespace stats { - -/** - * @brief Compute the weighted mean of the input matrix with a - * vector of weights, along rows or along columns - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param mu the output mean vector - * @param data the input matrix - * @param weights weight of size D if along_row is true, else of size N - * @param D number of columns of data - * @param N number of rows of data - * @param row_major data input matrix is row-major or not - * @param along_rows whether to reduce along rows or columns - * @param stream cuda stream to launch work on - */ -template -void weightedMean(Type* mu, - const Type* data, - const Type* weights, - IdxType D, - IdxType N, - bool row_major, - bool along_rows, - cudaStream_t stream) -{ - detail::weightedMean(mu, data, weights, D, N, row_major, along_rows, stream); -} - -/** - * @brief Compute the row-wise weighted mean of the input matrix with a - * vector of column weights - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param mu the output mean vector - * @param data the input matrix (assumed to be row-major) - * @param weights per-column means - * @param D number of columns of data - * @param N number of rows of data - * @param stream cuda stream to launch work on - */ -template -void rowWeightedMean( - Type* mu, const Type* data, const Type* weights, IdxType D, IdxType N, cudaStream_t stream) -{ - weightedMean(mu, data, weights, D, N, true, true, stream); -} - -/** - * @brief Compute the column-wise weighted mean of the input matrix with a - * vector of row weights - * - * @tparam Type the data type - * @tparam IdxType Integer type used to for addressing - * @param mu the output mean vector - * @param data the input matrix (assumed to be row-major) - * @param weights per-row means - * @param D number of columns of data - * @param N number of rows of data - * @param stream cuda stream to launch work on - */ -template -void colWeightedMean( - Type* mu, const Type* data, const Type* weights, IdxType D, IdxType N, cudaStream_t stream) -{ - weightedMean(mu, data, weights, D, N, true, false, stream); -} - -/** - * @defgroup stats_weighted_mean Weighted Mean - * @{ - */ - -/** - * @brief Compute the weighted mean of the input matrix with a - * vector of weights, along rows or along columns - * - * @tparam value_t the data type - * @tparam idx_t Integer type used to for addressing - * @tparam layout_t Layout type of the input matrix. - * @param[in] handle the raft handle - * @param[in] data the input matrix of size nrows * ncols - * @param[in] weights weight of size ncols if along_row is true, else of size nrows - * @param[out] mu the output mean vector of size nrows if along_row is true, else of size ncols - * @param[in] along_rows whether to reduce along rows or columns - */ -template -void weighted_mean(raft::resources const& handle, - raft::device_matrix_view data, - raft::device_vector_view weights, - raft::device_vector_view mu, - bool along_rows) -{ - constexpr bool is_row_major = std::is_same_v; - constexpr bool is_col_major = std::is_same_v; - static_assert(is_row_major || is_col_major, - "weighted_mean: Layout must be either " - "raft::row_major or raft::col_major (or one of their aliases)"); - auto mean_vec_size = along_rows ? data.extent(0) : data.extent(1); - auto weight_size = along_rows ? data.extent(1) : data.extent(0); - - RAFT_EXPECTS(weights.extent(0) == weight_size, - "Size mismatch between weights and expected weight_size"); - RAFT_EXPECTS(mu.extent(0) == mean_vec_size, - "Size mismatch between mu and expected mean_vec_size"); - - detail::weightedMean(mu.data_handle(), - data.data_handle(), - weights.data_handle(), - data.extent(1), - data.extent(0), - is_row_major, - along_rows, - resource::get_cuda_stream(handle)); -} - -/** - * @brief Compute the row-wise weighted mean of the input matrix with a - * vector of column weights - * - * @tparam value_t the data type - * @tparam idx_t Integer type used to for addressing - * @tparam layout_t Layout type of the input matrix. - * @param[in] handle the raft handle - * @param[in] data the input matrix of size nrows * ncols - * @param[in] weights weight vector of size ncols - * @param[out] mu the output mean vector of size nrows - */ -template -void row_weighted_mean(raft::resources const& handle, - raft::device_matrix_view data, - raft::device_vector_view weights, - raft::device_vector_view mu) -{ - weighted_mean(handle, data, weights, mu, true); -} - -/** - * @brief Compute the column-wise weighted mean of the input matrix with a - * vector of row weights - * - * @tparam value_t the data type - * @tparam idx_t Integer type used to for addressing - * @tparam layout_t Layout type of the input matrix. - * @param[in] handle the raft handle - * @param[in] data the input matrix of size nrows * ncols - * @param[in] weights weight vector of size nrows - * @param[out] mu the output mean vector of size ncols - */ -template -void col_weighted_mean(raft::resources const& handle, - raft::device_matrix_view data, - raft::device_vector_view weights, - raft::device_vector_view mu) -{ - weighted_mean(handle, data, weights, mu, false); -} - -/** @} */ // end group stats_weighted_mean - -}; // end namespace stats -}; // namespace cuvs - -#endif \ No newline at end of file diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index 59f6ab169..079740945 100644 --- a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -17,7 +17,6 @@ #pragma once #include -#include #include // raft::make_device_matrix #include #include