From 5727843ebc20a813093beed5db5e85bcf2079e91 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 21 Nov 2023 00:19:50 -0500 Subject: [PATCH] More qualifications --- cpp/include/cuvs/cluster/detail/kmeans.cuh | 27 +++---- .../cuvs/cluster/detail/kmeans_balanced.cuh | 16 ++-- .../cuvs/cluster/detail/kmeans_common.cuh | 6 +- .../cuvs/distance/detail/fused_l2_nn.cuh | 4 +- .../cuvs/distance/detail/masked_nn.cuh | 2 +- .../detail/pairwise_distance_base.cuh | 2 +- cpp/include/cuvs/distance/distance-ext.cuh | 2 +- .../cuvs/neighbors/brute_force-inl.cuh | 4 +- cpp/include/cuvs/neighbors/cagra_types.hpp | 2 +- .../neighbors/detail/cagra/cagra_build.cuh | 9 ++- .../neighbors/detail/cagra/cagra_search.cuh | 2 +- .../detail/cagra/cagra_serialize.cuh | 7 +- .../detail/cagra/search_multi_kernel.cuh | 26 +++---- .../cuvs/neighbors/detail/div_utils.hpp | 12 +-- .../detail/faiss_select/MergeNetworkWarp.cuh | 2 +- .../neighbors/detail/faiss_select/Select.cuh | 8 +- .../faiss_select/key_value_block_select.cuh | 10 +-- .../cuvs/neighbors/detail/ivf_flat_build.cuh | 15 ++-- .../detail/ivf_flat_interleaved_scan-inl.cuh | 55 +++++++------- .../neighbors/detail/ivf_flat_search-inl.cuh | 44 +++++------ .../neighbors/detail/ivf_flat_serialize.cuh | 2 +- .../cuvs/neighbors/detail/ivf_pq_build.cuh | 75 ++++++++++--------- .../neighbors/detail/ivf_pq_codepacking.cuh | 21 +++--- .../detail/ivf_pq_compute_similarity-inl.cuh | 35 +++++---- .../detail/ivf_pq_dummy_block_sort.cuh | 5 +- .../cuvs/neighbors/detail/ivf_pq_search.cuh | 48 ++++++------ .../cuvs/neighbors/detail/knn_merge_parts.cuh | 2 +- .../cuvs/neighbors/detail/refine_device.cuh | 4 +- .../cuvs/neighbors/detail/refine_host-inl.hpp | 2 +- .../neighbors/detail/selection_faiss-inl.cuh | 4 +- .../cuvs/neighbors/ivf_flat_helpers.cuh | 5 +- cpp/include/cuvs/neighbors/ivf_flat_types.hpp | 2 +- cpp/include/cuvs/neighbors/ivf_list.hpp | 4 +- cpp/include/cuvs/neighbors/ivf_pq_helpers.cuh | 10 ++- cpp/include/cuvs/neighbors/ivf_pq_types.hpp | 20 ++--- cpp/include/cuvs/spatial/knn/ann.cuh | 4 +- .../cuvs/spatial/knn/detail/ball_cover.cuh | 40 +++++----- .../knn/detail/ball_cover/registers-inl.cuh | 10 +-- cpp/include/cuvs/spatial/knn/knn.cuh | 19 ++--- cpp/include/cuvs/spectral/cluster_solvers.cuh | 2 +- .../cuvs/spectral/detail/matrix_wrappers.hpp | 18 ++--- cpp/include/cuvs/spectral/eigen_solvers.cuh | 4 +- .../cuvs/spectral/modularity_maximization.cuh | 4 +- cpp/include/cuvs/spectral/partition.cuh | 4 +- cpp/include/cuvs/stats/detail/meanvar.cuh | 7 +- 45 files changed, 312 insertions(+), 294 deletions(-) diff --git a/cpp/include/cuvs/cluster/detail/kmeans.cuh b/cpp/include/cuvs/cluster/detail/kmeans.cuh index b192973ad..1ed9f4ccd 100644 --- a/cpp/include/cuvs/cluster/detail/kmeans.cuh +++ b/cpp/include/cuvs/cluster/detail/kmeans.cuh @@ -66,7 +66,7 @@ void initRandom(raft::resources const& handle, raft::device_matrix_view X, raft::device_matrix_view centroids) { - common::nvtx::range fun_scope("initRandom"); + raft::common::nvtx::range fun_scope("initRandom"); cudaStream_t stream = resource::get_cuda_stream(handle); auto n_clusters = params.n_clusters; detail::shuffleAndGather(handle, X, centroids, n_clusters, params.rng_state.seed); @@ -93,7 +93,7 @@ void kmeansPlusPlus(raft::resources const& handle, raft::device_matrix_view centroidsRawData, rmm::device_uvector& workspace) { - common::nvtx::range fun_scope("kmeansPlusPlus"); + raft::common::nvtx::range fun_scope("kmeansPlusPlus"); cudaStream_t stream = resource::get_cuda_stream(handle); auto n_samples = X.extent(0); auto n_features = X.extent(1); @@ -367,7 +367,7 @@ void kmeans_fit_main(raft::resources const& handle, raft::host_scalar_view n_iter, rmm::device_uvector& workspace) { - common::nvtx::range fun_scope("kmeans_fit_main"); + raft::common::nvtx::range fun_scope("kmeans_fit_main"); logger::get(RAFT_NAME).set_level(params.verbosity); cudaStream_t stream = resource::get_cuda_stream(handle); auto n_samples = X.extent(0); @@ -524,7 +524,7 @@ void kmeans_fit_main(raft::resources const& handle, workspace); // TODO: add different templates for InType of binaryOp to avoid thrust transform - thrust::transform(resource::get_thrust_policy(handle), + thrust::transform(raft::resource::get_thrust_policy(handle), minClusterAndDistance.data_handle(), minClusterAndDistance.data_handle() + minClusterAndDistance.size(), weight.data_handle(), @@ -581,7 +581,8 @@ void initScalableKMeansPlusPlus(raft::resources const& handle, raft::device_matrix_view centroidsRawData, rmm::device_uvector& workspace) { - common::nvtx::range fun_scope("initScalableKMeansPlusPlus"); + raft::common::nvtx::range fun_scope( + "initScalableKMeansPlusPlus"); cudaStream_t stream = resource::get_cuda_stream(handle); auto n_samples = X.extent(0); auto n_features = X.extent(1); @@ -826,7 +827,7 @@ void kmeans_fit(raft::resources const& handle, raft::host_scalar_view inertia, raft::host_scalar_view n_iter) { - common::nvtx::range fun_scope("kmeans_fit"); + raft::common::nvtx::range fun_scope("kmeans_fit"); auto n_samples = X.extent(0); auto n_features = X.extent(1); auto n_clusters = params.n_clusters; @@ -872,7 +873,7 @@ void kmeans_fit(raft::resources const& handle, if (sample_weight.has_value()) raft::copy(weight.data_handle(), sample_weight.value().data_handle(), n_samples, stream); else - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), weight.data_handle(), weight.data_handle() + weight.size(), 1); @@ -993,7 +994,7 @@ void kmeans_predict(raft::resources const& handle, bool normalize_weight, raft::host_scalar_view inertia) { - common::nvtx::range fun_scope("kmeans_predict"); + raft::common::nvtx::range fun_scope("kmeans_predict"); auto n_samples = X.extent(0); auto n_features = X.extent(1); cudaStream_t stream = resource::get_cuda_stream(handle); @@ -1019,7 +1020,7 @@ void kmeans_predict(raft::resources const& handle, if (sample_weight.has_value()) raft::copy(weight.data_handle(), sample_weight.value().data_handle(), n_samples, stream); else - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), weight.data_handle(), weight.data_handle() + weight.size(), 1); @@ -1065,7 +1066,7 @@ void kmeans_predict(raft::resources const& handle, // calculate cluster cost phi_x(C) rmm::device_scalar clusterCostD(stream); // TODO: add different templates for InType of binaryOp to avoid thrust transform - thrust::transform(resource::get_thrust_policy(handle), + thrust::transform(raft::resource::get_thrust_policy(handle), minClusterAndDistance.data_handle(), minClusterAndDistance.data_handle() + minClusterAndDistance.size(), weight.data_handle(), @@ -1084,7 +1085,7 @@ void kmeans_predict(raft::resources const& handle, raft::value_op{}, raft::add_op{}); - thrust::transform(resource::get_thrust_policy(handle), + thrust::transform(raft::resource::get_thrust_policy(handle), minClusterAndDistance.data_handle(), minClusterAndDistance.data_handle() + minClusterAndDistance.size(), labels.data_handle(), @@ -1135,7 +1136,7 @@ void kmeans_fit_predict(raft::resources const& handle, raft::host_scalar_view inertia, raft::host_scalar_view n_iter) { - common::nvtx::range fun_scope("kmeans_fit_predict"); + raft::common::nvtx::range fun_scope("kmeans_fit_predict"); if (!centroids.has_value()) { auto n_features = X.extent(1); auto centroids_matrix = @@ -1199,7 +1200,7 @@ void kmeans_transform(raft::resources const& handle, raft::device_matrix_view centroids, raft::device_matrix_view X_new) { - common::nvtx::range fun_scope("kmeans_transform"); + raft::common::nvtx::range fun_scope("kmeans_transform"); logger::get(RAFT_NAME).set_level(params.verbosity); cudaStream_t stream = resource::get_cuda_stream(handle); auto n_samples = X.extent(0); diff --git a/cpp/include/cuvs/cluster/detail/kmeans_balanced.cuh b/cpp/include/cuvs/cluster/detail/kmeans_balanced.cuh index b774a1b8f..1b946cc1e 100644 --- a/cpp/include/cuvs/cluster/detail/kmeans_balanced.cuh +++ b/cpp/include/cuvs/cluster/detail/kmeans_balanced.cuh @@ -104,7 +104,7 @@ inline std::enable_if_t> predict_core( auto minClusterAndDistance = raft::make_device_mdarray, IdxT>( handle, mr, make_extents(n_rows)); raft::KeyValuePair initial_value(0, std::numeric_limits::max()); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), minClusterAndDistance.data_handle(), minClusterAndDistance.data_handle() + minClusterAndDistance.size(), initial_value); @@ -130,7 +130,7 @@ inline std::enable_if_t> predict_core( // todo(lsugy): use KVP + iterator in caller. // Copy keys to output labels - thrust::transform(resource::get_thrust_policy(handle), + thrust::transform(raft::resource::get_thrust_policy(handle), minClusterAndDistance.data_handle(), minClusterAndDistance.data_handle() + n_rows, labels, @@ -325,7 +325,7 @@ void compute_norm(const raft::resources& handle, MappingOpT mapping_op, rmm::mr::device_memory_resource* mr = nullptr) { - common::nvtx::range fun_scope("compute_norm"); + raft::common::nvtx::range fun_scope("compute_norm"); auto stream = resource::get_cuda_stream(handle); if (mr == nullptr) { mr = resource::get_workspace_resource(handle); } rmm::device_uvector mapped_dataset(0, stream, mr); @@ -381,7 +381,7 @@ void predict(const raft::resources& handle, const MathT* dataset_norm = nullptr) { auto stream = resource::get_cuda_stream(handle); - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "predict(%zu, %u)", static_cast(n_rows), n_clusters); if (mr == nullptr) { mr = resource::get_workspace_resource(handle); } auto [max_minibatch_size, _mem_per_row] = @@ -473,7 +473,7 @@ __launch_bounds__((WarpSize * BlockDimY)) RAFT_KERNEL const MathT wc = min(static_cast(csize), static_cast(kAdjustCentersWeight)); // Weight for the datapoint used to shift the center. const MathT wd = 1.0; - for (; j < dim; j += WarpSize) { + for (; j < dim; j += raft::WarpSize) { MathT val = 0; val += wc * centers[j + dim * li]; val += wd * mapping_op(dataset[j + dim * i]); @@ -533,7 +533,7 @@ auto adjust_centers(MathT* centers, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* device_memory) -> bool { - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "adjust_centers(%zu, %u)", static_cast(n_rows), n_clusters); if (n_clusters == 0) { return false; } constexpr static std::array kPrimes{29, 71, 113, 173, 229, 281, 349, 409, 463, 541, @@ -901,7 +901,7 @@ auto build_fine_clusters(const raft::resources& handle, raft::matrix::gather(mapping_itr, dim, n_rows, mc_trainset_ids, k, mc_trainset, stream); if (params.metric == cuvs::distance::DistanceType::L2Expanded || params.metric == cuvs::distance::DistanceType::L2SqrtExpanded) { - thrust::gather(resource::get_thrust_policy(handle), + thrust::gather(raft::resource::get_thrust_policy(handle), mc_trainset_ids, mc_trainset_ids + k, dataset_norm_mptr, @@ -964,7 +964,7 @@ void build_hierarchical(const raft::resources& handle, auto stream = resource::get_cuda_stream(handle); using LabelT = uint32_t; - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "build_hierarchical(%zu, %u)", static_cast(n_rows), n_clusters); IdxT n_mesoclusters = std::min(n_clusters, static_cast(std::sqrt(n_clusters) + 0.5)); diff --git a/cpp/include/cuvs/cluster/detail/kmeans_common.cuh b/cpp/include/cuvs/cluster/detail/kmeans_common.cuh index 965d8a549..d4f6a43a2 100644 --- a/cpp/include/cuvs/cluster/detail/kmeans_common.cuh +++ b/cpp/include/cuvs/cluster/detail/kmeans_common.cuh @@ -267,7 +267,7 @@ void sampleCentroids(raft::resources const& handle, resource::sync_stream(handle, stream); uint8_t* rawPtr_isSampleCentroid = isSampleCentroid.data_handle(); - thrust::for_each_n(resource::get_thrust_policy(handle), + thrust::for_each_n(raft::resource::get_thrust_policy(handle), sampledMinClusterDistance.data_handle(), nPtsSampledInRank, [=] __device__(raft::KeyValuePair val) { @@ -399,7 +399,7 @@ void minClusterAndDistanceCompute( raft::KeyValuePair initial_value(0, std::numeric_limits::max()); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), minClusterAndDistance.data_handle(), minClusterAndDistance.data_handle() + minClusterAndDistance.size(), initial_value); @@ -527,7 +527,7 @@ void minClusterDistanceCompute(raft::resources const& handle, auto pairwiseDistance = raft::make_device_matrix_view( L2NormBuf_OR_DistBuf.data(), dataBatchSize, centroidsBatchSize); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), minClusterDistance.data_handle(), minClusterDistance.data_handle() + minClusterDistance.size(), std::numeric_limits::max()); diff --git a/cpp/include/cuvs/distance/detail/fused_l2_nn.cuh b/cpp/include/cuvs/distance/detail/fused_l2_nn.cuh index f5c920e08..0c2548863 100644 --- a/cpp/include/cuvs/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/cuvs/distance/detail/fused_l2_nn.cuh @@ -101,7 +101,7 @@ void initialize(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp, cudaStream_t s } // TODO: specialize this function for MinAndDistanceReduceOp -// with atomicCAS of 64 bit which will eliminate mutex and shfls +// with atomicCAS of 64 bit which will eliminate mutex and raft::shfls template DI void updateReducedVal( int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, IdxT m, IdxT gridStrideY) @@ -204,7 +204,7 @@ __launch_bounds__(P::Nthreads, 2) RAFT_KERNEL fusedL2NNkernel(OutT* min, #pragma unroll for (int j = P::AccThCols / 2; j > 0; j >>= 1) { // Actually, the srcLane (lid +j) should be (lid +j) % P:AccThCols, - // but the shfl op applies the modulo internally. + // but the raft::shfl op applies the modulo internally. auto tmpkey = raft::shfl(val[i].key, lid + j, P::AccThCols); auto tmpvalue = raft::shfl(val[i].value, lid + j, P::AccThCols); KVPair tmp = {tmpkey, tmpvalue}; diff --git a/cpp/include/cuvs/distance/detail/masked_nn.cuh b/cpp/include/cuvs/distance/detail/masked_nn.cuh index b5a65407b..8b30d8eec 100644 --- a/cpp/include/cuvs/distance/detail/masked_nn.cuh +++ b/cpp/include/cuvs/distance/detail/masked_nn.cuh @@ -255,7 +255,7 @@ void masked_l2_nn_impl(raft::resources const& handle, // Get stream and workspace memory resource rmm::mr::device_memory_resource* ws_mr = - dynamic_cast(resource::get_workspace_resource(handle)); + dynamic_cast(raft::resource::get_workspace_resource(handle)); auto stream = resource::get_cuda_stream(handle); // Acquire temporary buffers and initialize to zero: diff --git a/cpp/include/cuvs/distance/detail/pairwise_distance_base.cuh b/cpp/include/cuvs/distance/detail/pairwise_distance_base.cuh index 990f845fd..57366dec9 100644 --- a/cpp/include/cuvs/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/cuvs/distance/detail/pairwise_distance_base.cuh @@ -222,7 +222,7 @@ struct PairwiseDistances : public BaseClass { DI void accumulate() { - // We have a separate ldsXY and accumulate_reg_tile outside the loop body, + // We have a separate raft::ldsXY and accumulate_reg_tile outside the loop body, // so that these separated calls can be interspersed with preceding and // following instructions, thereby hiding latency. this->ldsXY(0); diff --git a/cpp/include/cuvs/distance/distance-ext.cuh b/cpp/include/cuvs/distance/distance-ext.cuh index efba6eecf..fdbe6a971 100644 --- a/cpp/include/cuvs/distance/distance-ext.cuh +++ b/cpp/include/cuvs/distance/distance-ext.cuh @@ -132,7 +132,7 @@ void distance(raft::resources const& handle, raft::device_matrix_view dist, DataT metric_arg = 2.0f) RAFT_EXPLICIT; -template +template void pairwise_distance(raft::resources const& handle, device_matrix_view const x, device_matrix_view const y, diff --git a/cpp/include/cuvs/neighbors/brute_force-inl.cuh b/cpp/include/cuvs/neighbors/brute_force-inl.cuh index b5584c5b7..3d5c449a9 100644 --- a/cpp/include/cuvs/neighbors/brute_force-inl.cuh +++ b/cpp/include/cuvs/neighbors/brute_force-inl.cuh @@ -172,8 +172,8 @@ void knn(raft::resources const& handle, RAFT_EXPECTS(indices.extent(1) == distances.extent(1) && distances.extent(1), "Number of columns in output indices and distances matrices must the same"); - bool rowMajorIndex = std::is_same_v; - bool rowMajorQuery = std::is_same_v; + bool rowMajorIndex = std::is_same_v; + bool rowMajorQuery = std::is_same_v; std::vector inputs; std::vector sizes; diff --git a/cpp/include/cuvs/neighbors/cagra_types.hpp b/cpp/include/cuvs/neighbors/cagra_types.hpp index 9dee5b72a..0299b78df 100644 --- a/cpp/include/cuvs/neighbors/cagra_types.hpp +++ b/cpp/include/cuvs/neighbors/cagra_types.hpp @@ -221,7 +221,7 @@ struct index : ann::index { * @endcode * In the above example, we have passed a host dataset to build. The returned index will own a * device copy of the dataset and the knn_graph. In contrast, if we pass the dataset as a - * device_mdspan to build, then it will only store a reference to it. + * raft::device_mdspan to build, then it will only store a reference to it. * * - Constructing index using existing knn-graph * @code{.cpp} diff --git a/cpp/include/cuvs/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/cuvs/neighbors/detail/cagra/cagra_build.cuh index 2349d5943..399d0071b 100644 --- a/cpp/include/cuvs/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/include/cuvs/neighbors/detail/cagra/cagra_build.cuh @@ -54,10 +54,11 @@ void build_knn_graph( "Currently only L2Expanded metric is supported"); uint32_t node_degree = knn_graph.extent(1); - common::nvtx::range fun_scope("cagra::build_graph(%zu, %zu, %u)", - size_t(dataset.extent(0)), - size_t(dataset.extent(1)), - node_degree); + raft::common::nvtx::range fun_scope( + "cagra::build_graph(%zu, %zu, %u)", + size_t(dataset.extent(0)), + size_t(dataset.extent(1)), + node_degree); if (!build_params) { build_params = ivf_pq::index_params{}; diff --git a/cpp/include/cuvs/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/cuvs/neighbors/detail/cagra/cagra_search.cuh index 6680a8b4b..87d8876e3 100644 --- a/cpp/include/cuvs/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/include/cuvs/neighbors/detail/cagra/cagra_search.cuh @@ -122,7 +122,7 @@ void search_main(raft::resources const& res, if (params.max_queries == 0) { params.max_queries = queries.extent(0); } - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "cagra::search(max_queries = %u, k = %u, dim = %zu)", params.max_queries, topk, index.dim()); using CagraSampleFilterT_s = typename CagraSampleFilterT_Selector::type; diff --git a/cpp/include/cuvs/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/include/cuvs/neighbors/detail/cagra/cagra_serialize.cuh index c57f03bf4..019da84f3 100644 --- a/cpp/include/cuvs/neighbors/detail/cagra/cagra_serialize.cuh +++ b/cpp/include/cuvs/neighbors/detail/cagra/cagra_serialize.cuh @@ -49,7 +49,7 @@ void serialize(raft::resources const& res, const index& index_, bool include_dataset) { - common::nvtx::range fun_scope("cagra::serialize"); + raft::common::nvtx::range fun_scope("cagra::serialize"); RAFT_LOG_DEBUG( "Saving CAGRA index, size %zu, dim %u", static_cast(index_.size()), index_.dim()); @@ -103,7 +103,8 @@ void serialize_to_hnswlib(raft::resources const& res, std::ostream& os, const index& index_) { - common::nvtx::range fun_scope("cagra::serialize_to_hnswlib"); + raft::common::nvtx::range fun_scope( + "cagra::serialize_to_hnswlib"); RAFT_LOG_DEBUG("Saving CAGRA index to hnswlib format, size %zu, dim %u", static_cast(index_.size()), index_.dim()); @@ -233,7 +234,7 @@ void serialize_to_hnswlib(raft::resources const& res, template auto deserialize(raft::resources const& res, std::istream& is) -> index { - common::nvtx::range fun_scope("cagra::deserialize"); + raft::common::nvtx::range fun_scope("cagra::deserialize"); char dtype_string[4]; is.read(dtype_string, 4); diff --git a/cpp/include/cuvs/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/include/cuvs/neighbors/detail/cagra/search_multi_kernel.cuh index c1d550382..622a6a825 100644 --- a/cpp/include/cuvs/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/include/cuvs/neighbors/detail/cagra/search_multi_kernel.cuh @@ -204,8 +204,8 @@ void random_pickup(const DATA_T* const dataset_ptr, // [dataset_size, dataset_d template RAFT_KERNEL pickup_next_parents_kernel( - INDEX_T* const parent_candidates_ptr, // [num_queries, lds] - const std::size_t lds, // (*) lds >= parent_candidates_size + INDEX_T* const parent_candidates_ptr, // [num_queries, raft::lds] + const std::size_t raft::lds, // (*) raft::lds >= parent_candidates_size const std::uint32_t parent_candidates_size, // INDEX_T* const visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] const std::size_t hash_bitlen, @@ -268,8 +268,8 @@ RAFT_KERNEL pickup_next_parents_kernel( } template -void pickup_next_parents(INDEX_T* const parent_candidates_ptr, // [num_queries, lds] - const std::size_t lds, // (*) lds >= parent_candidates_size +void pickup_next_parents(INDEX_T* const parent_candidates_ptr, // [num_queries, raft::lds] + const std::size_t raft::lds, // (*) raft::lds >= parent_candidates_size const std::size_t parent_candidates_size, // const std::size_t num_queries, INDEX_T* const visited_hashmap_ptr, // [num_queries, 1 << hash_bitlen] @@ -291,7 +291,7 @@ void pickup_next_parents(INDEX_T* const parent_candidates_ptr, // [num_queries, } pickup_next_parents_kernel <<>>(parent_candidates_ptr, - lds, + raft::lds, parent_candidates_size, visited_hashmap_ptr, hash_bitlen, @@ -312,7 +312,7 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel( const INDEX_T* const parent_node_list, // [num_queries, search_width] INDEX_T* const parent_candidates_ptr, // [num_queries, search_width] DISTANCE_T* const parent_distance_ptr, // [num_queries, search_width] - const std::size_t lds, + const std::size_t raft::lds, const std::uint32_t search_width, const DATA_T* const dataset_ptr, // [dataset_size, data_dim] const std::uint32_t data_dim, @@ -395,7 +395,7 @@ void compute_distance_to_child_nodes( const INDEX_T* const parent_node_list, // [num_queries, search_width] INDEX_T* const parent_candidates_ptr, // [num_queries, search_width] DISTANCE_T* const parent_distance_ptr, // [num_queries, search_width] - const std::size_t lds, + const std::size_t raft::lds, const uint32_t search_width, const DATA_T* const dataset_ptr, // [dataset_size, data_dim] const std::uint32_t data_dim, @@ -421,7 +421,7 @@ void compute_distance_to_child_nodes( <<>>(parent_node_list, parent_candidates_ptr, parent_distance_ptr, - lds, + raft::lds, search_width, dataset_ptr, data_dim, @@ -471,7 +471,7 @@ void remove_parent_bit(const std::uint32_t num_queries, template RAFT_KERNEL apply_filter_kernel(INDEX_T* const result_indices_ptr, DISTANCE_T* const result_distances_ptr, - const std::size_t lds, + const std::size_t raft::lds, const std::uint32_t result_buffer_size, const std::uint32_t num_queries, const INDEX_T query_id_offset, @@ -482,7 +482,7 @@ RAFT_KERNEL apply_filter_kernel(INDEX_T* const result_indices_ptr, if (tid >= result_buffer_size * num_queries) { return; } const auto i = tid % result_buffer_size; const auto j = tid / result_buffer_size; - const auto index = i + j * lds; + const auto index = i + j * raft::lds; if (result_indices_ptr[index] != ~index_msb_1_mask && !sample_filter(query_id_offset + j, result_indices_ptr[index])) { @@ -494,7 +494,7 @@ RAFT_KERNEL apply_filter_kernel(INDEX_T* const result_indices_ptr, template void apply_filter(INDEX_T* const result_indices_ptr, DISTANCE_T* const result_distances_ptr, - const std::size_t lds, + const std::size_t raft::lds, const std::uint32_t result_buffer_size, const std::uint32_t num_queries, const INDEX_T query_id_offset, @@ -506,7 +506,7 @@ void apply_filter(INDEX_T* const result_indices_ptr, apply_filter_kernel<<>>(result_indices_ptr, result_distances_ptr, - lds, + raft::lds, result_buffer_size, num_queries, query_id_offset, @@ -642,7 +642,7 @@ struct search : search_plan_impl { parent_node_list(0, resource::get_cuda_stream(res)), topk_hint(0, resource::get_cuda_stream(res)), topk_workspace(0, resource::get_cuda_stream(res)), - terminate_flag(resource::get_cuda_stream(res)) + terminate_flag(raft::resource::get_cuda_stream(res)) { set_params(res); } diff --git a/cpp/include/cuvs/neighbors/detail/div_utils.hpp b/cpp/include/cuvs/neighbors/detail/div_utils.hpp index fc6b6f5c5..805bb1304 100644 --- a/cpp/include/cuvs/neighbors/detail/div_utils.hpp +++ b/cpp/include/cuvs/neighbors/detail/div_utils.hpp @@ -21,9 +21,9 @@ #endif /** - * @brief A simple wrapper for raft::Pow2 which uses Pow2 utils only when available and regular - * integer division otherwise. This is done to allow a common interface for division arithmetic for - * non CUDA headers. + * @brief A simple wrapper for raft::Pow2 which uses raft::Pow2 utils only when available and + * regular integer division otherwise. This is done to allow a common interface for division + * arithmetic for non CUDA headers. * * @tparam Value_ a compile-time value representable as a power-of-two. */ @@ -37,7 +37,7 @@ struct div_utils { static constexpr _RAFT_HOST_DEVICE inline auto roundDown(T x) { #if defined(_RAFT_HAS_CUDA) - return Pow2::roundDown(x); + return raft::Pow2::roundDown(x); #else return raft::round_down_safe(x, Value_); #endif @@ -47,7 +47,7 @@ struct div_utils { static constexpr _RAFT_HOST_DEVICE inline auto mod(T x) { #if defined(_RAFT_HAS_CUDA) - return Pow2::mod(x); + return raft::Pow2::mod(x); #else return x % Value_; #endif @@ -57,7 +57,7 @@ struct div_utils { static constexpr _RAFT_HOST_DEVICE inline auto div(T x) { #if defined(_RAFT_HAS_CUDA) - return Pow2::div(x); + return raft::Pow2::div(x); #else return x / Value_; #endif diff --git a/cpp/include/cuvs/neighbors/detail/faiss_select/MergeNetworkWarp.cuh b/cpp/include/cuvs/neighbors/detail/faiss_select/MergeNetworkWarp.cuh index a4c895d0d..cf97d99ca 100644 --- a/cpp/include/cuvs/neighbors/detail/faiss_select/MergeNetworkWarp.cuh +++ b/cpp/include/cuvs/neighbors/detail/faiss_select/MergeNetworkWarp.cuh @@ -138,7 +138,7 @@ inline __device__ void warpBitonicMergeLE16(K& k, V& v) // Template for performing a bitonic merge of an arbitrary set of // registers -template +template struct BitonicMergeStep {}; // diff --git a/cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh b/cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh index 9181fba79..796a841a4 100644 --- a/cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh +++ b/cpp/include/cuvs/neighbors/detail/faiss_select/Select.cuh @@ -127,7 +127,7 @@ struct BlockSelect { warpV[i] = initV; } - warpFence(); + raft::warpFence(); } __device__ inline void addThreadQ(K k, V v) @@ -161,7 +161,7 @@ struct BlockSelect { return; } - // This has a trailing warpFence + // This has a trailing raft::warpFence mergeWarpQ(); // Any top-k elements have been merged into the warp queue; we're @@ -177,7 +177,7 @@ struct BlockSelect { // We have to beat at least this element warpKTop = warpK[kMinus1]; - warpFence(); + raft::warpFence(); } /// This function handles sorting and merging together the @@ -215,7 +215,7 @@ struct BlockSelect { warpV[i * raft::WarpSize + laneId] = warpVRegisters[i]; } - warpFence(); + raft::warpFence(); } /// WARNING: all threads in a warp must participate in this. diff --git a/cpp/include/cuvs/neighbors/detail/faiss_select/key_value_block_select.cuh b/cpp/include/cuvs/neighbors/detail/faiss_select/key_value_block_select.cuh index 4407473a8..14484435b 100644 --- a/cpp/include/cuvs/neighbors/detail/faiss_select/key_value_block_select.cuh +++ b/cpp/include/cuvs/neighbors/detail/faiss_select/key_value_block_select.cuh @@ -65,7 +65,7 @@ struct KeyValueBlockSelect { warpV[i].value = initVv; } - warpFence(); + raft::warpFence(); } __device__ inline void addThreadQ(K k, K vk, V vv) @@ -101,7 +101,7 @@ struct KeyValueBlockSelect { return; } - // This has a trailing warpFence + // This has a trailing raft::warpFence mergeWarpQ(); // Any top-k elements have been merged into the warp queue; we're @@ -119,7 +119,7 @@ struct KeyValueBlockSelect { warpKTop = warpK[kMinus1]; warpKTopRDist = warpV[kMinus1].key; - warpFence(); + raft::warpFence(); } /// This function handles sorting and merging together the @@ -143,7 +143,7 @@ struct KeyValueBlockSelect { warpVRegisters[i].value = warpV[i * raft::WarpSize + laneId].value; } - warpFence(); + raft::warpFence(); // The warp queue is already sorted, and now that we've sorted the // per-thread queue, merge both sorted lists together, producing @@ -159,7 +159,7 @@ struct KeyValueBlockSelect { warpV[i * raft::WarpSize + laneId].value = warpVRegisters[i].value; } - warpFence(); + raft::warpFence(); } /// WARNING: all threads in a warp must participate in this. diff --git a/cpp/include/cuvs/neighbors/detail/ivf_flat_build.cuh b/cpp/include/cuvs/neighbors/detail/ivf_flat_build.cuh index 98d387080..022e5eac5 100644 --- a/cpp/include/cuvs/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/cuvs/neighbors/detail/ivf_flat_build.cuh @@ -134,7 +134,7 @@ RAFT_KERNEL build_index_kernel(const LabelT* labels, list_index[inlist_id] = source_ixs == nullptr ? i : source_ixs[i]; // The data is written in interleaved groups of `index::kGroupSize` vectors - using interleaved_group = Pow2; + using interleaved_group = raft::Pow2; auto group_offset = interleaved_group::roundDown(inlist_id); auto ingroup_id = interleaved_group::mod(inlist_id) * veclen; @@ -172,7 +172,7 @@ void extend(raft::resources const& handle, auto dim = index->dim(); list_spec list_device_spec{index->dim(), index->conservative_memory_allocation()}; - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "ivf_flat::extend(%zu, %u)", size_t(n_rows), dim); RAFT_EXPECTS(new_indices != nullptr || index->size() == 0, @@ -235,7 +235,7 @@ void extend(raft::resources const& handle, lists[label], list_device_spec, new_list_sizes[label], - Pow2::roundUp(old_list_sizes[label])); + raft::Pow2::roundUp(old_list_sizes[label])); } } // Update the pointers and the sizes @@ -305,7 +305,7 @@ inline auto build(raft::resources const& handle, uint32_t dim) -> index { auto stream = resource::get_cuda_stream(handle); - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "ivf_flat::build(%zu, %u)", size_t(n_rows), dim); static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "unsupported data type"); @@ -379,7 +379,7 @@ inline void fill_refinement_index(raft::resources const& handle, auto stream = resource::get_cuda_stream(handle); uint32_t n_lists = n_queries; - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "ivf_flat::fill_refinement_index(%zu, %u)", size_t(n_queries)); rmm::device_uvector new_labels(n_queries * n_candidates, stream); @@ -456,7 +456,8 @@ void pack_list_data( raft::device_matrix_view codes, uint32_t veclen, std::variant offset_or_indices, - device_mdspan::list_extents, raft::row_major> list_data) + raft::device_mdspan::list_extents, raft::row_major> + list_data) { uint32_t n_rows = codes.extent(0); uint32_t dim = codes.extent(1); @@ -473,7 +474,7 @@ void pack_list_data( template void unpack_list_data( raft::resources const& res, - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> list_data, uint32_t veclen, std::variant offset_or_indices, diff --git a/cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh b/cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh index 0978c4c7f..221da924c 100644 --- a/cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh +++ b/cpp/include/cuvs/neighbors/detail/ivf_flat_interleaved_scan-inl.cuh @@ -206,7 +206,8 @@ struct loadAndComputeDist { encV, reinterpret_cast(data) + loadIndex + j * kIndexGroupSize * veclen_int); uint32_t queryRegs[veclen_int]; - lds(queryRegs, reinterpret_cast(query_shared + shmemIndex) + j * veclen_int); + raft::lds(queryRegs, + reinterpret_cast(query_shared + shmemIndex) + j * veclen_int); #pragma unroll for (int k = 0; k < veclen_int; k++) { compute_dist(dist, queryRegs[k], encV[k]); @@ -234,7 +235,7 @@ struct loadAndComputeDist { const int d = (i * kUnroll + j) * veclen_int; #pragma unroll for (int k = 0; k < veclen_int; ++k) { - compute_dist(dist, shfl(queryReg, d + k, raft::WarpSize), encV[k]); + compute_dist(dist, raft::shfl(queryReg, d + k, raft::WarpSize), encV[k]); } } } @@ -255,7 +256,7 @@ struct loadAndComputeDist { raft::ldg(enc, reinterpret_cast(data) + lane_id * veclen_int); #pragma unroll for (int k = 0; k < veclen_int; k++) { - uint32_t q = shfl(queryReg, (d / 4) + k, raft::WarpSize); + uint32_t q = raft::shfl(queryReg, (d / 4) + k, raft::WarpSize); compute_dist(dist, q, enc[k]); } } @@ -301,7 +302,7 @@ struct loadAndComputeDist { #pragma unroll for (int j = 0; j < kUnroll; ++j) { uint32_t encV = reinterpret_cast(data)[lane_id + j * kIndexGroupSize]; - uint32_t q = shfl(queryReg, i * kUnroll + j, raft::WarpSize); + uint32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); compute_dist(dist, q, encV); } } @@ -318,7 +319,7 @@ struct loadAndComputeDist { uint32_t queryReg = loadDim < dim ? reinterpret_cast(query)[loadDim] : 0; for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { uint32_t enc = reinterpret_cast(data)[lane_id]; - uint32_t q = shfl(queryReg, d / veclen, raft::WarpSize); + uint32_t q = raft::shfl(queryReg, d / veclen, raft::WarpSize); compute_dist(dist, q, enc); } } @@ -362,7 +363,7 @@ struct loadAndComputeDist { #pragma unroll for (int j = 0; j < kUnroll; ++j) { uint32_t encV = reinterpret_cast(data)[lane_id + j * kIndexGroupSize]; - uint32_t q = shfl(queryReg, i * kUnroll + j, raft::WarpSize); + uint32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); compute_dist(dist, q, encV); } } @@ -379,7 +380,7 @@ struct loadAndComputeDist { uint32_t queryReg = loadDim < dim ? reinterpret_cast(query + loadDim)[0] : 0; for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { uint32_t enc = reinterpret_cast(data)[lane_id]; - uint32_t q = shfl(queryReg, d / veclen, raft::WarpSize); + uint32_t q = raft::shfl(queryReg, d / veclen, raft::WarpSize); compute_dist(dist, q, enc); } } @@ -422,7 +423,7 @@ struct loadAndComputeDist { #pragma unroll for (int j = 0; j < kUnroll; ++j) { uint32_t encV = data[lane_id + j * kIndexGroupSize]; - uint32_t q = shfl(queryReg, i * kUnroll + j, raft::WarpSize); + uint32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); compute_dist(dist, q, encV); } } @@ -439,7 +440,7 @@ struct loadAndComputeDist { uint32_t queryReg = loadDim < dim ? query[loadDim] : 0; for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { uint32_t enc = data[lane_id]; - uint32_t q = shfl(queryReg, d, raft::WarpSize); + uint32_t q = raft::shfl(queryReg, d, raft::WarpSize); compute_dist(dist, q, enc); } } @@ -519,7 +520,7 @@ struct loadAndComputeDist { raft::ldg(enc, reinterpret_cast(data) + lane_id * veclen_int); #pragma unroll for (int k = 0; k < veclen_int; k++) { - int32_t q = shfl(queryReg, (d / 4) + k, raft::WarpSize); // Here 4 is for 1 - int; + int32_t q = raft::shfl(queryReg, (d / 4) + k, raft::WarpSize); // Here 4 is for 1 - int; compute_dist(dist, q, enc[k]); } } @@ -562,7 +563,7 @@ struct loadAndComputeDist { #pragma unroll for (int j = 0; j < kUnroll; ++j) { int32_t encV = reinterpret_cast(data)[lane_id + j * kIndexGroupSize]; - int32_t q = shfl(queryReg, i * kUnroll + j, raft::WarpSize); + int32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); compute_dist(dist, q, encV); } } @@ -576,7 +577,7 @@ struct loadAndComputeDist { int32_t queryReg = loadDim < dim ? reinterpret_cast(query + loadDim)[0] : 0; for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { int32_t enc = reinterpret_cast(data + lane_id * veclen)[0]; - int32_t q = shfl(queryReg, d / veclen, raft::WarpSize); + int32_t q = raft::shfl(queryReg, d / veclen, raft::WarpSize); compute_dist(dist, q, enc); } } @@ -702,8 +703,8 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock) copy_vectorized(query_shared, query, std::min(dim, query_smem_elems)); __syncthreads(); - using block_sort_t = matrix::detail::select::warpsort::block_sort< - matrix::detail::select::warpsort::warp_sort_filtered, + using block_sort_t = raft::matrix::detail::select::warpsort::block_sort< + raft::matrix::detail::select::warpsort::warp_sort_filtered, Capacity, Ascending, float, @@ -711,7 +712,7 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock) block_sort_t queue(k); { - using align_warp = Pow2; + using align_warp = raft::Pow2; const int lane_id = align_warp::mod(threadIdx.x); // How many full warps needed to compute the distance (without remainder) @@ -758,7 +759,7 @@ RAFT_KERNEL __launch_bounds__(kThreadsPerBlock) } if (dim > query_smem_elems) { - // The default path - using shfl ops - for dimensions beyond query_smem_elems + // The default path - using raft::shfl ops - for dimensions beyond query_smem_elems loadAndComputeDist lc(dist, compute_dist); for (int pos = shm_assisted_dim; pos < full_warps_along_dim; pos += raft::WarpSize) { @@ -835,7 +836,7 @@ void launch_kernel(Lambda lambda, { RAFT_EXPECTS(Veclen == index.veclen(), "Configured Veclen does not match the index interleaving pattern."); - constexpr auto kKernel = interleaved_scan_kernel; - const int max_query_smem = 16384; - int query_smem_elems = - std::min(max_query_smem / sizeof(T), Pow2::roundUp(index.dim())); + const int max_query_smem = 16384; + int query_smem_elems = std::min(max_query_smem / sizeof(T), + raft::Pow2::roundUp(index.dim())); int smem_size = query_smem_elems * sizeof(T); constexpr int kSubwarpSize = std::min(Capacity, raft::WarpSize); auto block_merge_mem = @@ -911,7 +912,7 @@ struct euclidean_dist { { if constexpr (Veclen > 1) { const auto diff = __vabsdiffu4(x, y); - acc = dp4a(diff, diff, acc); + acc = raft::dp4a(diff, diff, acc); } else { const auto diff = __usad(x, y, 0u); acc += diff * diff; @@ -924,12 +925,12 @@ struct euclidean_dist { __device__ __forceinline__ void operator()(int32_t& acc, int32_t x, int32_t y) { if constexpr (Veclen > 1) { - // Note that we enforce here that the unsigned version of dp4a is used, because the difference - // between two int8 numbers can be greater than 127 and therefore represented as a negative - // number in int8. Casting from int8 to int32 would yield incorrect results, while casting - // from uint8 to uint32 is correct. + // Note that we enforce here that the unsigned version of raft::dp4a is used, because the + // difference between two int8 numbers can be greater than 127 and therefore represented as a + // negative number in int8. Casting from int8 to int32 would yield incorrect results, while + // casting from uint8 to uint32 is correct. const auto diff = __vabsdiffs4(x, y); - acc = dp4a(diff, diff, static_cast(acc)); + acc = raft::dp4a(diff, diff, static_cast(acc)); } else { const auto diff = x - y; acc += diff * diff; @@ -1042,7 +1043,7 @@ struct select_interleaved_scan_kernel { RAFT_EXPECTS(capacity == Capacity, "Capacity must be power-of-two not bigger than the maximum allowed size " "matrix::detail::select::warpsort::kMaxCapacity (%d).", - matrix::detail::select::warpsort::kMaxCapacity); + raft::matrix::detail::select::warpsort::kMaxCapacity); RAFT_EXPECTS( veclen == Veclen, "Veclen must be power-of-two not bigger than the maximum allowed size for this data type."); diff --git a/cpp/include/cuvs/neighbors/detail/ivf_flat_search-inl.cuh b/cpp/include/cuvs/neighbors/detail/ivf_flat_search-inl.cuh index 39032a575..7f613963b 100644 --- a/cpp/include/cuvs/neighbors/detail/ivf_flat_search-inl.cuh +++ b/cpp/include/cuvs/neighbors/detail/ivf_flat_search-inl.cuh @@ -27,7 +27,7 @@ #include // raft::linalg::gemm #include // raft::linalg::norm #include // raft::linalg::unary_op -#include // matrix::detail::select_k +#include // raft::matrix::detail::select_k #include // rmm::device_memory_resource namespace cuvs::neighbors::ivf_flat::detail { @@ -127,16 +127,16 @@ void search_impl(raft::resources const& handle, stream); RAFT_LOG_TRACE_VEC(distance_buffer_dev.data(), std::min(20, index.n_lists())); - matrix::detail::select_k(handle, - distance_buffer_dev.data(), - nullptr, - n_queries, - index.n_lists(), - n_probes, - coarse_distances_dev.data(), - coarse_indices_dev.data(), - select_min, - search_mr); + raft::matrix::detail::select_k(handle, + distance_buffer_dev.data(), + nullptr, + n_queries, + index.n_lists(), + n_probes, + coarse_distances_dev.data(), + coarse_indices_dev.data(), + select_min, + search_mr); RAFT_LOG_TRACE_VEC(coarse_indices_dev.data(), n_probes); RAFT_LOG_TRACE_VEC(coarse_distances_dev.data(), n_probes); @@ -191,16 +191,16 @@ void search_impl(raft::resources const& handle, // Merge topk values from different blocks if (grid_dim_x > 1) { - matrix::detail::select_k(handle, - refined_distances_dev.data(), - refined_indices_dev.data(), - n_queries, - k * grid_dim_x, - k, - distances, - neighbors, - select_min, - search_mr); + raft::matrix::detail::select_k(handle, + refined_distances_dev.data(), + refined_indices_dev.data(), + n_queries, + k * grid_dim_x, + k, + distances, + neighbors, + select_min, + search_mr); } } @@ -219,7 +219,7 @@ inline void search(raft::resources const& handle, rmm::mr::device_memory_resource* mr = nullptr, IvfSampleFilterT sample_filter = IvfSampleFilterT()) { - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim()); RAFT_EXPECTS(params.n_probes > 0, diff --git a/cpp/include/cuvs/neighbors/detail/ivf_flat_serialize.cuh b/cpp/include/cuvs/neighbors/detail/ivf_flat_serialize.cuh index 1f181b517..60d2392be 100644 --- a/cpp/include/cuvs/neighbors/detail/ivf_flat_serialize.cuh +++ b/cpp/include/cuvs/neighbors/detail/ivf_flat_serialize.cuh @@ -86,7 +86,7 @@ void serialize(raft::resources const& handle, std::ostream& os, const index::roundUp(sizes_host(label))); + raft::Pow2::roundUp(sizes_host(label))); } resource::sync_stream(handle); } diff --git a/cpp/include/cuvs/neighbors/detail/ivf_pq_build.cuh b/cpp/include/cuvs/neighbors/detail/ivf_pq_build.cuh index de89ea2d9..c3d3152e5 100644 --- a/cpp/include/cuvs/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/cuvs/neighbors/detail/ivf_pq_build.cuh @@ -68,7 +68,7 @@ template __launch_bounds__(BlockDim) RAFT_KERNEL copy_warped_kernel( T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows) { - using warp = Pow2; + using warp = raft::Pow2; size_t row_ix = warp::div(size_t(threadIdx.x) + size_t(BlockDim) * size_t(blockIdx.x)); uint32_t i = warp::mod(threadIdx.x); if (row_ix >= n_rows) return; @@ -104,7 +104,7 @@ void copy_warped(T* out, { constexpr uint32_t kBlockDim = 128; dim3 threads(kBlockDim, 1, 1); - dim3 blocks(div_rounding_up_safe(n_rows, kBlockDim / WarpSize), 1, 1); + dim3 blocks(div_rounding_up_safe(n_rows, kBlockDim / raft::WarpSize), 1, 1); copy_warped_kernel <<>>(out, ld_out, in, ld_in, n_cols, n_rows); } @@ -126,7 +126,7 @@ inline void make_rotation_matrix(raft::resources const& handle, float* rotation_matrix, raft::random::RngState rng = raft::random::RngState(7ULL)) { - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "ivf_pq::make_rotation_matrix(%u * %u)", n_rows, n_cols); auto stream = resource::get_cuda_stream(handle); bool inplace = n_rows == n_cols; @@ -178,8 +178,8 @@ void select_residuals(raft::resources const& handle, { auto stream = resource::get_cuda_stream(handle); rmm::device_uvector tmp(size_t(n_rows) * size_t(dim), stream, device_memory); - // Note: the number of rows of the input dataset isn't actually n_rows, but matrix::gather doesn't - // need to know it, any strictly positive number would work. + // Note: the number of rows of the input dataset isn't actually n_rows, but raft::matrix::gather + // doesn't need to know it, any strictly positive number would work. cub::TransformInputIterator, const T*> mapping_itr( dataset, utils::mapping{}); raft::matrix::gather(mapping_itr, (IdxT)dim, n_rows, row_ids, n_rows, tmp.data(), stream); @@ -411,7 +411,7 @@ void train_per_subset(raft::resources const& handle, rmm::device_uvector pq_cluster_sizes(index.pq_book_size(), stream, device_memory); for (uint32_t j = 0; j < index.pq_dim(); j++) { - common::nvtx::range pq_per_subspace_scope( + raft::common::nvtx::range pq_per_subspace_scope( "ivf_pq::build::per_subspace[%u]", j); // Get the rotated cluster centers for each training vector. @@ -509,7 +509,7 @@ void train_per_cluster(raft::resources const& handle, for (uint32_t l = 0; l < index.n_lists(); l++) { auto cluster_size = cluster_sizes.data()[l]; if (cluster_size == 0) continue; - common::nvtx::range pq_per_cluster_scope( + raft::common::nvtx::range pq_per_cluster_scope( "ivf_pq::build::per_cluster[%u](size = %u)", l, cluster_size); select_residuals(handle, @@ -569,8 +569,8 @@ void train_per_cluster(raft::resources const& handle, template static __device__ auto reinterpret_vectors( raft::device_matrix_view vectors, - device_mdspan, raft::row_major> pq_centers) - -> device_mdspan, raft::row_major> + raft::device_mdspan, raft::row_major> pq_centers) + -> raft::device_mdspan, raft::row_major> { const uint32_t pq_len = pq_centers.extent(1); const uint32_t pq_dim = vectors.extent(1) / pq_len; @@ -608,7 +608,7 @@ struct unpack_codes { template __launch_bounds__(BlockSize) RAFT_KERNEL unpack_list_data_kernel( raft::device_matrix_view out_codes, - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> in_list_data, std::variant offset_or_indices) { @@ -628,7 +628,7 @@ __launch_bounds__(BlockSize) RAFT_KERNEL unpack_list_data_kernel( */ inline void unpack_list_data( raft::device_matrix_view codes, - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> list_data, std::variant offset_or_indices, uint32_t pq_bits, @@ -699,7 +699,7 @@ struct unpack_contiguous { template __launch_bounds__(BlockSize) RAFT_KERNEL unpack_contiguous_list_data_kernel( uint8_t* out_codes, - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> in_list_data, uint32_t n_rows, uint32_t pq_dim, @@ -720,7 +720,7 @@ __launch_bounds__(BlockSize) RAFT_KERNEL unpack_contiguous_list_data_kernel( */ inline void unpack_contiguous_list_data( uint8_t* codes, - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> list_data, uint32_t n_rows, uint32_t pq_dim, @@ -771,9 +771,9 @@ struct reconstruct_vectors { codebook_gen codebook_kind; uint32_t cluster_ix; uint32_t pq_len; - device_mdspan, raft::row_major> pq_centers; - device_mdspan, raft::row_major> centers_rot; - device_mdspan, raft::row_major> out_vectors; + raft::device_mdspan, raft::row_major> pq_centers; + raft::device_mdspan, raft::row_major> centers_rot; + raft::device_mdspan, raft::row_major> out_vectors; /** * Create a callable to be passed to `run_on_list`. @@ -786,7 +786,7 @@ struct reconstruct_vectors { */ __device__ inline reconstruct_vectors( raft::device_matrix_view out_vectors, - device_mdspan, raft::row_major> pq_centers, + raft::device_mdspan, raft::row_major> pq_centers, raft::device_matrix_view centers_rot, codebook_gen codebook_kind, uint32_t cluster_ix) @@ -824,9 +824,9 @@ struct reconstruct_vectors { template __launch_bounds__(BlockSize) RAFT_KERNEL reconstruct_list_data_kernel( raft::device_matrix_view out_vectors, - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> in_list_data, - device_mdspan, raft::row_major> pq_centers, + raft::device_mdspan, raft::row_major> pq_centers, raft::device_matrix_view centers_rot, codebook_gen codebook_kind, uint32_t cluster_ix, @@ -943,7 +943,8 @@ struct pass_codes { template __launch_bounds__(BlockSize) RAFT_KERNEL pack_list_data_kernel( - device_mdspan::list_extents, raft::row_major> list_data, + raft::device_mdspan::list_extents, raft::row_major> + list_data, raft::device_matrix_view codes, std::variant offset_or_indices) { @@ -963,7 +964,8 @@ __launch_bounds__(BlockSize) RAFT_KERNEL pack_list_data_kernel( * @param[in] stream */ inline void pack_list_data( - device_mdspan::list_extents, raft::row_major> list_data, + raft::device_mdspan::list_extents, raft::row_major> + list_data, raft::device_matrix_view codes, std::variant offset_or_indices, uint32_t pq_bits, @@ -1032,7 +1034,8 @@ struct pack_contiguous { template __launch_bounds__(BlockSize) RAFT_KERNEL pack_contiguous_list_data_kernel( - device_mdspan::list_extents, raft::row_major> list_data, + raft::device_mdspan::list_extents, raft::row_major> + list_data, const uint8_t* codes, uint32_t n_rows, uint32_t pq_dim, @@ -1054,7 +1057,8 @@ __launch_bounds__(BlockSize) RAFT_KERNEL pack_contiguous_list_data_kernel( * @param[in] stream */ inline void pack_contiguous_list_data( - device_mdspan::list_extents, raft::row_major> list_data, + raft::device_mdspan::list_extents, raft::row_major> + list_data, const uint8_t* codes, uint32_t n_rows, uint32_t pq_dim, @@ -1106,7 +1110,7 @@ void pack_contiguous_list_data(raft::resources const& res, * * @tparam SubWarpSize * how many threads work on a single vector; - * bounded by either WarpSize or pq_book_size. + * bounded by either raft::WarpSize or pq_book_size. * * @param pq_centers * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size] @@ -1124,11 +1128,11 @@ template struct encode_vectors { codebook_gen codebook_kind; uint32_t cluster_ix; - device_mdspan, raft::row_major> pq_centers; - device_mdspan, raft::row_major> in_vectors; + raft::device_mdspan, raft::row_major> pq_centers; + raft::device_mdspan, raft::row_major> in_vectors; __device__ inline encode_vectors( - device_mdspan, raft::row_major> pq_centers, + raft::device_mdspan, raft::row_major> pq_centers, raft::device_matrix_view in_vectors, codebook_gen codebook_kind, uint32_t cluster_ix) @@ -1145,7 +1149,7 @@ struct encode_vectors { */ __device__ inline auto operator()(IdxT i, uint32_t j) -> uint8_t { - uint32_t lane_id = Pow2::mod(laneId()); + uint32_t lane_id = raft::Pow2::mod(laneId()); uint32_t partition_ix; switch (codebook_kind) { case codebook_gen::PER_CLUSTER: { @@ -1196,11 +1200,11 @@ __launch_bounds__(BlockSize) RAFT_KERNEL process_and_fill_codes_kernel( raft::device_vector_view list_sizes, raft::device_vector_view inds_ptrs, raft::device_vector_view data_ptrs, - device_mdspan, raft::row_major> pq_centers, + raft::device_mdspan, raft::row_major> pq_centers, codebook_gen codebook_kind) { constexpr uint32_t kSubWarpSize = std::min(WarpSize, 1u << PqBits); - using subwarp_align = Pow2; + using subwarp_align = raft::Pow2; const uint32_t lane_id = subwarp_align::mod(threadIdx.x); const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{BlockSize} * IdxT{blockIdx.x}); if (row_ix >= new_vectors.extent(0)) { return; } @@ -1208,7 +1212,7 @@ __launch_bounds__(BlockSize) RAFT_KERNEL process_and_fill_codes_kernel( const uint32_t cluster_ix = new_labels[row_ix]; uint32_t out_ix; if (lane_id == 0) { out_ix = atomicAdd(&list_sizes(cluster_ix), 1); } - out_ix = shfl(out_ix, 0, kSubWarpSize); + out_ix = raft::shfl(out_ix, 0, kSubWarpSize); // write the label (one record per subwarp) auto pq_indices = inds_ptrs(cluster_ix); @@ -1235,9 +1239,10 @@ __launch_bounds__(BlockSize) RAFT_KERNEL process_and_fill_codes_kernel( template __launch_bounds__(BlockSize) RAFT_KERNEL encode_list_data_kernel( - device_mdspan::list_extents, raft::row_major> list_data, + raft::device_mdspan::list_extents, raft::row_major> + list_data, raft::device_matrix_view new_vectors, - device_mdspan, raft::row_major> pq_centers, + raft::device_mdspan, raft::row_major> pq_centers, codebook_gen codebook_kind, uint32_t cluster_ix, std::variant offset_or_indices) @@ -1564,7 +1569,7 @@ void extend(raft::resources const& handle, const IdxT* new_indices, IdxT n_rows) { - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "ivf_pq::extend(%zu, %u)", size_t(n_rows), index->dim()); resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::extend"); @@ -1764,7 +1769,7 @@ auto build(raft::resources const& handle, IdxT n_rows, uint32_t dim) -> index { - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "ivf_pq::build(%zu, %u)", size_t(n_rows), dim); resource::detail::warn_non_pool_workspace(handle, "raft::ivf_pq::build"); static_assert(std::is_same_v || std::is_same_v || std::is_same_v, diff --git a/cpp/include/cuvs/neighbors/detail/ivf_pq_codepacking.cuh b/cpp/include/cuvs/neighbors/detail/ivf_pq_codepacking.cuh index 3ae826424..bbd47baa0 100644 --- a/cpp/include/cuvs/neighbors/detail/ivf_pq_codepacking.cuh +++ b/cpp/include/cuvs/neighbors/detail/ivf_pq_codepacking.cuh @@ -31,7 +31,7 @@ namespace cuvs::neighbors::ivf_pq::detail { /** A chunk of PQ-encoded vector managed by one CUDA thread. */ -using pq_vec_t = TxN_t::io_t; +using pq_vec_t = raft::TxN_t::io_t; /** * This type mimics the `uint8_t&` for the indexing operator of `bitfield_view_t`. @@ -81,7 +81,8 @@ struct bitfield_view_t { constexpr auto operator[](uint32_t i) -> bitfield_ref_t { uint32_t bit_offset = i * Bits; - return bitfield_ref_t{raw + Pow2<8>::div(bit_offset), Pow2<8>::mod(bit_offset)}; + return bitfield_ref_t{raw + raft::Pow2<8>::div(bit_offset), + raft::Pow2<8>::mod(bit_offset)}; } }; @@ -100,14 +101,14 @@ struct bitfield_view_t { */ template __device__ void run_on_vector( - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> in_list_data, uint32_t in_ix, uint32_t out_ix, uint32_t pq_dim, Action action) { - using group_align = Pow2; + using group_align = raft::Pow2; const uint32_t group_ix = group_align::div(in_ix); const uint32_t ingroup_ix = group_align::mod(in_ix); @@ -143,16 +144,16 @@ __device__ void run_on_vector( */ template __device__ void write_vector( - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> out_list_data, uint32_t out_ix, IdxT in_ix, uint32_t pq_dim, Action action) { - const uint32_t lane_id = Pow2::mod(threadIdx.x); + const uint32_t lane_id = raft::Pow2::mod(threadIdx.x); - using group_align = Pow2; + using group_align = raft::Pow2; const uint32_t group_ix = group_align::div(out_ix); const uint32_t ingroup_ix = group_align::mod(out_ix); @@ -179,7 +180,7 @@ __device__ void write_vector( /** Process the given indices or a block of a single list (cluster). */ template __device__ void run_on_list( - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> in_list_data, std::variant offset_or_indices, uint32_t len, @@ -197,14 +198,14 @@ __device__ void run_on_list( /** Process the given indices or a block of a single list (cluster). */ template __device__ void write_list( - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> out_list_data, std::variant offset_or_indices, uint32_t len, uint32_t pq_dim, Action action) { - using subwarp_align = Pow2; + using subwarp_align = raft::Pow2; uint32_t stride = subwarp_align::div(blockDim.x); uint32_t ix = subwarp_align::div(threadIdx.x + blockDim.x * blockIdx.x); for (; ix < len; ix += stride) { diff --git a/cpp/include/cuvs/neighbors/detail/ivf_pq_compute_similarity-inl.cuh b/cpp/include/cuvs/neighbors/detail/ivf_pq_compute_similarity-inl.cuh index 7526a8052..c5c1be45c 100644 --- a/cpp/include/cuvs/neighbors/detail/ivf_pq_compute_similarity-inl.cuh +++ b/cpp/include/cuvs/neighbors/detail/ivf_pq_compute_similarity-inl.cuh @@ -20,7 +20,7 @@ #include // dummy_block_sort_t #include // codebook_gen #include // none_ivf_sample_filter -#include // matrix::detail::select::warpsort::warp_sort_distributed +#include // raft::matrix::detail::select::warpsort::warp_sort_distributed #include // RAFT_CUDA_TRY #include // raft::atomicMin #include // raft::Pow2 @@ -37,7 +37,7 @@ namespace cuvs::neighbors::ivf_pq::detail { */ static constexpr int kMaxCapacity = 128; static_assert((kMaxCapacity >= 32) && !(kMaxCapacity & (kMaxCapacity - 1)), - "kMaxCapacity must be a power of two, not smaller than the WarpSize."); + "kMaxCapacity must be a power of two, not smaller than the raft::WarpSize."); // using weak attribute here, because it may be compiled multiple times. auto RAFT_WEAK_FUNCTION is_local_topk_feasible(uint32_t k, uint32_t n_probes, uint32_t n_queries) @@ -50,8 +50,8 @@ auto RAFT_WEAK_FUNCTION is_local_topk_feasible(uint32_t k, uint32_t n_probes, ui template struct pq_block_sort { - using type = matrix::detail::select::warpsort::block_sort< - matrix::detail::select::warpsort::warp_sort_distributed_ext, + using type = raft::matrix::detail::select::warpsort::block_sort< + raft::matrix::detail::select::warpsort::warp_sort_distributed_ext, Capacity, true, T, @@ -104,7 +104,7 @@ constexpr inline auto estimate_carveout(double shmem_fraction, size_t shmem_per_block, const cudaDeviceProp& dev_props) -> int { - using shmem_unit = Pow2<128>; + using shmem_unit = raft::Pow2<128>; size_t m = shmem_unit::roundUp(shmem_per_block); size_t r = dev_props.reservedSharedMemPerBlock; size_t s = dev_props.sharedMemPerMultiprocessor; @@ -437,11 +437,11 @@ RAFT_KERNEL compute_similarity_kernel(uint32_t dim, // Then, such a chunk contains `chunk_size = 128 / pq_bits` record elements, and the record // consists of `ceildiv(pq_dim, chunk_size)` chunks. The chunks are interleaved in groups of 32, // so that the warp can achieve the best coalesced read throughput. - using group_align = Pow2; - using vec_align = Pow2; + using group_align = raft::Pow2; + using vec_align = raft::Pow2; using local_topk_t = block_sort_t; using op_t = uint32_t; - using vec_t = TxN_t; + using vec_t = raft::TxN_t; uint32_t sample_offset = 0; if (probe_ix > 0) { sample_offset = chunk_indices[probe_ix - 1]; } @@ -453,7 +453,7 @@ RAFT_KERNEL compute_similarity_kernel(uint32_t dim, group_align::mod(threadIdx.x) * vec_align::Value; pq_line_width *= blockDim.x; - constexpr OutT kDummy = upper_bound(); + constexpr OutT kDummy = raft::upper_bound(); OutT query_kth = kDummy; if constexpr (kManageLocalTopK) { query_kth = OutT(query_kths[query_ix]); } OutT early_stop_limit = kDummy; @@ -585,7 +585,7 @@ auto get_compute_similarity_kernel(uint32_t pq_bits, uint32_t k_max) /** Estimate the occupancy for the given kernel on the given device. */ template struct occupancy_t { - using shmem_unit = Pow2<128>; + using shmem_unit = raft::Pow2<128>; int blocks_per_sm = 0; double occupancy = 0.0; @@ -725,7 +725,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, ltk_reduce_mem_t(bool manage_local_topk, uint32_t topk) : manage_local_topk(manage_local_topk), topk(topk) { - subwarp_size = WarpSize; + subwarp_size = raft::WarpSize; while (topk * 2 <= subwarp_size) { subwarp_size /= 2; } @@ -733,11 +733,10 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, [[nodiscard]] auto operator()(uint32_t n_threads) const -> size_t { - return manage_local_topk - ? matrix::detail::select::warpsort::template calc_smem_size_for_block_wide( - n_threads / subwarp_size, topk) - : 0; + return manage_local_topk ? raft::matrix::detail::select::warpsort:: + template calc_smem_size_for_block_wide( + n_threads / subwarp_size, topk) + : 0; } } ltk_reduce_mem{manage_local_topk, topk}; @@ -760,7 +759,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, // 1. It's a power-of-two for efficient L1 caching of pq_centers values // (multiples of `1 << pq_bits`). // 2. It should be large enough to fully utilize an SM. - uint32_t n_threads_min = WarpSize; + uint32_t n_threads_min = raft::WarpSize; while (dev_props.maxBlocksPerMultiProcessor * int(n_threads_min) < dev_props.maxThreadsPerMultiProcessor) { n_threads_min *= 2; @@ -782,7 +781,7 @@ auto compute_similarity_select(const cudaDeviceProp& dev_props, // Granularity of changing the number of threads when computing the maximum block size. // It's good to have it multiple of the PQ book width. - uint32_t n_threads_gty = round_up_safe(1u << pq_bits, WarpSize); + uint32_t n_threads_gty = raft::round_up_safe(1u << pq_bits, raft::WarpSize); /* Shared memory / L1 cache balance is the main limiter of this kernel. diff --git a/cpp/include/cuvs/neighbors/detail/ivf_pq_dummy_block_sort.cuh b/cpp/include/cuvs/neighbors/detail/ivf_pq_dummy_block_sort.cuh index 34273af26..8732aed3e 100644 --- a/cpp/include/cuvs/neighbors/detail/ivf_pq_dummy_block_sort.cuh +++ b/cpp/include/cuvs/neighbors/detail/ivf_pq_dummy_block_sort.cuh @@ -16,7 +16,7 @@ #pragma once -#include // matrix::detail::select::warpsort::warp_sort_distributed +#include // raft::matrix::detail::select::warpsort::warp_sort_distributed /* * This header file is a bit of an ugly duckling. The type dummy_block_sort is @@ -31,7 +31,8 @@ namespace cuvs::neighbors::ivf_pq::detail { template struct dummy_block_sort_t { - using queue_t = matrix::detail::select::warpsort::warp_sort_distributed; + using queue_t = + raft::matrix::detail::select::warpsort::warp_sort_distributed; template __device__ dummy_block_sort_t(int k, Args...){}; }; diff --git a/cpp/include/cuvs/neighbors/detail/ivf_pq_search.cuh b/cpp/include/cuvs/neighbors/detail/ivf_pq_search.cuh index dbaf36adf..fa6f64c7b 100644 --- a/cpp/include/cuvs/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/cuvs/neighbors/detail/ivf_pq_search.cuh @@ -153,16 +153,16 @@ void select_clusters(raft::resources const& handle, // Select neighbor clusters for each query. rmm::device_uvector cluster_dists(n_queries * n_probes, stream, mr); - matrix::detail::select_k(handle, - qc_distances.data(), - nullptr, - n_queries, - n_lists, - n_probes, - cluster_dists.data(), - clusters_to_probe, - true, - mr); + raft::matrix::detail::select_k(handle, + qc_distances.data(), + nullptr, + n_queries, + n_lists, + n_probes, + cluster_dists.data(), + clusters_to_probe, + true, + mr); } /** @@ -187,7 +187,7 @@ __launch_bounds__(BlockDim) RAFT_KERNEL chunk_indices += n_probes * blockIdx.x; // block scan - const uint32_t n_probes_aligned = Pow2::roundUp(n_probes); + const uint32_t n_probes_aligned = raft::Pow2::roundUp(n_probes); uint32_t total = 0; for (uint32_t probe_ix = threadIdx.x; probe_ix < n_probes_aligned; probe_ix += BlockDim) { auto label = probe_ix < n_probes ? clusters_to_probe[probe_ix] : 0u; @@ -230,7 +230,7 @@ struct calc_chunk_indices { template static auto try_block_dim(uint32_t n_probes, uint32_t n_queries) -> configured { - if constexpr (BlockDim >= WarpSize * 2) { + if constexpr (BlockDim >= raft::WarpSize * 2) { if (BlockDim >= n_probes * 2) { return try_block_dim<(BlockDim / 2)>(n_probes, n_queries); } } return {reinterpret_cast(calc_chunk_indices_kernel), @@ -584,16 +584,16 @@ void ivfpq_search_worker(raft::resources const& handle, // Select topk vectors for each query rmm::device_uvector topk_dists(n_queries * topK, stream, mr); - matrix::detail::select_k(handle, - distances_buf.data(), - neighbors_ptr, - n_queries, - topk_len, - topK, - topk_dists.data(), - neighbors_uint32, - true, - mr); + raft::matrix::detail::select_k(handle, + distances_buf.data(), + neighbors_ptr, + n_queries, + topk_len, + topK, + topk_dists.data(), + neighbors_uint32, + true, + mr); // Postprocessing postprocess_distances( @@ -741,7 +741,7 @@ inline void search(raft::resources const& handle, { static_assert(std::is_same_v || std::is_same_v || std::is_same_v, "Unsupported element type."); - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "ivf_pq::search(n_queries = %u, n_probes = %u, k = %u, dim = %zu)", n_queries, params.n_probes, @@ -778,7 +778,7 @@ inline void search(raft::resources const& handle, uint32_t max_samples = 0; { - IdxT ms = Pow2<128>::roundUp(index.accum_sorted_sizes()(n_probes)); + IdxT ms = raft::Pow2<128>::roundUp(index.accum_sorted_sizes()(n_probes)); RAFT_EXPECTS(ms <= IdxT(std::numeric_limits::max()), "The maximum sample size is too big."); max_samples = ms; diff --git a/cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh b/cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh index 555f7b258..00610c45e 100644 --- a/cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh +++ b/cpp/include/cuvs/neighbors/detail/knn_merge_parts.cuh @@ -41,7 +41,7 @@ RAFT_KERNEL knn_merge_parts_kernel(const value_t* inK, int k, value_idx* translations) { - constexpr int kNumWarps = tpb / WarpSize; + constexpr int kNumWarps = tpb / raft::WarpSize; __shared__ value_t smemK[kNumWarps * warp_q]; __shared__ value_idx smemV[kNumWarps * warp_q]; diff --git a/cpp/include/cuvs/neighbors/detail/refine_device.cuh b/cpp/include/cuvs/neighbors/detail/refine_device.cuh index 61d7a6ecc..5bc478702 100644 --- a/cpp/include/cuvs/neighbors/detail/refine_device.cuh +++ b/cpp/include/cuvs/neighbors/detail/refine_device.cuh @@ -55,7 +55,7 @@ void refine_device( "k must be lest than topk::kMaxCapacity (%d).", raft::matrix::detail::select::warpsort::kMaxCapacity); - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "neighbors::refine(%zu, %u)", size_t(n_queries), uint32_t(n_candidates)); refine_check_input(dataset.extents(), @@ -74,7 +74,7 @@ void refine_device( // - We run IVF flat search with n_probes=1 to select the best k elements of the candidates. rmm::device_uvector fake_coarse_idx(n_queries, resource::get_cuda_stream(handle)); - thrust::sequence(resource::get_thrust_policy(handle), + thrust::sequence(raft::resource::get_thrust_policy(handle), fake_coarse_idx.data(), fake_coarse_idx.data() + n_queries); diff --git a/cpp/include/cuvs/neighbors/detail/refine_host-inl.hpp b/cpp/include/cuvs/neighbors/detail/refine_host-inl.hpp index ec830e58c..c753e56f7 100644 --- a/cpp/include/cuvs/neighbors/detail/refine_host-inl.hpp +++ b/cpp/include/cuvs/neighbors/detail/refine_host-inl.hpp @@ -39,7 +39,7 @@ template fun_scope( + raft::common::nvtx::range fun_scope( "neighbors::refine_host(%zu, %zu -> %zu)", n_queries, orig_k, refined_k); auto suggested_n_threads = std::max(1, std::min(omp_get_num_procs(), omp_get_max_threads())); diff --git a/cpp/include/cuvs/neighbors/detail/selection_faiss-inl.cuh b/cpp/include/cuvs/neighbors/detail/selection_faiss-inl.cuh index acc931e4a..f10339485 100644 --- a/cpp/include/cuvs/neighbors/detail/selection_faiss-inl.cuh +++ b/cpp/include/cuvs/neighbors/detail/selection_faiss-inl.cuh @@ -35,7 +35,7 @@ RAFT_KERNEL select_k_kernel(const key_t* inK, payload_t initV, int k) { - using align_warp = Pow2; + using align_warp = raft::Pow2; constexpr int kNumWarps = align_warp::div(tpb); __shared__ key_t smemK[kNumWarps * warp_q]; @@ -93,7 +93,7 @@ inline void select_k_impl(const key_t* inK, constexpr int n_threads = (warp_q <= 1024) ? 128 : 64; auto block = dim3(n_threads); - auto kInit = select_min ? upper_bound() : lower_bound(); + auto kInit = select_min ? raft::upper_bound() : lower_bound(); auto vInit = -1; if (select_min) { select_k_kernel diff --git a/cpp/include/cuvs/neighbors/ivf_flat_helpers.cuh b/cpp/include/cuvs/neighbors/ivf_flat_helpers.cuh index 25b84ec55..cca83cea0 100644 --- a/cpp/include/cuvs/neighbors/ivf_flat_helpers.cuh +++ b/cpp/include/cuvs/neighbors/ivf_flat_helpers.cuh @@ -64,7 +64,8 @@ void pack( raft::device_matrix_view codes, uint32_t veclen, uint32_t offset, - device_mdspan::list_extents, raft::row_major> list_data) + raft::device_mdspan::list_extents, raft::row_major> + list_data) { cuvs::neighbors::ivf_flat::detail::pack_list_data(res, codes, veclen, offset, list_data); } @@ -100,7 +101,7 @@ void pack( template void unpack( raft::resources const& res, - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> list_data, uint32_t veclen, uint32_t offset, diff --git a/cpp/include/cuvs/neighbors/ivf_flat_types.hpp b/cpp/include/cuvs/neighbors/ivf_flat_types.hpp index 4863805fa..28023f474 100644 --- a/cpp/include/cuvs/neighbors/ivf_flat_types.hpp +++ b/cpp/include/cuvs/neighbors/ivf_flat_types.hpp @@ -323,7 +323,7 @@ struct index : ann::index { copy(&this_inds_ptrs(label), &inds_ptr, 1, stream); } auto this_list_sizes = list_sizes().data_handle(); - total_size_ = thrust::reduce(resource::get_thrust_policy(res), + total_size_ = thrust::reduce(raft::resource::get_thrust_policy(res), this_list_sizes, this_list_sizes + this_lists.size(), 0, diff --git a/cpp/include/cuvs/neighbors/ivf_list.hpp b/cpp/include/cuvs/neighbors/ivf_list.hpp index 316403893..c395980de 100644 --- a/cpp/include/cuvs/neighbors/ivf_list.hpp +++ b/cpp/include/cuvs/neighbors/ivf_list.hpp @@ -45,7 +45,7 @@ list::list(raft::resources const& res, size_type n_rows) : size{n_rows}, data{res}, indices{res} { - auto capacity = round_up_safe(n_rows, spec.align_max); + auto capacity = raft::round_up_safe(n_rows, spec.align_max); if (n_rows < spec.align_max) { capacity = bound_by_power_of_two(std::max(n_rows, spec.align_min)); capacity = std::min(capacity, spec.align_max); @@ -63,7 +63,7 @@ list::list(raft::resources const& res, e.what()); } // Fill the index buffer with a pre-defined marker for easier debugging - thrust::fill_n(resource::get_thrust_policy(res), + thrust::fill_n(raft::resource::get_thrust_policy(res), indices.data_handle(), indices.size(), ivf::kInvalidRecord); diff --git a/cpp/include/cuvs/neighbors/ivf_pq_helpers.cuh b/cpp/include/cuvs/neighbors/ivf_pq_helpers.cuh index 29c9f164b..f02148101 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq_helpers.cuh +++ b/cpp/include/cuvs/neighbors/ivf_pq_helpers.cuh @@ -66,7 +66,7 @@ namespace codepacker { */ inline void unpack( raft::resources const& res, - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> list_data, uint32_t pq_bits, uint32_t offset, @@ -112,7 +112,7 @@ inline void unpack( */ inline void unpack_contiguous( raft::resources const& res, - device_mdspan::list_extents, raft::row_major> + raft::device_mdspan::list_extents, raft::row_major> list_data, uint32_t pq_bits, uint32_t offset, @@ -151,7 +151,8 @@ inline void pack( raft::device_matrix_view codes, uint32_t pq_bits, uint32_t offset, - device_mdspan::list_extents, raft::row_major> list_data) + raft::device_mdspan::list_extents, raft::row_major> + list_data) { ivf_pq::detail::pack_list_data(list_data, codes, offset, pq_bits, resource::get_cuda_stream(res)); } @@ -191,7 +192,8 @@ inline void pack_contiguous( uint32_t pq_dim, uint32_t pq_bits, uint32_t offset, - device_mdspan::list_extents, raft::row_major> list_data) + raft::device_mdspan::list_extents, raft::row_major> + list_data) { ivf_pq::detail::pack_contiguous_list_data( list_data, codes, n_rows, pq_dim, offset, pq_bits, resource::get_cuda_stream(res)); diff --git a/cpp/include/cuvs/neighbors/ivf_pq_types.hpp b/cpp/include/cuvs/neighbors/ivf_pq_types.hpp index e89e11712..16a904fcc 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq_types.hpp +++ b/cpp/include/cuvs/neighbors/ivf_pq_types.hpp @@ -175,7 +175,7 @@ struct list_spec { * ]. */ using list_extents = - extents; + extents; SizeT align_max; SizeT align_min; @@ -374,20 +374,21 @@ struct index : ann::index { { } - using pq_centers_extents = - std::experimental::extents; + using pq_centers_extents = std::experimental:: + extents; /** * PQ cluster centers * * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size] * - codebook_gen::PER_CLUSTER: [n_lists, pq_len, pq_book_size] */ - inline auto pq_centers() noexcept -> device_mdspan + inline auto pq_centers() noexcept + -> raft::device_mdspan { return pq_centers_.view(); } [[nodiscard]] inline auto pq_centers() const noexcept - -> device_mdspan + -> raft::device_mdspan { return pq_centers_.view(); } @@ -445,12 +446,13 @@ struct index : ann::index { * * This span is used during search to estimate the maximum size of the workspace. */ - inline auto accum_sorted_sizes() noexcept -> host_vector_view + inline auto accum_sorted_sizes() noexcept + -> raft::host_vector_view { return accum_sorted_sizes_.view(); } [[nodiscard]] inline auto accum_sorted_sizes() const noexcept - -> host_vector_view + -> raft::host_vector_view { return accum_sorted_sizes_.view(); } @@ -523,7 +525,7 @@ struct index : ann::index { // Primary data members std::vector>> lists_; raft::device_vector list_sizes_; - device_mdarray pq_centers_; + raft::device_mdarray pq_centers_; raft::device_matrix centers_; raft::device_matrix centers_rot_; raft::device_matrix rotation_matrix_; @@ -531,7 +533,7 @@ struct index : ann::index { // Computed members for accelerating search. raft::device_vector data_ptrs_; raft::device_vector inds_ptrs_; - host_vector accum_sorted_sizes_; + raft::host_vector accum_sorted_sizes_; /** Throw an error if the index content is inconsistent. */ void check_consistency() diff --git a/cpp/include/cuvs/spatial/knn/ann.cuh b/cpp/include/cuvs/spatial/knn/ann.cuh index e19eba6a4..99f5f12eb 100644 --- a/cpp/include/cuvs/spatial/knn/ann.cuh +++ b/cpp/include/cuvs/spatial/knn/ann.cuh @@ -47,7 +47,7 @@ approx_knn_build_index(raft::resources& handle, value_idx n, value_idx D) { - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "legacy approx_knn_build_index(n_rows = %u, dim = %u)", n, D); detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D); } @@ -75,7 +75,7 @@ approx_knn_search(raft::resources& handle, T* query_array, value_idx n) { - common::nvtx::range fun_scope( + raft::common::nvtx::range fun_scope( "legacy approx_knn_search(k = %u, n_queries = %u)", k, n); detail::approx_knn_search(handle, distances, indices, index, k, query_array, n); } diff --git a/cpp/include/cuvs/spatial/knn/detail/ball_cover.cuh b/cpp/include/cuvs/spatial/knn/detail/ball_cover.cuh index 1291b160c..f467600dd 100644 --- a/cpp/include/cuvs/spatial/knn/detail/ball_cover.cuh +++ b/cpp/include/cuvs/spatial/knn/detail/ball_cover.cuh @@ -72,17 +72,17 @@ void sample_landmarks(raft::resources const& handle, rmm::device_uvector R_1nn_ones(index.m, resource::get_cuda_stream(handle)); rmm::device_uvector R_indices(index.n_landmarks, resource::get_cuda_stream(handle)); - thrust::sequence(resource::get_thrust_policy(handle), + thrust::sequence(raft::resource::get_thrust_policy(handle), index.get_R_1nn_cols().data_handle(), index.get_R_1nn_cols().data_handle() + index.m, (value_idx)0); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), R_1nn_ones.data(), R_1nn_ones.data() + R_1nn_ones.size(), 1.0); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), R_indices.data(), R_indices.data() + R_indices.size(), 0.0); @@ -131,7 +131,7 @@ void construct_landmark_1nn(raft::resources const& handle, { rmm::device_uvector R_1nn_inds(index.m, resource::get_cuda_stream(handle)); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), R_1nn_inds.data(), R_1nn_inds.data() + index.m, std::numeric_limits::max()); @@ -150,7 +150,7 @@ void construct_landmark_1nn(raft::resources const& handle, thrust::make_tuple(R_1nn_inds.data(), index.get_R_1nn_dists().data_handle())); // group neighborhoods for each reference landmark and sort each group by distance - thrust::sort_by_key(resource::get_thrust_policy(handle), + thrust::sort_by_key(raft::resource::get_thrust_policy(handle), keys, keys + index.m, index.get_R_1nn_cols().data_handle(), @@ -214,7 +214,7 @@ void compute_landmark_radii(raft::resources const& handle, const value_idx* R_indptr_ptr = index.get_R_indptr().data_handle(); const value_t* R_1nn_dists_ptr = index.get_R_1nn_dists().data_handle(); value_t* R_radius_ptr = index.get_R_radius().data_handle(); - thrust::for_each(resource::get_thrust_policy(handle), + thrust::for_each(raft::resource::get_thrust_policy(handle), entries, entries + index.n_landmarks, [=] __device__(value_idx input) { @@ -253,11 +253,11 @@ void perform_rbc_query(raft::resources const& handle, bool perform_post_filtering = true) { // initialize output inds and dists - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), inds, inds + (k * n_query_pts), std::numeric_limits::max()); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), dists, dists + (k * n_query_pts), std::numeric_limits::max()); @@ -349,11 +349,11 @@ void rbc_build_index(raft::resources const& handle, rmm::device_uvector R_knn_inds(index.m, resource::get_cuda_stream(handle)); // Initialize the uvectors - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), R_knn_inds.begin(), R_knn_inds.end(), std::numeric_limits::max()); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), index.get_R_closest_landmark_dists().data_handle(), index.get_R_closest_landmark_dists().data_handle() + index.m, std::numeric_limits::max()); @@ -416,20 +416,20 @@ void rbc_all_knn_query(raft::resources const& handle, rmm::device_uvector R_knn_dists(k * index.m, raft::resource::get_cuda_stream(handle)); // Initialize the uvectors - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), R_knn_inds.begin(), R_knn_inds.end(), std::numeric_limits::max()); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), R_knn_dists.begin(), R_knn_dists.end(), std::numeric_limits::max()); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), inds, inds + (k * index.m), std::numeric_limits::max()); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), dists, dists + (k * index.m), std::numeric_limits::max()); @@ -494,20 +494,20 @@ void rbc_knn_query(raft::resources const& handle, raft::resource::get_cuda_stream(handle)); // Initialize the uvectors - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), R_knn_inds.begin(), R_knn_inds.end(), std::numeric_limits::max()); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), R_knn_dists.begin(), R_knn_dists.end(), std::numeric_limits::max()); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), inds, inds + (k * n_query_pts), std::numeric_limits::max()); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), dists, dists + (k * n_query_pts), std::numeric_limits::max()); @@ -518,11 +518,11 @@ void rbc_knn_query(raft::resources const& handle, rmm::device_uvector dists_counter(index.m, raft::resource::get_cuda_stream(handle)); rmm::device_uvector post_dists_counter(index.m, raft::resource::get_cuda_stream(handle)); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), post_dists_counter.data(), post_dists_counter.data() + post_dists_counter.size(), 0); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), dists_counter.data(), dists_counter.data() + dists_counter.size(), 0); diff --git a/cpp/include/cuvs/spatial/knn/detail/ball_cover/registers-inl.cuh b/cpp/include/cuvs/spatial/knn/detail/ball_cover/registers-inl.cuh index 95a70ba30..f1cb45e97 100644 --- a/cpp/include/cuvs/spatial/knn/detail/ball_cover/registers-inl.cuh +++ b/cpp/include/cuvs/spatial/knn/detail/ball_cover/registers-inl.cuh @@ -171,7 +171,7 @@ RAFT_KERNEL compute_final_dists_registers(const value_t* X_index, dist_func dfunc, value_int* dist_counter) { - static constexpr int kNumWarps = tpb / WarpSize; + static constexpr int kNumWarps = tpb / raft::WarpSize; __shared__ value_t shared_memK[kNumWarps * warp_q]; __shared__ KeyValuePair shared_memV[kNumWarps * warp_q]; @@ -191,7 +191,7 @@ RAFT_KERNEL compute_final_dists_registers(const value_t* X_index, shared_memV, k); - const value_int n_k = Pow2::roundDown(k); + const value_int n_k = raft::Pow2::roundDown(k); value_int i = threadIdx.x; for (; i < n_k; i += tpb) { value_idx ind = knn_inds[blockIdx.x * k + i]; @@ -218,7 +218,7 @@ RAFT_KERNEL compute_final_dists_registers(const value_t* X_index, // Round R_size to the nearest warp threads so they can // all be computing in parallel. - const value_int limit = Pow2::roundDown(R_size); + const value_int limit = raft::Pow2::roundDown(R_size); i = threadIdx.x; for (; i < limit; i += tpb) { @@ -328,7 +328,7 @@ RAFT_KERNEL block_rbc_kernel_registers(const value_t* X_index, distance_func dfunc, float weight = 1.0) { - static constexpr value_int kNumWarps = tpb / WarpSize; + static constexpr value_int kNumWarps = tpb / raft::WarpSize; __shared__ value_t shared_memK[kNumWarps * warp_q]; __shared__ KeyValuePair shared_memV[kNumWarps * warp_q]; @@ -379,7 +379,7 @@ RAFT_KERNEL block_rbc_kernel_registers(const value_t* X_index, value_idx R_size = R_stop_offset - R_start_offset; - value_int limit = Pow2::roundDown(R_size); + value_int limit = raft::Pow2::roundDown(R_size); value_int i = threadIdx.x; for (; i < limit; i += tpb) { // Index and distance of current candidate's nearest landmark diff --git a/cpp/include/cuvs/spatial/knn/knn.cuh b/cpp/include/cuvs/spatial/knn/knn.cuh index f73268d5b..f6267feb5 100644 --- a/cpp/include/cuvs/spatial/knn/knn.cuh +++ b/cpp/include/cuvs/spatial/knn/knn.cuh @@ -134,12 +134,13 @@ template cudaStream_t stream, SelectKAlgo algo = SelectKAlgo::FAISS) { - common::nvtx::range fun_scope("select-%s-%d (%zu, %zu) algo-%d", - select_min ? "min" : "max", - k, - n_inputs, - input_len, - int(algo)); + raft::common::nvtx::range fun_scope( + "select-%s-%d (%zu, %zu) algo-%d", + select_min ? "min" : "max", + k, + n_inputs, + input_len, + int(algo)); ASSERT(size_t(input_len) >= size_t(k), "Size of the input (input_len = %zu) must be not smaller than the selection (k = %zu).", size_t(input_len), @@ -152,17 +153,17 @@ template break; case SelectKAlgo::RADIX_8_BITS: - matrix::detail::select::radix::select_k( + raft::matrix::detail::select::radix::select_k( in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, true, stream); break; case SelectKAlgo::RADIX_11_BITS: - matrix::detail::select::radix::select_k( + raft::matrix::detail::select::radix::select_k( in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, true, stream); break; case SelectKAlgo::WARP_SORT: - matrix::detail::select::warpsort::select_k( + raft::matrix::detail::select::warpsort::select_k( in_keys, in_values, n_inputs, input_len, k, out_keys, out_values, select_min, stream); break; diff --git a/cpp/include/cuvs/spectral/cluster_solvers.cuh b/cpp/include/cuvs/spectral/cluster_solvers.cuh index 760f55491..63859adb1 100644 --- a/cpp/include/cuvs/spectral/cluster_solvers.cuh +++ b/cpp/include/cuvs/spectral/cluster_solvers.cuh @@ -69,7 +69,7 @@ struct kmeans_solver_t { auto centroids = raft::make_device_matrix(handle, config_.n_clusters, dim); auto weight = raft::make_device_vector(handle, n_obs_vecs); - thrust::fill(resource::get_thrust_policy(handle), + thrust::fill(raft::resource::get_thrust_policy(handle), weight.data_handle(), weight.data_handle() + n_obs_vecs, 1); diff --git a/cpp/include/cuvs/spectral/detail/matrix_wrappers.hpp b/cpp/include/cuvs/spectral/detail/matrix_wrappers.hpp index 3cd3eec0b..ebdb9835a 100644 --- a/cpp/include/cuvs/spectral/detail/matrix_wrappers.hpp +++ b/cpp/include/cuvs/spectral/detail/matrix_wrappers.hpp @@ -93,9 +93,9 @@ struct vector_view_t { template class vector_t { public: - vector_t(resources const& raft_handle, size_type sz) + vector_t(raft::resources const& raft_handle, size_type sz) : buffer_(sz, resource::get_cuda_stream(raft_handle)), - thrust_policy(resource::get_thrust_policy(raft_handle)) + thrust_policy(raft::resource::get_thrust_policy(raft_handle)) { } @@ -133,7 +133,7 @@ class vector_t { template struct sparse_matrix_t { - sparse_matrix_t(resources const& raft_handle, + sparse_matrix_t(raft::resources const& raft_handle, index_type const* row_offsets, index_type const* col_indices, value_type const* values, @@ -150,7 +150,7 @@ struct sparse_matrix_t { { } - sparse_matrix_t(resources const& raft_handle, + sparse_matrix_t(raft::resources const& raft_handle, index_type const* row_offsets, index_type const* col_indices, value_type const* values, @@ -167,7 +167,7 @@ struct sparse_matrix_t { } template - sparse_matrix_t(resources const& raft_handle, CSRView const& csr_view) + sparse_matrix_t(raft::resources const& raft_handle, CSRView const& csr_view) : handle_(raft_handle), row_offsets_(csr_view.offsets), col_indices_(csr_view.indices), @@ -313,7 +313,7 @@ struct sparse_matrix_t { template struct laplacian_matrix_t : sparse_matrix_t { - laplacian_matrix_t(resources const& raft_handle, + laplacian_matrix_t(raft::resources const& raft_handle, index_type const* row_offsets, index_type const* col_indices, value_type const* values, @@ -328,7 +328,7 @@ struct laplacian_matrix_t : sparse_matrix_t { sparse_matrix_t::mv(1, ones.raw(), 0, diagonal_.raw()); } - laplacian_matrix_t(resources const& raft_handle, + laplacian_matrix_t(raft::resources const& raft_handle, sparse_matrix_t const& csr_m) : sparse_matrix_t(raft_handle, csr_m.row_offsets_, @@ -387,7 +387,7 @@ struct laplacian_matrix_t : sparse_matrix_t { template struct modularity_matrix_t : laplacian_matrix_t { - modularity_matrix_t(resources const& raft_handle, + modularity_matrix_t(raft::resources const& raft_handle, index_type const* row_offsets, index_type const* col_indices, value_type const* values, @@ -399,7 +399,7 @@ struct modularity_matrix_t : laplacian_matrix_t { edge_sum_ = laplacian_matrix_t::diagonal_.nrm1(); } - modularity_matrix_t(resources const& raft_handle, + modularity_matrix_t(raft::resources const& raft_handle, sparse_matrix_t const& csr_m) : laplacian_matrix_t(raft_handle, csr_m) { diff --git a/cpp/include/cuvs/spectral/eigen_solvers.cuh b/cpp/include/cuvs/spectral/eigen_solvers.cuh index be8be83cd..59e0c0d96 100644 --- a/cpp/include/cuvs/spectral/eigen_solvers.cuh +++ b/cpp/include/cuvs/spectral/eigen_solvers.cuh @@ -51,7 +51,7 @@ struct lanczos_solver_t { index_type_t solve_smallest_eigenvectors( raft::resources const& handle, - matrix::sparse_matrix_t const& A, + raft::matrix::sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { @@ -74,7 +74,7 @@ struct lanczos_solver_t { index_type_t solve_largest_eigenvectors( raft::resources const& handle, - matrix::sparse_matrix_t const& A, + raft::matrix::sparse_matrix_t const& A, value_type_t* __restrict__ eigVals, value_type_t* __restrict__ eigVecs) const { diff --git a/cpp/include/cuvs/spectral/modularity_maximization.cuh b/cpp/include/cuvs/spectral/modularity_maximization.cuh index 2a15b8fe1..6cee2086d 100644 --- a/cpp/include/cuvs/spectral/modularity_maximization.cuh +++ b/cpp/include/cuvs/spectral/modularity_maximization.cuh @@ -46,7 +46,7 @@ namespace spectral { template std::tuple modularity_maximization( raft::resources const& handle, - matrix::sparse_matrix_t const& csr_m, + raft::matrix::sparse_matrix_t const& csr_m, EigenSolver const& eigen_solver, ClusterSolver const& cluster_solver, vertex_t* __restrict__ clusters, @@ -71,7 +71,7 @@ std::tuple modularity_maximization( */ template void analyzeModularity(raft::resources const& handle, - matrix::sparse_matrix_t const& csr_m, + raft::matrix::sparse_matrix_t const& csr_m, vertex_t nClusters, vertex_t const* __restrict__ clusters, weight_t& modularity) diff --git a/cpp/include/cuvs/spectral/partition.cuh b/cpp/include/cuvs/spectral/partition.cuh index c16dd4ef0..3f327dbfb 100644 --- a/cpp/include/cuvs/spectral/partition.cuh +++ b/cpp/include/cuvs/spectral/partition.cuh @@ -48,7 +48,7 @@ namespace spectral { template std::tuple partition( raft::resources const& handle, - matrix::sparse_matrix_t const& csr_m, + raft::matrix::sparse_matrix_t const& csr_m, EigenSolver const& eigen_solver, ClusterSolver const& cluster_solver, vertex_t* __restrict__ clusters, @@ -79,7 +79,7 @@ std::tuple partition( */ template void analyzePartition(raft::resources const& handle, - matrix::sparse_matrix_t const& csr_m, + raft::matrix::sparse_matrix_t const& csr_m, vertex_t nClusters, const vertex_t* __restrict__ clusters, weight_t& edgeCut, diff --git a/cpp/include/cuvs/stats/detail/meanvar.cuh b/cpp/include/cuvs/stats/detail/meanvar.cuh index 1ebaf3b18..c286d5ed9 100644 --- a/cpp/include/cuvs/stats/detail/meanvar.cuh +++ b/cpp/include/cuvs/stats/detail/meanvar.cuh @@ -100,7 +100,7 @@ NB: current implementation here is not optimal, especially the rowmajor version; * * Assumptions: * - * 1. blockDim.x == WarpSize + * 1. blockDim.x == raft::WarpSize * 2. Dimension X goes along columns (D) * 3. Dimension Y goes along rows (N) * @@ -197,8 +197,9 @@ void meanvar( T* mean, T* var, const T* data, I D, I N, bool sample, bool rowMajor, cudaStream_t stream) { if (rowMajor) { - static_assert(BlockSize >= WarpSize, "Block size must be not smaller than the warp size."); - const dim3 bs(WarpSize, BlockSize / WarpSize, 1); + static_assert(BlockSize >= raft::WarpSize, + "Block size must be not smaller than the warp size."); + const dim3 bs(WarpSize, BlockSize / raft::WarpSize, 1); dim3 gs(raft::ceildiv(D, bs.x), raft::ceildiv(N, bs.y), 1); // Don't create more blocks than necessary to occupy the GPU