Merge branch 'branch-24.12' into 2412-migrate_remaining_raft_items
cjnolet authored Dec 5, 2024
2 parents 748223d + c5e03f2 commit bdcf755
Showing 37 changed files with 3,933 additions and 92 deletions.
5 changes: 4 additions & 1 deletion .pre-commit-config.yaml
@@ -91,7 +91,10 @@ repos:
- id: codespell
additional_dependencies: [tomli]
args: ["--toml", "pyproject.toml"]
-       exclude: (?x)^(^CHANGELOG.md$)
+       exclude: |
+         (?x)
+         ^CHANGELOG[.]md$|
+         ^cpp/cmake/patches/cutlass/build-export[.]patch$
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
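Note: the codespell exclude is now a verbose-mode ((?x)) multi-line alternation covering both CHANGELOG.md and the vendored CUTLASS patch added below; [.] matches a literal dot, replacing the old pattern's redundant nested anchors ^(^CHANGELOG.md$).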
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
@@ -401,6 +401,7 @@ if(BUILD_SHARED_LIBS)
src/neighbors/iface/iface_pq_uint8_t_int64_t.cu
src/neighbors/detail/cagra/cagra_build.cpp
src/neighbors/detail/cagra/topk_for_cagra/topk.cu
+ src/neighbors/dynamic_batching.cu
$<$<BOOL:${BUILD_CAGRA_HNSWLIB}>:src/neighbors/hnsw.cpp>
src/neighbors/ivf_flat_index.cpp
src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu
@@ -462,6 +463,7 @@ if(BUILD_SHARED_LIBS)
src/neighbors/vamana_serialize_float.cu
src/neighbors/vamana_serialize_uint8.cu
src/neighbors/vamana_serialize_int8.cu
+ src/preprocessing/quantize/scalar.cu
src/selection/select_k_float_int64_t.cu
src/selection/select_k_float_int32_t.cu
src/selection/select_k_float_uint32_t.cu
26 changes: 26 additions & 0 deletions cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h
@@ -56,6 +56,26 @@ extern template class cuvs::bench::cuvs_cagra<int8_t, uint32_t>;
#include "cuvs_mg_cagra_wrapper.h"
#endif

+ template <typename ParamT>
+ void parse_dynamic_batching_params(const nlohmann::json& conf, ParamT& param)
+ {
+   if (!conf.value("dynamic_batching", false)) { return; }
+   param.dynamic_batching = true;
+   if (conf.contains("dynamic_batching_max_batch_size")) {
+     param.dynamic_batching_max_batch_size = conf.at("dynamic_batching_max_batch_size");
+   }
+   param.dynamic_batching_conservative_dispatch =
+     conf.value("dynamic_batching_conservative_dispatch", false);
+   if (conf.contains("dynamic_batching_dispatch_timeout_ms")) {
+     param.dynamic_batching_dispatch_timeout_ms = conf.at("dynamic_batching_dispatch_timeout_ms");
+   }
+   if (conf.contains("dynamic_batching_n_queues")) {
+     param.dynamic_batching_n_queues = conf.at("dynamic_batching_n_queues");
+   }
+   param.dynamic_batching_k =
+     uint32_t(uint32_t(conf.at("k")) * float(conf.value("refine_ratio", 1.0f)));
+ }
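
For illustration only (not part of this commit), a minimal sketch of how this helper consumes a benchmark config; fake_search_param is a hypothetical stand-in for the wrappers' real search_param structs, and the JSON values are made up. It assumes the parse_dynamic_batching_params template above is in scope:

#include <nlohmann/json.hpp>
#include <cassert>
#include <cstddef>
#include <cstdint>

struct fake_search_param {  // hypothetical stand-in, mirrors the fields read above
  bool dynamic_batching = false;
  int64_t dynamic_batching_max_batch_size = 4;
  bool dynamic_batching_conservative_dispatch = false;
  double dynamic_batching_dispatch_timeout_ms = 0.01;
  size_t dynamic_batching_n_queues = 8;
  uint32_t dynamic_batching_k = 0;
};

int main()
{
  auto conf = nlohmann::json::parse(R"({
    "k": 10,
    "refine_ratio": 2.0,
    "dynamic_batching": true,
    "dynamic_batching_max_batch_size": 128,
    "dynamic_batching_dispatch_timeout_ms": 0.1
  })");
  fake_search_param p;
  parse_dynamic_batching_params(conf, p);
  assert(p.dynamic_batching);
  assert(p.dynamic_batching_max_batch_size == 128);
  assert(p.dynamic_batching_n_queues == 8);  // key absent, default kept
  assert(p.dynamic_batching_k == 20);        // uint32_t(k * refine_ratio) = 10 * 2.0
  return 0;
}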

#if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT) || defined(CUVS_ANN_BENCH_USE_CUVS_MG)
template <typename T, typename IdxT>
void parse_build_param(const nlohmann::json& conf,
@@ -138,6 +158,9 @@ void parse_search_param(const nlohmann::json& conf,
param.refine_ratio = conf.at("refine_ratio");
if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); }
}

+   // enable dynamic batching
+   parse_dynamic_batching_params(conf, param);
}
#endif

@@ -291,5 +314,8 @@ void parse_search_param(const nlohmann::json& conf,
}
// Same ratio as in IVF-PQ
param.refine_ratio = conf.value("refine_ratio", 1.0f);

+   // enable dynamic batching
+   parse_dynamic_batching_params(conf, param);
}
#endif
97 changes: 79 additions & 18 deletions cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
@@ -24,6 +24,7 @@
#include <cuvs/distance/distance.hpp>
#include <cuvs/neighbors/cagra.hpp>
#include <cuvs/neighbors/common.hpp>
+ #include <cuvs/neighbors/dynamic_batching.hpp>
#include <cuvs/neighbors/ivf_pq.hpp>
#include <cuvs/neighbors/nn_descent.hpp>
#include <raft/core/device_mdspan.hpp>
@@ -63,6 +64,13 @@ class cuvs_cagra : public algo<T>, public algo_gpu {
AllocatorType graph_mem = AllocatorType::kDevice;
AllocatorType dataset_mem = AllocatorType::kDevice;
[[nodiscard]] auto needs_dataset() const -> bool override { return true; }
+   /* Dynamic batching */
+   bool dynamic_batching = false;
+   int64_t dynamic_batching_k;
+   int64_t dynamic_batching_max_batch_size = 4;
+   double dynamic_batching_dispatch_timeout_ms = 0.01;
+   size_t dynamic_batching_n_queues = 8;
+   bool dynamic_batching_conservative_dispatch = false;
};

struct build_param {
@@ -173,6 +181,12 @@ class cuvs_cagra : public algo<T>, public algo_gpu {
std::shared_ptr<raft::device_matrix<T, int64_t, raft::row_major>> dataset_;
std::shared_ptr<raft::device_matrix_view<const T, int64_t, raft::row_major>> input_dataset_v_;

+   std::shared_ptr<cuvs::neighbors::dynamic_batching::index<T, IdxT>> dynamic_batcher_;
+   cuvs::neighbors::dynamic_batching::search_params dynamic_batcher_sp_{};
+   int64_t dynamic_batching_max_batch_size_;
+   size_t dynamic_batching_n_queues_;
+   bool dynamic_batching_conservative_dispatch_;

inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type)
{
switch (mem_type) {
@@ -216,26 +230,33 @@ inline auto allocator_to_string(AllocatorType mem_type) -> std::string
template <typename T, typename IdxT>
void cuvs_cagra<T, IdxT>::set_search_param(const search_param_base& param)
{
-   auto sp = dynamic_cast<const search_param&>(param);
-   search_params_ = sp.p;
-   refine_ratio_ = sp.refine_ratio;
+   auto sp = dynamic_cast<const search_param&>(param);
+   bool needs_dynamic_batcher_update =
+     (dynamic_batching_max_batch_size_ != sp.dynamic_batching_max_batch_size) ||
+     (dynamic_batching_n_queues_ != sp.dynamic_batching_n_queues) ||
+     (dynamic_batching_conservative_dispatch_ != sp.dynamic_batching_conservative_dispatch);
+   dynamic_batching_max_batch_size_ = sp.dynamic_batching_max_batch_size;
+   dynamic_batching_n_queues_ = sp.dynamic_batching_n_queues;
+   dynamic_batching_conservative_dispatch_ = sp.dynamic_batching_conservative_dispatch;
+   search_params_ = sp.p;
+   refine_ratio_ = sp.refine_ratio;
if (sp.graph_mem != graph_mem_) {
// Move graph to correct memory space
graph_mem_ = sp.graph_mem;
RAFT_LOG_DEBUG("moving graph to new memory space: %s", allocator_to_string(graph_mem_).c_str());
// We create a new graph and copy to it from existing graph
-     auto mr = get_mr(graph_mem_);
-     auto new_graph = raft::make_device_mdarray<IdxT, int64_t>(
+     auto mr = get_mr(graph_mem_);
+     *graph_ = raft::make_device_mdarray<IdxT, int64_t>(
handle_, mr, raft::make_extents<int64_t>(index_->graph().extent(0), index_->graph_degree()));

-     raft::copy(new_graph.data_handle(),
+     raft::copy(graph_->data_handle(),
index_->graph().data_handle(),
index_->graph().size(),
raft::resource::get_cuda_stream(handle_));

-     index_->update_graph(handle_, make_const_mdspan(new_graph.view()));
-     // update_graph() only stores a view in the index. We need to keep the graph object alive.
-     *graph_ = std::move(new_graph);
+     // NB: update_graph() only stores a view in the index. We need to keep the graph object alive.
+     index_->update_graph(handle_, make_const_mdspan(graph_->view()));
+     needs_dynamic_batcher_update = true;
}

if (sp.dataset_mem != dataset_mem_ || need_dataset_update_) {
@@ -256,7 +277,26 @@ void cuvs_cagra<T, IdxT>::set_search_param(const search_param_base& param)
dataset_->data_handle(), dataset_->extent(0), this->dim_, dataset_->extent(1));
index_->update_dataset(handle_, dataset_view);

-     need_dataset_update_ = false;
+     need_dataset_update_ = false;
+     needs_dynamic_batcher_update = true;
}

+   // dynamic batching
+   if (sp.dynamic_batching) {
+     if (!dynamic_batcher_ || needs_dynamic_batcher_update) {
+       dynamic_batcher_ = std::make_shared<cuvs::neighbors::dynamic_batching::index<T, IdxT>>(
+         handle_,
+         cuvs::neighbors::dynamic_batching::index_params{{},
+                                                         sp.dynamic_batching_k,
+                                                         sp.dynamic_batching_max_batch_size,
+                                                         sp.dynamic_batching_n_queues,
+                                                         sp.dynamic_batching_conservative_dispatch},
+         *index_,
+         search_params_);
+     }
+     dynamic_batcher_sp_.dispatch_timeout_ms = sp.dynamic_batching_dispatch_timeout_ms;
+   } else {
+     if (dynamic_batcher_) { dynamic_batcher_.reset(); }
+   }
}
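
For reference, a hedged, self-contained sketch of the dynamic-batching flow wired up above (the float/uint32_t instantiation, sizes, and setup are illustrative assumptions; the index_params braced initializer carries base params, then k, max_batch_size, n_queues, conservative_dispatch, mirroring the call above):

// Illustrative sketch only: wrap a prebuilt CAGRA index in a dynamic batcher
// and search through it. `res`, `idx`, `params`, and the device matrix views
// are assumed to be prepared by the caller, as the benchmark wrapper does.
#include <cuvs/neighbors/cagra.hpp>
#include <cuvs/neighbors/dynamic_batching.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/resources.hpp>

void search_batched(raft::resources const& res,
                    cuvs::neighbors::cagra::index<float, uint32_t> const& idx,
                    cuvs::neighbors::cagra::search_params const& params,
                    raft::device_matrix_view<const float, int64_t> queries,
                    raft::device_matrix_view<uint32_t, int64_t> neighbors,
                    raft::device_matrix_view<float, int64_t> distances)
{
  namespace db = cuvs::neighbors::dynamic_batching;
  const int64_t k = neighbors.extent(1);
  // {base}, k, max_batch_size, n_queues, conservative_dispatch
  db::index<float, uint32_t> batcher(res, db::index_params{{}, k, 128, 8, false}, idx, params);
  db::search_params sp;
  sp.dispatch_timeout_ms = 0.01;  // the knob set via dynamic_batcher_sp_ above
  db::search(res, sp, batcher, queries, neighbors, distances);
}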

@@ -306,7 +346,7 @@ void cuvs_cagra<T, IdxT>::load(const std::string& file)
template <typename T, typename IdxT>
std::unique_ptr<algo<T>> cuvs_cagra<T, IdxT>::copy()
{
-   return std::make_unique<cuvs_cagra<T, IdxT>>(*this); // use copy constructor
+   return std::make_unique<cuvs_cagra<T, IdxT>>(std::cref(*this)); // use copy constructor
}

template <typename T, typename IdxT>
@@ -330,8 +370,17 @@ void cuvs_cagra<T, IdxT>::search_base(const T* queries,
raft::make_device_matrix_view<IdxT, int64_t>(neighbors_idx_t, batch_size, k);
auto distances_view = raft::make_device_matrix_view<float, int64_t>(distances, batch_size, k);

-   cuvs::neighbors::cagra::search(
-     handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+   if (dynamic_batcher_) {
+     cuvs::neighbors::dynamic_batching::search(handle_,
+                                               dynamic_batcher_sp_,
+                                               *dynamic_batcher_,
+                                               queries_view,
+                                               neighbors_view,
+                                               distances_view);
+   } else {
+     cuvs::neighbors::cagra::search(
+       handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+   }

if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) {
if (raft::get_device_for_address(neighbors) < 0 &&
@@ -367,11 +416,23 @@ void cuvs_cagra<T, IdxT>::search(
const raft::resources& res = handle_;
auto mem_type =
raft::get_device_for_address(neighbors) >= 0 ? MemoryType::kDevice : MemoryType::kHostPinned;
-   auto& tmp_buf = get_tmp_buffer_from_global_pool(
-     ((disable_refinement ? 0 : (sizeof(float) + sizeof(algo_base::index_type))) +
-      (kNeedsIoMapping ? sizeof(IdxT) : 0)) *
-     batch_size * k0);
-   auto* candidates_ptr = reinterpret_cast<algo_base::index_type*>(tmp_buf.data(mem_type));

+   // If dynamic batching is used and there's no sync between benchmark laps, multiple sequential
+   // requests can group together. The data is copied asynchronously, and if the same intermediate
+   // buffer is used for multiple requests, they can override each other's data. Hence, we need to
+   // allocate as much space as required by the maximum number of sequential requests.
+   auto max_dyn_grouping = dynamic_batcher_ ? raft::div_rounding_up_safe<int64_t>(
+                                                dynamic_batching_max_batch_size_, batch_size) *
+                                                dynamic_batching_n_queues_
+                                            : 1;
+   auto tmp_buf_size = ((disable_refinement ? 0 : (sizeof(float) + sizeof(algo_base::index_type))) +
+                        (kNeedsIoMapping ? sizeof(IdxT) : 0)) *
+                       batch_size * k0;
+   auto& tmp_buf = get_tmp_buffer_from_global_pool(tmp_buf_size * max_dyn_grouping);
+   thread_local static int64_t group_id = 0;
+   auto* candidates_ptr = reinterpret_cast<algo_base::index_type*>(
+     reinterpret_cast<uint8_t*>(tmp_buf.data(mem_type)) + tmp_buf_size * group_id);
+   group_id = (group_id + 1) % max_dyn_grouping;
auto* candidate_dists_ptr =
reinterpret_cast<float*>(candidates_ptr + (disable_refinement ? 0 : batch_size * k0));
auto* neighbors_idx_t =
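A small worked example of the buffer-slot arithmetic above (numbers are illustrative, not from this commit):

#include <cstdint>

// Same arithmetic as raft::div_rounding_up_safe<int64_t>(max_batch_size, batch_size) * n_queues.
int64_t max_dyn_grouping(int64_t max_batch_size, int64_t batch_size, int64_t n_queues)
{
  return ((max_batch_size + batch_size - 1) / batch_size) * n_queues;
}

// With batch_size = 1, dynamic_batching_max_batch_size_ = 4 and
// dynamic_batching_n_queues_ = 8: max_dyn_grouping(4, 1, 8) == 32, so the pool
// holds 32 slots of tmp_buf_size bytes each and group_id cycles through 0..31
// per thread before the first slot is reused.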
40 changes: 38 additions & 2 deletions cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h
@@ -19,7 +19,9 @@
#include "cuvs_ann_bench_utils.h"

#include <cuvs/distance/distance.hpp>
+ #include <cuvs/neighbors/dynamic_batching.hpp>
#include <cuvs/neighbors/ivf_pq.hpp>

#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_mdspan.hpp>
#include <raft/core/device_resources.hpp>
@@ -46,6 +48,13 @@ class cuvs_ivf_pq : public algo<T>, public algo_gpu {
cuvs::neighbors::ivf_pq::search_params pq_param;
float refine_ratio = 1.0f;
[[nodiscard]] auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; }
+   /* Dynamic batching */
+   bool dynamic_batching = false;
+   int64_t dynamic_batching_k;
+   int64_t dynamic_batching_max_batch_size = 128;
+   double dynamic_batching_dispatch_timeout_ms = 0.01;
+   size_t dynamic_batching_n_queues = 3;
+   bool dynamic_batching_conservative_dispatch = true;
};

using build_param = cuvs::neighbors::ivf_pq::index_params;
@@ -98,6 +107,9 @@ class cuvs_ivf_pq : public algo<T>, public algo_gpu {
int dimension_;
float refine_ratio_ = 1.0;
raft::device_matrix_view<const T, IdxT> dataset_;

+   std::shared_ptr<cuvs::neighbors::dynamic_batching::index<T, IdxT>> dynamic_batcher_;
+   cuvs::neighbors::dynamic_batching::search_params dynamic_batcher_sp_{};
};

template <typename T, typename IdxT>
@@ -138,6 +150,21 @@ void cuvs_ivf_pq<T, IdxT>::set_search_param(const search_param_base& param)
search_params_ = sp.pq_param;
refine_ratio_ = sp.refine_ratio;
assert(search_params_.n_probes <= index_params_.n_lists);

+   if (sp.dynamic_batching) {
+     dynamic_batcher_ = std::make_shared<cuvs::neighbors::dynamic_batching::index<T, IdxT>>(
+       handle_,
+       cuvs::neighbors::dynamic_batching::index_params{{},
+                                                       sp.dynamic_batching_k,
+                                                       sp.dynamic_batching_max_batch_size,
+                                                       sp.dynamic_batching_n_queues,
+                                                       sp.dynamic_batching_conservative_dispatch},
+       *index_,
+       search_params_);
+     dynamic_batcher_sp_.dispatch_timeout_ms = sp.dynamic_batching_dispatch_timeout_ms;
+   } else {
+     dynamic_batcher_.reset();
+   }
}
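
One asymmetry worth noting against the CAGRA wrapper above: there the batcher is rebuilt only when the batching parameters, graph, or dataset change, whereas here a fresh dynamic_batching::index is constructed on every set_search_param call while dynamic batching is enabled.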

template <typename T, typename IdxT>
Expand Down Expand Up @@ -168,8 +195,17 @@ void cuvs_ivf_pq<T, IdxT>::search_base(
raft::make_device_matrix_view<IdxT, uint32_t>(neighbors_idx_t, batch_size, k);
auto distances_view = raft::make_device_matrix_view<float, uint32_t>(distances, batch_size, k);

-   cuvs::neighbors::ivf_pq::search(
-     handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+   if (dynamic_batcher_) {
+     cuvs::neighbors::dynamic_batching::search(handle_,
+                                               dynamic_batcher_sp_,
+                                               *dynamic_batcher_,
+                                               queries_view,
+                                               neighbors_view,
+                                               distances_view);
+   } else {
+     cuvs::neighbors::ivf_pq::search(
+       handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+   }

if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) {
raft::linalg::unaryOp(neighbors,
27 changes: 27 additions & 0 deletions cpp/cmake/patches/cutlass/build-export.patch
@@ -0,0 +1,27 @@
From e0a9597946257a01ae8444200f836ee51d5597ba Mon Sep 17 00:00:00 2001
From: Kyle Edwards <[email protected]>
Date: Wed, 20 Nov 2024 16:37:38 -0500
Subject: [PATCH] Remove erroneous include directories

These directories are left over from when CuTe was a separate
CMake project. Remove them.
---
CMakeLists.txt | 2 --
1 file changed, 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7419bdf5e..545384d82 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -665,8 +665,6 @@ target_include_directories(
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CUTLASS_INCLUDE_DIR}>
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
- $<BUILD_INTERFACE:${cute_SOURCE_DIR}/include>
- $<BUILD_INTERFACE:${cute_SOURCE_DIR}/examples>
)

# Mark CTK headers as system to supress warnings from them
--
2.34.1

16 changes: 16 additions & 0 deletions cpp/cmake/patches/cutlass_override.json
@@ -0,0 +1,16 @@
{
"packages" : {
"cutlass" : {
"version": "3.5.1",
"git_url": "https://github.com/NVIDIA/cutlass.git",
"git_tag": "v${version}",
"patches" : [
{
"file" : "${current_json_dir}/cutlass/build-export.patch",
"issue" : "Fix build directory export",
"fixed_in" : ""
}
]
}
}
}
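
Presumably this override file is consumed via rapids-cmake's rapids_cpm_package_override() so that the CUTLASS 3.5.1 pin is fetched with the build-export patch applied; the empty "fixed_in" field indicates the fix has not yet landed upstream.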