diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 21cb98180..1e602ccf1 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -24,7 +24,6 @@ dependencies: - gcc_linux-aarch64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib=0.6.2 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 - libcurand-dev=10.3.0.86 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 432509bcb..b060e78c2 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -24,7 +24,6 @@ dependencies: - gcc_linux-64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib=0.6.2 - libcublas-dev=11.11.3.6 - libcublas=11.11.3.6 - libcurand-dev=10.3.0.86 diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml index 0c5043ac2..485122273 100644 --- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml @@ -25,7 +25,6 @@ dependencies: - gcc_linux-aarch64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib=0.6.2 - libcublas-dev - libcurand-dev - libcusolver-dev diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml index cbb22333c..d5f48dadb 100644 --- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml @@ -25,7 +25,6 @@ dependencies: - gcc_linux-64=11.* - glog>=0.6.0 - h5py>=3.8.0 -- hnswlib=0.6.2 - libcublas-dev - libcurand-dev - libcusolver-dev diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index eb2e7c7a4..6af423bd5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -397,6 +397,7 @@ if(BUILD_SHARED_LIBS) src/neighbors/iface/iface_pq_uint8_t_int64_t.cu src/neighbors/detail/cagra/cagra_build.cpp src/neighbors/detail/cagra/topk_for_cagra/topk.cu + src/neighbors/dynamic_batching.cu $<$:src/neighbors/hnsw.cpp> src/neighbors/ivf_flat_index.cpp src/neighbors/ivf_flat/ivf_flat_build_extend_float_int64_t.cu @@ -577,6 +578,7 @@ if(BUILD_SHARED_LIBS) if(BUILD_CAGRA_HNSWLIB) target_link_libraries(cuvs_objs PRIVATE hnswlib::hnswlib) + target_compile_definitions(cuvs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) target_compile_definitions(cuvs_objs PUBLIC CUVS_BUILD_CAGRA_HNSWLIB) endif() diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 0f6b42ae9..c161a68bc 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -225,9 +225,7 @@ if(CUVS_ANN_BENCH_USE_CUVS_CAGRA) endif() if(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) - ConfigureAnnBench( - NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs hnswlib::hnswlib - ) + ConfigureAnnBench(NAME CUVS_CAGRA_HNSWLIB PATH src/cuvs/cuvs_cagra_hnswlib.cu LINKS cuvs) endif() if(CUVS_ANN_BENCH_USE_CUVS_MG) diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h index 57d5b1910..7617bfa66 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -56,6 +56,26 @@ extern template class cuvs::bench::cuvs_cagra; #include "cuvs_mg_cagra_wrapper.h" #endif +template +void parse_dynamic_batching_params(const nlohmann::json& conf, ParamT& param) +{ + if 
(!conf.value("dynamic_batching", false)) { return; } + param.dynamic_batching = true; + if (conf.contains("dynamic_batching_max_batch_size")) { + param.dynamic_batching_max_batch_size = conf.at("dynamic_batching_max_batch_size"); + } + param.dynamic_batching_conservative_dispatch = + conf.value("dynamic_batching_conservative_dispatch", false); + if (conf.contains("dynamic_batching_dispatch_timeout_ms")) { + param.dynamic_batching_dispatch_timeout_ms = conf.at("dynamic_batching_dispatch_timeout_ms"); + } + if (conf.contains("dynamic_batching_n_queues")) { + param.dynamic_batching_n_queues = conf.at("dynamic_batching_n_queues"); + } + param.dynamic_batching_k = + uint32_t(uint32_t(conf.at("k")) * float(conf.value("refine_ratio", 1.0f))); +} + #if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT) || defined(CUVS_ANN_BENCH_USE_CUVS_MG) template void parse_build_param(const nlohmann::json& conf, @@ -138,6 +158,9 @@ void parse_search_param(const nlohmann::json& conf, param.refine_ratio = conf.at("refine_ratio"); if (param.refine_ratio < 1.0f) { throw std::runtime_error("refine_ratio should be >= 1.0"); } } + + // enable dynamic batching + parse_dynamic_batching_params(conf, param); } #endif @@ -291,5 +314,8 @@ void parse_search_param(const nlohmann::json& conf, } // Same ratio as in IVF-PQ param.refine_ratio = conf.value("refine_ratio", 1.0f); + + // enable dynamic batching + parse_dynamic_batching_params(conf, param); } #endif diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu index 558ba01e0..e45a3bd5a 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib.cu @@ -24,12 +24,35 @@ namespace cuvs::bench { +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_cagra_hnswlib::build_param& param) +{ + if (conf.contains("hierarchy")) { + if (conf.at("hierarchy") == "none") { + param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::NONE; + } else if (conf.at("hierarchy") == "cpu") { + param.hnsw_index_params.hierarchy = cuvs::neighbors::hnsw::HnswHierarchy::CPU; + } else { + THROW("Invalid value for hierarchy: %s", conf.at("hierarchy").get().c_str()); + } + } + if (conf.contains("ef_construction")) { + param.hnsw_index_params.ef_construction = conf.at("ef_construction"); + } + if (conf.contains("num_threads")) { + param.hnsw_index_params.num_threads = conf.at("num_threads"); + } +} + template void parse_search_param(const nlohmann::json& conf, typename cuvs::bench::cuvs_cagra_hnswlib::search_param& param) { - param.ef = conf.at("ef"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } + param.hnsw_search_param.ef = conf.at("ef"); + if (conf.contains("num_threads")) { + param.hnsw_search_param.num_threads = conf.at("num_threads"); + } } template @@ -43,9 +66,10 @@ auto create_algo(const std::string& algo_name, if constexpr (std::is_same_v or std::is_same_v) { if (algo_name == "raft_cagra_hnswlib" || algo_name == "cuvs_cagra_hnswlib") { - typename cuvs::bench::cuvs_cagra_hnswlib::build_param param; - parse_build_param(conf, param); - a = std::make_unique>(metric, dim, param); + typename cuvs::bench::cuvs_cagra_hnswlib::build_param bparam; + ::parse_build_param(conf, bparam.cagra_build_param); + parse_build_param(conf, bparam); + a = std::make_unique>(metric, dim, bparam); } } diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h index 875fe0bba..e4169f6f8 
100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_hnswlib_wrapper.h @@ -15,8 +15,8 @@ */ #pragma once -#include "../hnswlib/hnswlib_wrapper.h" #include "cuvs_cagra_wrapper.h" +#include #include @@ -26,14 +26,20 @@ template class cuvs_cagra_hnswlib : public algo, public algo_gpu { public: using search_param_base = typename algo::search_param; - using build_param = typename cuvs_cagra::build_param; - using search_param = typename hnsw_lib::search_param; + + struct build_param { + typename cuvs_cagra::build_param cagra_build_param; + cuvs::neighbors::hnsw::index_params hnsw_index_params; + }; + + struct search_param : public search_param_base { + cuvs::neighbors::hnsw::search_params hnsw_search_param; + }; cuvs_cagra_hnswlib(Metric metric, int dim, const build_param& param, int concurrent_searches = 1) : algo(metric, dim), - cagra_build_{metric, dim, param, concurrent_searches}, - // hnsw_lib param values don't matter since we don't build with hnsw_lib - hnswlib_search_{metric, dim, typename hnsw_lib::build_param{50, 100}} + build_param_{param}, + cagra_build_{metric, dim, param.cagra_build_param, concurrent_searches} { } @@ -69,40 +75,67 @@ class cuvs_cagra_hnswlib : public algo, public algo_gpu { } private: + raft::resources handle_{}; + build_param build_param_; + search_param search_param_; cuvs_cagra cagra_build_; - hnsw_lib hnswlib_search_; + std::shared_ptr> hnsw_index_; }; template void cuvs_cagra_hnswlib::build(const T* dataset, size_t nrow) { cagra_build_.build(dataset, nrow); + auto* cagra_index = cagra_build_.get_index(); + auto host_dataset_view = raft::make_host_matrix_view(dataset, nrow, this->dim_); + auto opt_dataset_view = + std::optional>(std::move(host_dataset_view)); + hnsw_index_ = cuvs::neighbors::hnsw::from_cagra( + handle_, build_param_.hnsw_index_params, *cagra_index, opt_dataset_view); } template void cuvs_cagra_hnswlib::set_search_param(const search_param_base& param_) { - hnswlib_search_.set_search_param(param_); + search_param_ = dynamic_cast(param_); } template void cuvs_cagra_hnswlib::save(const std::string& file) const { - cagra_build_.save_to_hnswlib(file); + cuvs::neighbors::hnsw::serialize(handle_, file, *(hnsw_index_.get())); } template void cuvs_cagra_hnswlib::load(const std::string& file) { - hnswlib_search_.load(file); - hnswlib_search_.set_base_layer_only(); + cuvs::neighbors::hnsw::index* idx = nullptr; + cuvs::neighbors::hnsw::deserialize(handle_, + build_param_.hnsw_index_params, + file, + this->dim_, + parse_metric_type(this->metric_), + &idx); + hnsw_index_ = std::shared_ptr>(idx); } template void cuvs_cagra_hnswlib::search( const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const { - hnswlib_search_.search(queries, batch_size, k, neighbors, distances); + // Only Latency mode is supported for now + auto queries_view = + raft::make_host_matrix_view(queries, batch_size, this->dim_); + auto neighbors_view = raft::make_host_matrix_view( + reinterpret_cast(neighbors), batch_size, k); + auto distances_view = raft::make_host_matrix_view(distances, batch_size, k); + + cuvs::neighbors::hnsw::search(handle_, + search_param_.hnsw_search_param, + *(hnsw_index_.get()), + queries_view, + neighbors_view, + distances_view); } } // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h index b2ba35eee..8c9cb2d4f 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h +++ 
b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,13 @@ class cuvs_cagra : public algo, public algo_gpu { AllocatorType graph_mem = AllocatorType::kDevice; AllocatorType dataset_mem = AllocatorType::kDevice; [[nodiscard]] auto needs_dataset() const -> bool override { return true; } + /* Dynamic batching */ + bool dynamic_batching = false; + int64_t dynamic_batching_k; + int64_t dynamic_batching_max_batch_size = 4; + double dynamic_batching_dispatch_timeout_ms = 0.01; + size_t dynamic_batching_n_queues = 8; + bool dynamic_batching_conservative_dispatch = false; }; struct build_param { @@ -154,6 +162,8 @@ class cuvs_cagra : public algo, public algo_gpu { void save_to_hnswlib(const std::string& file) const; std::unique_ptr> copy() override; + auto get_index() const -> const cuvs::neighbors::cagra::index* { return index_.get(); } + private: // handle_ must go first to make sure it dies last and all memory allocated in pool configured_raft_resources handle_{}; @@ -171,6 +181,12 @@ class cuvs_cagra : public algo, public algo_gpu { std::shared_ptr> dataset_; std::shared_ptr> input_dataset_v_; + std::shared_ptr> dynamic_batcher_; + cuvs::neighbors::dynamic_batching::search_params dynamic_batcher_sp_{}; + int64_t dynamic_batching_max_batch_size_; + size_t dynamic_batching_n_queues_; + bool dynamic_batching_conservative_dispatch_; + inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type) { switch (mem_type) { @@ -214,26 +230,33 @@ inline auto allocator_to_string(AllocatorType mem_type) -> std::string template void cuvs_cagra::set_search_param(const search_param_base& param) { - auto sp = dynamic_cast(param); - search_params_ = sp.p; - refine_ratio_ = sp.refine_ratio; + auto sp = dynamic_cast(param); + bool needs_dynamic_batcher_update = + (dynamic_batching_max_batch_size_ != sp.dynamic_batching_max_batch_size) || + (dynamic_batching_n_queues_ != sp.dynamic_batching_n_queues) || + (dynamic_batching_conservative_dispatch_ != sp.dynamic_batching_conservative_dispatch); + dynamic_batching_max_batch_size_ = sp.dynamic_batching_max_batch_size; + dynamic_batching_n_queues_ = sp.dynamic_batching_n_queues; + dynamic_batching_conservative_dispatch_ = sp.dynamic_batching_conservative_dispatch; + search_params_ = sp.p; + refine_ratio_ = sp.refine_ratio; if (sp.graph_mem != graph_mem_) { // Move graph to correct memory space graph_mem_ = sp.graph_mem; RAFT_LOG_DEBUG("moving graph to new memory space: %s", allocator_to_string(graph_mem_).c_str()); // We create a new graph and copy to it from existing graph - auto mr = get_mr(graph_mem_); - auto new_graph = raft::make_device_mdarray( + auto mr = get_mr(graph_mem_); + *graph_ = raft::make_device_mdarray( handle_, mr, raft::make_extents(index_->graph().extent(0), index_->graph_degree())); - raft::copy(new_graph.data_handle(), + raft::copy(graph_->data_handle(), index_->graph().data_handle(), index_->graph().size(), raft::resource::get_cuda_stream(handle_)); - index_->update_graph(handle_, make_const_mdspan(new_graph.view())); - // update_graph() only stores a view in the index. We need to keep the graph object alive. - *graph_ = std::move(new_graph); + // NB: update_graph() only stores a view in the index. We need to keep the graph object alive. 
+ index_->update_graph(handle_, make_const_mdspan(graph_->view())); + needs_dynamic_batcher_update = true; } if (sp.dataset_mem != dataset_mem_ || need_dataset_update_) { @@ -254,7 +277,26 @@ void cuvs_cagra::set_search_param(const search_param_base& param) dataset_->data_handle(), dataset_->extent(0), this->dim_, dataset_->extent(1)); index_->update_dataset(handle_, dataset_view); - need_dataset_update_ = false; + need_dataset_update_ = false; + needs_dynamic_batcher_update = true; + } + + // dynamic batching + if (sp.dynamic_batching) { + if (!dynamic_batcher_ || needs_dynamic_batcher_update) { + dynamic_batcher_ = std::make_shared>( + handle_, + cuvs::neighbors::dynamic_batching::index_params{{}, + sp.dynamic_batching_k, + sp.dynamic_batching_max_batch_size, + sp.dynamic_batching_n_queues, + sp.dynamic_batching_conservative_dispatch}, + *index_, + search_params_); + } + dynamic_batcher_sp_.dispatch_timeout_ms = sp.dynamic_batching_dispatch_timeout_ms; + } else { + if (dynamic_batcher_) { dynamic_batcher_.reset(); } } } @@ -304,7 +346,7 @@ void cuvs_cagra::load(const std::string& file) template std::unique_ptr> cuvs_cagra::copy() { - return std::make_unique>(*this); // use copy constructor + return std::make_unique>(std::cref(*this)); // use copy constructor } template @@ -328,8 +370,17 @@ void cuvs_cagra::search_base(const T* queries, raft::make_device_matrix_view(neighbors_idx_t, batch_size, k); auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); - cuvs::neighbors::cagra::search( - handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + if (dynamic_batcher_) { + cuvs::neighbors::dynamic_batching::search(handle_, + dynamic_batcher_sp_, + *dynamic_batcher_, + queries_view, + neighbors_view, + distances_view); + } else { + cuvs::neighbors::cagra::search( + handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + } if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) { if (raft::get_device_for_address(neighbors) < 0 && @@ -365,11 +416,23 @@ void cuvs_cagra::search( const raft::resources& res = handle_; auto mem_type = raft::get_device_for_address(neighbors) >= 0 ? MemoryType::kDevice : MemoryType::kHostPinned; - auto& tmp_buf = get_tmp_buffer_from_global_pool( - ((disable_refinement ? 0 : (sizeof(float) + sizeof(algo_base::index_type))) + - (kNeedsIoMapping ? sizeof(IdxT) : 0)) * - batch_size * k0); - auto* candidates_ptr = reinterpret_cast(tmp_buf.data(mem_type)); + + // If dynamic batching is used and there's no sync between benchmark laps, multiple sequential + // requests can group together. The data is copied asynchronously, and if the same intermediate + // buffer is used for multiple requests, they can override each other's data. Hence, we need to + // allocate as much space as required by the maximum number of sequential requests. + auto max_dyn_grouping = dynamic_batcher_ ? raft::div_rounding_up_safe( + dynamic_batching_max_batch_size_, batch_size) * + dynamic_batching_n_queues_ + : 1; + auto tmp_buf_size = ((disable_refinement ? 0 : (sizeof(float) + sizeof(algo_base::index_type))) + + (kNeedsIoMapping ? 
sizeof(IdxT) : 0)) * + batch_size * k0; + auto& tmp_buf = get_tmp_buffer_from_global_pool(tmp_buf_size * max_dyn_grouping); + thread_local static int64_t group_id = 0; + auto* candidates_ptr = reinterpret_cast( + reinterpret_cast(tmp_buf.data(mem_type)) + tmp_buf_size * group_id); + group_id = (group_id + 1) % max_dyn_grouping; auto* candidate_dists_ptr = reinterpret_cast(candidates_ptr + (disable_refinement ? 0 : batch_size * k0)); auto* neighbors_idx_t = diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h index 4c8a91f23..dac766669 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_pq_wrapper.h @@ -19,7 +19,9 @@ #include "cuvs_ann_bench_utils.h" #include +#include #include + #include #include #include @@ -46,6 +48,13 @@ class cuvs_ivf_pq : public algo, public algo_gpu { cuvs::neighbors::ivf_pq::search_params pq_param; float refine_ratio = 1.0f; [[nodiscard]] auto needs_dataset() const -> bool override { return refine_ratio > 1.0f; } + /* Dynamic batching */ + bool dynamic_batching = false; + int64_t dynamic_batching_k; + int64_t dynamic_batching_max_batch_size = 128; + double dynamic_batching_dispatch_timeout_ms = 0.01; + size_t dynamic_batching_n_queues = 3; + bool dynamic_batching_conservative_dispatch = true; }; using build_param = cuvs::neighbors::ivf_pq::index_params; @@ -98,6 +107,9 @@ class cuvs_ivf_pq : public algo, public algo_gpu { int dimension_; float refine_ratio_ = 1.0; raft::device_matrix_view dataset_; + + std::shared_ptr> dynamic_batcher_; + cuvs::neighbors::dynamic_batching::search_params dynamic_batcher_sp_{}; }; template @@ -138,6 +150,21 @@ void cuvs_ivf_pq::set_search_param(const search_param_base& param) search_params_ = sp.pq_param; refine_ratio_ = sp.refine_ratio; assert(search_params_.n_probes <= index_params_.n_lists); + + if (sp.dynamic_batching) { + dynamic_batcher_ = std::make_shared>( + handle_, + cuvs::neighbors::dynamic_batching::index_params{{}, + sp.dynamic_batching_k, + sp.dynamic_batching_max_batch_size, + sp.dynamic_batching_n_queues, + sp.dynamic_batching_conservative_dispatch}, + *index_, + search_params_); + dynamic_batcher_sp_.dispatch_timeout_ms = sp.dynamic_batching_dispatch_timeout_ms; + } else { + dynamic_batcher_.reset(); + } } template @@ -168,8 +195,17 @@ void cuvs_ivf_pq::search_base( raft::make_device_matrix_view(neighbors_idx_t, batch_size, k); auto distances_view = raft::make_device_matrix_view(distances, batch_size, k); - cuvs::neighbors::ivf_pq::search( - handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + if (dynamic_batcher_) { + cuvs::neighbors::dynamic_batching::search(handle_, + dynamic_batcher_sp_, + *dynamic_batcher_, + queries_view, + neighbors_view, + distances_view); + } else { + cuvs::neighbors::ivf_pq::search( + handle_, search_params_, *index_, queries_view, neighbors_view, distances_view); + } if constexpr (sizeof(IdxT) != sizeof(algo_base::index_type)) { raft::linalg::unaryOp(neighbors, diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp index 755c7c8d6..6e219d2a7 100644 --- a/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp +++ b/cpp/bench/ann/src/hnswlib/hnswlib_benchmark.cpp @@ -33,7 +33,7 @@ void parse_build_param(const nlohmann::json& conf, { param.ef_construction = conf.at("efConstruction"); param.m = conf.at("M"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } + if 
(conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); } } template @@ -41,7 +41,7 @@ void parse_search_param(const nlohmann::json& conf, typename cuvs::bench::hnsw_lib::search_param& param) { param.ef = conf.at("ef"); - if (conf.contains("numThreads")) { param.num_threads = conf.at("numThreads"); } + if (conf.contains("num_threads")) { param.num_threads = conf.at("num_threads"); } } template class Algo> diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index 74da25660..3e91d9995 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -22,8 +22,12 @@ endif() # Be very strict when compiling with GCC as host compiler (and thus more lenient when compiling with # clang) if(CMAKE_COMPILER_IS_GNUCXX) - list(APPEND CUVS_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) - list(APPEND CUVS_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) + list(APPEND CUVS_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations + -Wno-reorder + ) + list(APPEND CUVS_CUDA_FLAGS + -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations,-Wno-reorder + ) # set warnings as errors if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2.0) diff --git a/cpp/cmake/patches/hnswlib.diff b/cpp/cmake/patches/hnswlib.diff index e7f89a8cc..f20c27d91 100644 --- a/cpp/cmake/patches/hnswlib.diff +++ b/cpp/cmake/patches/hnswlib.diff @@ -1,188 +1,159 @@ +diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h +index bef0017..0ee7931 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h -@@ -3,6 +3,7 @@ - #include "visited_list_pool.h" - #include "hnswlib.h" - #include -+#include - #include - #include - #include -@@ -16,6 +17,8 @@ namespace hnswlib { - template - class HierarchicalNSW : public AlgorithmInterface { - public: -+ bool base_layer_only{false}; -+ int num_seeds=32; - static const tableint max_update_element_locks = 65536; - HierarchicalNSW(SpaceInterface *s) { - } -@@ -56,7 +59,7 @@ namespace hnswlib { - visited_list_pool_ = new VisitedListPool(1, max_elements); - - //initializations for special treatment of the first node -- enterpoint_node_ = -1; -+ enterpoint_node_ = std::numeric_limits::max(); - maxlevel_ = -1; - - linkLists_ = (char **) malloc(sizeof(void *) * max_elements_); -@@ -527,7 +530,7 @@ namespace hnswlib { - tableint *datal = (tableint *) (data + 1); - for (int i = 0; i < size; i++) { - tableint cand = datal[i]; -- if (cand < 0 || cand > max_elements_) -+ if (cand > max_elements_) - throw std::runtime_error("cand error"); - dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); - -@@ -1067,7 +1070,7 @@ namespace hnswlib { - tableint *datal = (tableint *) (data + 1); - for (int i = 0; i < size; i++) { - tableint cand = datal[i]; -- if (cand < 0 || cand > max_elements_) -+ if (cand > max_elements_) - throw std::runtime_error("cand error"); - dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_); - if (d < curdist) { -@@ -1119,28 +1122,41 @@ namespace hnswlib { - tableint currObj = enterpoint_node_; - dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); - -- for (int level = maxlevel_; level > 0; level--) { -- bool changed = true; -- while (changed) { -- changed = false; -- unsigned int *data; -+ if (base_layer_only) { -+ // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale -+ for 
(int i = 0; i < num_seeds; i++) { -+ tableint obj = i * (max_elements_ / num_seeds); -+ dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_); -+ if (dist < curdist) { -+ curdist = dist; -+ currObj = obj; -+ } +@@ -16,6 +16,9 @@ typedef unsigned int linklistsizeint; + template + class HierarchicalNSW : public AlgorithmInterface { + public: ++ bool base_layer_only = false; ++ int num_seeds = 32; ++ bool base_layer_init = true; + static const tableint MAX_LABEL_OPERATION_LOCKS = 65536; + static const unsigned char DELETE_MARK = 0x01; + +@@ -1098,7 +1101,7 @@ class HierarchicalNSW : public AlgorithmInterface { + + std::unique_lock lock_el(link_list_locks_[cur_c]); + int curlevel = getRandomLevel(mult_); +- if (level > 0) ++ if (level > -1) + curlevel = level; + + element_levels_[cur_c] = curlevel; +@@ -1116,6 +1119,9 @@ class HierarchicalNSW : public AlgorithmInterface { + memcpy(getExternalLabeLp(cur_c), &label, sizeof(labeltype)); + memcpy(getDataByInternalId(cur_c), data_point, data_size_); + ++ if (!base_layer_init && curlevel == 0) ++ return cur_c; ++ + if (curlevel) { + linkLists_[cur_c] = (char *) malloc(size_links_per_element_ * curlevel + 1); + if (linkLists_[cur_c] == nullptr) +@@ -1138,7 +1144,7 @@ class HierarchicalNSW : public AlgorithmInterface { + tableint *datal = (tableint *) (data + 1); + for (int i = 0; i < size; i++) { + tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) ++ if (static_cast(cand) < 0 || cand > max_elements_) + throw std::runtime_error("cand error"); + dist_t d = fstdistfunc_(data_point, getDataByInternalId(cand), dist_func_param_); + if (d < curdist) { +@@ -1188,28 +1194,41 @@ class HierarchicalNSW : public AlgorithmInterface { + tableint currObj = enterpoint_node_; + dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); + +- for (int level = maxlevel_; level > 0; level--) { +- bool changed = true; +- while (changed) { +- changed = false; +- unsigned int *data; ++ if (base_layer_only) { ++ // You can increase the number of seeds when testing large-scale dataset, num_seeds = 48 for 100M-scale ++ for (int i = 0; i < num_seeds; i++) { ++ tableint obj = i * (max_elements_ / num_seeds); ++ dist_t dist = fstdistfunc_(query_data, getDataByInternalId(obj), dist_func_param_); ++ if (dist < curdist) { ++ curdist = dist; ++ currObj = obj; + } + } -+ else{ -+ for (int level = maxlevel_; level > 0; level--) { -+ bool changed = true; -+ while (changed) { -+ changed = false; -+ unsigned int *data; - -- data = (unsigned int *) get_linklist(currObj, level); -- int size = getListCount(data); -- metric_hops++; -- metric_distance_computations+=size; -+ data = (unsigned int *) get_linklist(currObj, level); -+ int size = getListCount(data); -+ metric_hops++; -+ metric_distance_computations+=size; - -- tableint *datal = (tableint *) (data + 1); -- for (int i = 0; i < size; i++) { -- tableint cand = datal[i]; -- if (cand < 0 || cand > max_elements_) -- throw std::runtime_error("cand error"); -- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); -+ tableint *datal = (tableint *) (data + 1); -+ for (int i = 0; i < size; i++) { -+ tableint cand = datal[i]; -+ if (cand > max_elements_) -+ throw std::runtime_error("cand error"); -+ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); - -- if (d < curdist) { -- curdist = d; -- currObj = cand; -- changed = true; -+ if (d < curdist) { -+ curdist = d; -+ currObj = cand; -+ changed = true; -+ 
} - } ++ } ++ else { ++ for (int level = maxlevel_; level > 0; level--) { ++ bool changed = true; ++ while (changed) { ++ changed = false; ++ unsigned int *data; + +- data = (unsigned int *) get_linklist(currObj, level); +- int size = getListCount(data); +- metric_hops++; +- metric_distance_computations+=size; ++ data = (unsigned int *) get_linklist(currObj, level); ++ int size = getListCount(data); ++ metric_hops++; ++ metric_distance_computations+=size; ++ ++ tableint *datal = (tableint *) (data + 1); ++ for (int i = 0; i < size; i++) { ++ tableint cand = datal[i]; ++ if (static_cast(cand) < 0 || cand > max_elements_) ++ throw std::runtime_error("cand error"); ++ dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); + +- tableint *datal = (tableint *) (data + 1); +- for (int i = 0; i < size; i++) { +- tableint cand = datal[i]; +- if (cand < 0 || cand > max_elements_) +- throw std::runtime_error("cand error"); +- dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_); +- +- if (d < curdist) { +- curdist = d; +- currObj = cand; +- changed = true; ++ if (d < curdist) { ++ curdist = d; ++ currObj = cand; ++ changed = true; ++ } } } + } diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h -index 4413537..c3240f3 100644 +index 834d19f..0c0af26 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h -@@ -252,13 +252,14 @@ namespace hnswlib { - ~L2Space() {} - }; - -+ template - static int - L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) { - - size_t qty = *((size_t *) qty_ptr); - int res = 0; -- unsigned char *a = (unsigned char *) pVect1; -- unsigned char *b = (unsigned char *) pVect2; -+ T *a = (T *) pVect1; -+ T *b = (T *) pVect2; - - qty = qty >> 2; - for (size_t i = 0; i < qty; i++) { -@@ -279,11 +280,12 @@ namespace hnswlib { - return (res); - } - -+ template - static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) { - size_t qty = *((size_t*)qty_ptr); - int res = 0; -- unsigned char* a = (unsigned char*)pVect1; -- unsigned char* b = (unsigned char*)pVect2; -+ T* a = (T*)pVect1; -+ T* b = (T*)pVect2; - - for(size_t i = 0; i < qty; i++) - { -@@ -294,6 +296,7 @@ namespace hnswlib { - return (res); - } - -+ template - class L2SpaceI : public SpaceInterface { - - DISTFUNC fstdistfunc_; -@@ -302,10 +305,10 @@ namespace hnswlib { - public: - L2SpaceI(size_t dim) { - if(dim % 4 == 0) { -- fstdistfunc_ = L2SqrI4x; -+ fstdistfunc_ = L2SqrI4x; - } - else { -- fstdistfunc_ = L2SqrI; -+ fstdistfunc_ = L2SqrI; - } - dim_ = dim; - data_size_ = dim * sizeof(unsigned char); -diff --git a/hnswlib/visited_list_pool.h b/hnswlib/visited_list_pool.h -index 5e1a4a5..4195ebd 100644 ---- a/hnswlib/visited_list_pool.h -+++ b/hnswlib/visited_list_pool.h -@@ -3,6 +3,7 @@ - #include - #include - #include -+#include - - namespace hnswlib { - typedef unsigned short int vl_type; -@@ -14,7 +15,7 @@ namespace hnswlib { - unsigned int numelements; - - VisitedList(int numelements1) { -- curV = -1; -+ curV = std::numeric_limits::max(); - numelements = numelements1; - mass = new vl_type[numelements]; +@@ -252,12 +252,13 @@ class L2Space : public SpaceInterface { + ~L2Space() {} + }; + ++template + static int + L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + int res = 0; +- unsigned char *a = (unsigned char *) pVect1; +- unsigned char *b = (unsigned char *) pVect2; 
++ T *a = (T *) pVect1; ++ T *b = (T *) pVect2; + + qty = qty >> 2; + for (size_t i = 0; i < qty; i++) { +@@ -277,11 +278,12 @@ L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const voi + return (res); + } + ++template + static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) { + size_t qty = *((size_t*)qty_ptr); + int res = 0; +- unsigned char* a = (unsigned char*)pVect1; +- unsigned char* b = (unsigned char*)pVect2; ++ T* a = (T*)pVect1; ++ T* b = (T*)pVect2; + + for (size_t i = 0; i < qty; i++) { + res += ((*a) - (*b)) * ((*a) - (*b)); +@@ -291,6 +293,7 @@ static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, + return (res); + } + ++template + class L2SpaceI : public SpaceInterface { + DISTFUNC fstdistfunc_; + size_t data_size_; +@@ -299,9 +302,9 @@ class L2SpaceI : public SpaceInterface { + public: + L2SpaceI(size_t dim) { + if (dim % 4 == 0) { +- fstdistfunc_ = L2SqrI4x; ++ fstdistfunc_ = L2SqrI4x; + } else { +- fstdistfunc_ = L2SqrI; ++ fstdistfunc_ = L2SqrI; } --- -2.43.0 - + dim_ = dim; + data_size_ = dim * sizeof(unsigned char); diff --git a/cpp/cmake/patches/hnswlib_override.json b/cpp/cmake/patches/hnswlib_override.json index aef2da772..c50220e24 100644 --- a/cpp/cmake/patches/hnswlib_override.json +++ b/cpp/cmake/patches/hnswlib_override.json @@ -1,16 +1,16 @@ { - "packages" : { - "hnswlib" : { - "version": "0.6.2", - "git_url": "https://github.com/nmslib/hnswlib.git", - "git_tag": "v${version}", - "patches" : [ - { - "file" : "${current_json_dir}/hnswlib.diff", - "issue" : "Correct compilation issues", - "fixed_in" : "" - } - ] - } + "packages": { + "hnswlib": { + "version": "0.7.0", + "git_url": "https://github.com/nmslib/hnswlib.git", + "git_tag": "v${version}", + "patches": [ + { + "file": "${current_json_dir}/hnswlib.diff", + "issue": "Correct compilation issues", + "fixed_in": "" + } + ] } - } \ No newline at end of file + } +} \ No newline at end of file diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake index 2e6c895e5..5b4d89aa2 100644 --- a/cpp/cmake/thirdparty/get_hnswlib.cmake +++ b/cpp/cmake/thirdparty/get_hnswlib.cmake @@ -15,6 +15,7 @@ #============================================================================= function(find_and_configure_hnswlib) + message(STATUS "Finding or building hnswlib") set(oneValueArgs) include(${rapids-cmake-dir}/cpm/package_override.cmake) diff --git a/cpp/include/cuvs/core/c_api.h b/cpp/include/cuvs/core/c_api.h index c8c8d3934..400d162ad 100644 --- a/cpp/include/cuvs/core/c_api.h +++ b/cpp/include/cuvs/core/c_api.h @@ -151,6 +151,22 @@ cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent, */ cuvsError_t cuvsRMMMemoryResourceReset(); +/** + * @brief Allocates pinned memory on the host using RMM + * @param[out] ptr Pointer to allocated host memory + * @param[in] bytes Size in bytes to allocate + * @return cuvsError_t + */ +cuvsError_t cuvsRMMHostAlloc(void** ptr, size_t bytes); + +/** + * @brief Deallocates pinned memory on the host using RMM + * @param[in] ptr Pointer to allocated host memory to free + * @param[in] bytes Size in bytes to deallocate + * @return cuvsError_t + */ +cuvsError_t cuvsRMMHostFree(void* ptr, size_t bytes); + /** @} */ #ifdef __cplusplus diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 5ceb3010e..a4684ce26 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp 
@@ -272,6 +272,10 @@ static_assert(std::is_aggregate_v);
  */
 template
 struct index : cuvs::neighbors::index {
+  using index_params_type  = cagra::index_params;
+  using search_params_type = cagra::search_params;
+  using index_type         = IdxT;
+  using value_type         = T;
   static_assert(!raft::is_narrowing_v,
                 "IdxT must be able to represent all values of uint32_t");
diff --git a/cpp/include/cuvs/neighbors/dynamic_batching.hpp b/cpp/include/cuvs/neighbors/dynamic_batching.hpp
new file mode 100644
index 000000000..410800357
--- /dev/null
+++ b/cpp/include/cuvs/neighbors/dynamic_batching.hpp
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+namespace cuvs::neighbors::dynamic_batching {
+
+namespace detail {
+template
+class batch_runner;
+}
+
+/**
+ * @defgroup dynamic_batching_cpp_index_params Dynamic Batching index parameters
+ * @{
+ */
+struct index_params : cuvs::neighbors::index_params {
+  /** The number of neighbors to search is fixed at construction time. */
+  int64_t k;
+  /** Maximum size of the batch to submit to the upstream index. */
+  int64_t max_batch_size = 100;
+  /**
+   * The number of independent request queues.
+   *
+   * Each queue is associated with a unique CUDA stream and IO device buffers. If the number of
+   * concurrent requests is high, using multiple queues allows filling in data and preparing the
+   * batch while the other queue is busy. Moreover, the queues are submitted concurrently; this
+   * allows better utilization of the GPU by hiding the kernel launch latencies, which helps to
+   * improve the throughput.
+   */
+  size_t n_queues = 3;
+  /**
+   * By default (`conservative_dispatch = false`) the first CPU thread to commit a query to a
+   * batch dispatches the upstream search function as soon as possible (before the batch is full).
+   * In that case, it does not know the final batch size at the time of calling the upstream
+   * search and thus runs the upstream search with the maximum batch size every time, even if only
+   * one valid query is present in the batch. This reduces the latency at the cost of wasted GPU
+   * resources.
+   *
+   * The alternative behavior (`conservative_dispatch = true`) is more conservative: the
+   * dispatcher thread starts the kernel that gathers input queries, but waits till the batch is
+   * full or the waiting time is exceeded. Only then does it acquire the actual batch size and
+   * launch the upstream search. As a result, fewer GPU resources are wasted at the cost of
+   * exposing upstream search latency.
+   *
+   * *Rule of Thumb*:
+   *    for a large `max_batch_size` set `conservative_dispatch = true`, otherwise keep it disabled.
+   */
+  bool conservative_dispatch = false;
+};
+/** @} */
+
+/**
+ * @defgroup dynamic_batching_cpp_search_params Dynamic Batching search parameters
+ * @{
+ */
+struct search_params : cuvs::neighbors::search_params {
+  /**
+   * How long a request can stay in the queue (milliseconds).
+   * Note, this only affects the dispatch time and does not reflect full request latency;
+   * the latter depends on the upstream search parameters and the batch size.
+   */
+  double dispatch_timeout_ms = 1.0;
+};
+/** @} */
+
+/**
+ * @defgroup dynamic_batching_cpp_index Dynamic Batching index type
+ * @{
+ */
+
+/**
+ * @brief Lightweight dynamic batching index wrapper
+ *
+ * @tparam T data type
+ * @tparam IdxT index type
+ *
+ * One lightweight dynamic batching index manages a single index and a single search parameter
+ * set. This structure should be shared among multiple users via copy semantics: access to the
+ * underlying implementation is managed via a shared pointer, and concurrent search among the
+ * participants is thread-safe.
+ *
+ * __Usage example__
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // When creating a dynamic batching index, k parameter has to be passed explicitly.
+ *   // The first empty braces default-initialize the parent `neighbors::index_params` (unused).
+ *   dynamic_batching::index_params dynb_index_params{{}, k};
+ *   // Construct the index by wrapping the upstream index and search parameters.
+ *   dynamic_batching::index index{
+ *     res, dynb_index_params, upstream_index, upstream_search_params
+ *   };
+ *   // Use default search parameters
+ *   dynamic_batching::search_params search_params;
+ *   // Search K nearest neighbours
+ *   auto neighbors = raft::make_device_matrix(res, n_queries, k);
+ *   auto distances = raft::make_device_matrix(res, n_queries, k);
+ *   dynamic_batching::search(
+ *     res, search_params, index, queries, neighbors.view(), distances.view()
+ *   );
+ * @endcode
+ *
+ *
+ * __Priority queues__
+ *
+ * The dynamic batching index has limited support for prioritizing individual requests.
+ * There's only one pool of queues in the batcher and no functionality to prioritize one batch
+ * over the other. The `search_params::dispatch_timeout_ms` parameters passed in each request are
+ * aggregated internally and the batch is dispatched no later than when any of the timeouts is
+ * exceeded. In this logic, a high-priority request can never be processed earlier than any
+ * lower-priority requests submitted earlier.
+ *
+ * However, dynamic batching indexes are lightweight and do not contain any global or static
+ * state. This means it's easy to combine multiple batchers.
+ * As an example, you can construct one batching index per priority class:
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // Large batch size (128), couple queues (2),
+ *   //   enabled conservative dispatch - all for better throughput
+ *   dynamic_batching::index_params low_priority_params{{}, k, 128, 2, true};
+ *   // Small batch size (16), more queues (4),
+ *   //   disabled conservative dispatch - to minimize latency with reasonable throughput
+ *   dynamic_batching::index_params high_priority_params{{}, k, 16, 4, false};
+ *   // Construct the indexes by wrapping the upstream index and search parameters.
+ * dynamic_batching::index low_priority_index{ + * res, low_priority_params, upstream_index, upstream_search_params + * }; + * dynamic_batching::index high_priority_index{ + * res, high_priority_params, upstream_index, upstream_search_params + * }; + * // Define a combined search function with priority selection + * double high_priority_threshold_ms = 0.1; + * auto search_function = + * [low_priority_index, high_priority_index, high_priority_threshold_ms]( + * raft::resources const &res, + * dynamic_batching::search_params search_params, + * raft::device_matrix_view queries, + * raft::device_matrix_view neighbors, + * raft::device_matrix_view distances) { + * dynamic_batching::search( + * res, + * search_params, + * search_params.dispatch_timeout_ms < high_priority_threshold_ms + * ? high_priority_index : low_priority_index, + * queries, + * neighbors, + * distances + * ); + * }; + * @endcode + */ +template +struct index : cuvs::neighbors::index { + std::shared_ptr> runner; + + /** + * @brief Construct a dynamic batching index by wrapping the upstream index. + * + * @tparam Upstream the upstream index type + * + * @param[in] res raft resources + * @param[in] params dynamic batching parameters + * @param[in] upstream_index the original index to perform the search + * (the reference must be alive for the lifetime of the dynamic batching index) + * @param[in] upstream_params the original index search parameters for all queries in a batch + * (the parameters are captured by value for the lifetime of the dynamic batching index) + * @param[in] sample_filter + * filtering function, if any, must be the same for all requests in a batch + * (the pointer must be alive for the lifetime of the dynamic batching index) + */ + template + index(const raft::resources& res, + const cuvs::neighbors::dynamic_batching::index_params& params, + const Upstream& upstream_index, + const typename Upstream::search_params_type& upstream_params, + const cuvs::neighbors::filtering::base_filter* sample_filter = nullptr); +}; +/** @} */ + +/** + * + * @defgroup dynamic_batching_cpp_search Dynamic Batching search + * + * @{ + */ + +/** + * @brief Search ANN using a dynamic batching index. + * + * The search parameters of the upstream index and the optional filtering function are configured at + * the dynamic batching index construction time. + * + * Like with many other indexes, the dynamic batching search has the stream-ordered semantics: the + * host function may return the control before the results are ready. Synchronize with the main CUDA + * stream in the given resource object to wait for arrival of the search results. + * + * Dynamic batching search is thread-safe: call the search function with copies of the same index in + * multiple threads to increase the occupancy of the batches. 
+ * + * @param[in] res + * @param[in] params query-specific batching parameters, such as the maximum waiting time + * @param[in] index a dynamic batching index + * @param[in] queries a device matrix view to a row-major matrix + * [n_queries, dim] + * @param[out] neighbors a device matrix view to the indices of the neighbors in the source dataset + * [n_queries, k] + * @param[out] distances a device matrix view to the distances to the selected neighbors + * [n_queries, k] + * + */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @copydoc search */ +void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + dynamic_batching::index const& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances); + +/** @} */ + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/include/cuvs/neighbors/hnsw.h b/cpp/include/cuvs/neighbors/hnsw.h index 0495c574a..b7eda54b8 100644 --- a/cpp/include/cuvs/neighbors/hnsw.h +++ b/cpp/include/cuvs/neighbors/hnsw.h @@ -16,6 +16,8 @@ #pragma once +#include "cagra.h" + #include #include #include @@ -27,32 +29,51 @@ extern "C" { #endif /** - * @defgroup hnsw_c_search_params C API for hnswlib wrapper search params + * @defgroup hnsw_c_index_params C API for HNSW index params * @{ */ -struct cuvsHnswSearchParams { - int32_t ef; - int32_t numThreads; +/** + * @brief Hierarchy for HNSW index when converting from CAGRA index + * + * NOTE: When the value is `NONE`, the HNSW index is built as a base-layer-only index. 
+ */ +enum cuvsHnswHierarchy { + /* Flat hierarchy, search is base-layer only */ + NONE, + /* Full hierarchy is built using the CPU */ + CPU }; -typedef struct cuvsHnswSearchParams* cuvsHnswSearchParams_t; +struct cuvsHnswIndexParams { + /* hierarchy of the hnsw index */ + cuvsHnswHierarchy hierarchy; + /** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/ + int ef_construction; + /** Number of host threads to use to construct hierarchy when hierarchy is `CPU` + NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive + to parallelism, and increasing the number of threads can reduce the quality of the index. + */ + int num_threads; +}; + +typedef struct cuvsHnswIndexParams* cuvsHnswIndexParams_t; /** - * @brief Allocate HNSW search params, and populate with default values + * @brief Allocate HNSW Index params, and populate with default values * - * @param[in] params cuvsHnswSearchParams_t to allocate + * @param[in] params cuvsHnswIndexParams_t to allocate * @return cuvsError_t */ -cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params); +cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params); /** - * @brief De-allocate HNSW search params + * @brief De-allocate HNSW Index params * - * @param[in] params cuvsHnswSearchParams_t to de-allocate + * @param[in] params * @return cuvsError_t */ -cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params); +cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params); /** * @} @@ -90,6 +111,184 @@ cuvsError_t cuvsHnswIndexCreate(cuvsHnswIndex_t* index); */ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); +/** + * @} + */ + +/** + * @defgroup hnsw_c_extend_params Parameters for extending HNSW index + * @{ + */ + +struct cuvsHnswExtendParams { + /** Number of CPU threads used to extend additional vectors */ + int num_threads; +}; + +typedef struct cuvsHnswExtendParams* cuvsHnswExtendParams_t; + +/** + * @brief Allocate HNSW extend params, and populate with default values + * + * @param[in] params cuvsHnswExtendParams_t to allocate + * @return cuvsError_t + */ +cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params); + +/** + * @brief De-allocate HNSW extend params + * + * @param[in] params cuvsHnswExtendParams_t to de-allocate + * @return cuvsError_t + */ + +cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params); + +/** + * @} + */ + +/** + * @defgroup hnsw_c_index_load Load CAGRA index as hnswlib index + * @{ + */ + +/** + * @brief Convert a CAGRA Index to an HNSW index. + * NOTE: When hierarchy is: + * 1. `NONE`: This method uses the filesystem to write the CAGRA index in + * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as + * the format is not compatible with the original hnswlib. + * 2. `CPU`: The returned index is mutable and can be extended with additional vectors. The + * serialized index is also compatible with the original hnswlib library. 
+ * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] params cuvsHnswIndexParams_t used to load Hnsw index + * @param[in] cagra_index cuvsCagraIndex_t to convert to HNSW index + * @param[out] hnsw_index cuvsHnswIndex_t to return the HNSW index + * + * @return cuvsError_t + * + * @code{.c} + * #include + * #include + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create a CAGRA index with `cuvsCagraBuild` + * + * // Convert the CAGRA index to an HNSW index + * cuvsHnswIndex_t hnsw_index; + * cuvsHnswIndexCreate(&hnsw_index); + * cuvsHnswIndexParams_t hnsw_params; + * cuvsHnswIndexParamsCreate(&hnsw_params); + * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index); + * + * // de-allocate `hnsw_params`, `hnsw_index` and `res` + * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params); + * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index); + * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res); + * @endcode + */ +cuvsError_t cuvsHnswFromCagra(cuvsResources_t res, + cuvsHnswIndexParams_t params, + cuvsCagraIndex_t cagra_index, + cuvsHnswIndex_t hnsw_index); + +/** + * @} + */ + +/** + * @defgroup hnsw_c_index_extend Extend HNSW index with additional vectors + * @{ + */ + +/** + * @brief Add new vectors to an HNSW index + * NOTE: The HNSW index can only be extended when the hierarchy is `CPU` + * when converting from a CAGRA index. + + * @param[in] res cuvsResources_t opaque C handle + * @param[in] params cuvsHnswExtendParams_t used to extend Hnsw index + * @param[in] additional_dataset DLManagedTensor* additional dataset to extend the index + * @param[inout] index cuvsHnswIndex_t to extend + * + * @return cuvsError_t + * + * @code{.c} + * #include + * #include + * #include + * + * // Create cuvsResources_t + * cuvsResources_t res; + * cuvsError_t res_create_status = cuvsResourcesCreate(&res); + * + * // create an index with `cuvsCagraBuild` + * + * // Convert the CAGRA index to an HNSW index + * cuvsHnswIndex_t hnsw_index; + * cuvsHnswIndexCreate(&hnsw_index); + * cuvsHnswIndexParams_t hnsw_params; + * cuvsHnswIndexParamsCreate(&hnsw_params); + * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index); + * + * // Extend the HNSW index with additional vectors + * DLManagedTensor additional_dataset; + * cuvsHnswExtendParams_t extend_params; + * cuvsHnswExtendParamsCreate(&extend_params); + * cuvsHnswExtend(res, extend_params, additional_dataset, hnsw_index); + * + * // de-allocate `hnsw_params`, `hnsw_index`, `extend_params` and `res` + * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params); + * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index); + * cuvsError_t extend_params_destroy_status = cuvsHnswExtendParamsDestroy(extend_params); + * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res); + * @endcode + */ + +cuvsError_t cuvsHnswExtend(cuvsResources_t res, + cuvsHnswExtendParams_t params, + DLManagedTensor* additional_dataset, + cuvsHnswIndex_t index); + +/** + * @} + */ + +/** + * @defgroup hnsw_c_search_params C API for hnswlib wrapper search params + * @{ + */ + +struct cuvsHnswSearchParams { + int32_t ef; + int32_t num_threads; +}; + +typedef struct cuvsHnswSearchParams* cuvsHnswSearchParams_t; + +/** + * @brief Allocate HNSW search params, and populate with default values + * + * @param[in] params cuvsHnswSearchParams_t to allocate + * @return 
cuvsError_t + */ +cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params); + +/** + * @brief De-allocate HNSW search params + * + * @param[in] params cuvsHnswSearchParams_t to de-allocate + * @return cuvsError_t + */ +cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params); + /** * @} */ @@ -111,8 +310,8 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); * c. `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 8` * 2. `neighbors`: `kDLDataType.code == kDLUInt` and `kDLDataType.bits = 64` * 3. `distances`: `kDLDataType.code == kDLFloat` and `kDLDataType.bits = 32` - * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, - * as the format is not compatible with the original hnswlib. + * NOTE: When hierarchy is `NONE`, the HNSW index can only be searched by the hnswlib wrapper in + * cuVS, as the format is not compatible with the original hnswlib. * * @code {.c} * #include @@ -131,7 +330,7 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); * cuvsHnswSearchParams_t params; * cuvsError_t params_create_status = cuvsHnswSearchParamsCreate(¶ms); * - * // Search the `index` built using `cuvsHnswBuild` + * // Search the `index` built using `cuvsHnswFromCagra` * cuvsError_t search_status = cuvsHnswSearch(res, params, index, &queries, &neighbors, * &distances); * @@ -142,7 +341,7 @@ cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index); * * @param[in] res cuvsResources_t opaque C handle * @param[in] params cuvsHnswSearchParams_t used to search Hnsw index - * @param[in] index cuvsHnswIndex which has been returned by `cuvsHnswBuild` + * @param[in] index cuvsHnswIndex which has been returned by `cuvsHnswFromCagra` * @param[in] queries DLManagedTensor* queries dataset to search * @param[out] neighbors DLManagedTensor* output `k` neighbors for queries * @param[out] distances DLManagedTensor* output `k` distances for queries @@ -163,9 +362,50 @@ cuvsError_t cuvsHnswSearch(cuvsResources_t res, * @{ */ +/** + * @brief Serialize a CAGRA index to a file as an hnswlib index + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. 
+ *
+ * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] filename the name of the file to save the index
+ * @param[in] index cuvsHnswIndex_t to serialize
+ * @return cuvsError_t
+ *
+ * @code{.c}
+ * #include
+ * #include
+ * #include
+ *
+ * // Create cuvsResources_t
+ * cuvsResources_t res;
+ * cuvsError_t res_create_status = cuvsResourcesCreate(&res);
+ *
+ * // create an index with `cuvsCagraBuild`
+ *
+ * // Convert the CAGRA index to an HNSW index
+ * cuvsHnswIndex_t hnsw_index;
+ * cuvsHnswIndexCreate(&hnsw_index);
+ * cuvsHnswIndexParams_t hnsw_params;
+ * cuvsHnswIndexParamsCreate(&hnsw_params);
+ * cuvsHnswFromCagra(res, hnsw_params, cagra_index, hnsw_index);
+ *
+ * // Serialize the HNSW index
+ * cuvsHnswSerialize(res, "/path/to/index", hnsw_index);
+ *
+ * // de-allocate `hnsw_params`, `hnsw_index` and `res`
+ * cuvsError_t hnsw_params_destroy_status = cuvsHnswIndexParamsDestroy(hnsw_params);
+ * cuvsError_t hnsw_index_destroy_status = cuvsHnswIndexDestroy(hnsw_index);
+ * cuvsError_t res_destroy_status = cuvsResourcesDestroy(res);
+ * @endcode
+ */
+cuvsError_t cuvsHnswSerialize(cuvsResources_t res, const char* filename, cuvsHnswIndex_t index);
+
 /**
  * Load hnswlib index from file which was serialized from a HNSW index.
- * NOTE: The loaded hnswlib index is immutable, and only be read by the
+ * NOTE: When hierarchy is `NONE`, the loaded hnswlib index is immutable and can only be read by the
  * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib.
  * Experimental, both the API and the serialization format are subject to change.
  *
@@ -185,17 +425,22 @@ cuvsError_t cuvsHnswSearch(cuvsResources_t res,
  * // The index should have the same dtype as the one used to build CAGRA the index
  * cuvsHnswIndex_t hnsw_index;
  * cuvsHnswIndexCreate(&hnsw_index);
+ * cuvsHnswIndexParams_t hnsw_params;
+ * cuvsHnswIndexParamsCreate(&hnsw_params);
+ * hnsw_params->hierarchy = NONE;
  * hnsw_index->dtype = index->dtype;
- * cuvsCagraDeserialize(res, "/path/to/index", hnsw_index);
+ * cuvsHnswDeserialize(res, hnsw_params, "/path/to/index", dim, metric, hnsw_index);
  * @endcode
  *
  * @param[in] res cuvsResources_t opaque C handle
+ * @param[in] params cuvsHnswIndexParams_t used to load Hnsw index
  * @param[in] filename the name of the file that stores the index
  * @param[in] dim the dimension of the vectors in the index
  * @param[in] metric the distance metric used to build the index
  * @param[out] index HNSW index loaded from disk
  */
 cuvsError_t cuvsHnswDeserialize(cuvsResources_t res,
+                                cuvsHnswIndexParams_t params,
                                 const char* filename,
                                 int dim,
                                 cuvsDistanceType metric,
diff --git a/cpp/include/cuvs/neighbors/hnsw.hpp b/cpp/include/cuvs/neighbors/hnsw.hpp
index d5abd6d55..f0b433d8e 100644
--- a/cpp/include/cuvs/neighbors/hnsw.hpp
+++ b/cpp/include/cuvs/neighbors/hnsw.hpp
@@ -34,14 +34,30 @@ namespace cuvs::neighbors::hnsw {

 /**
- * @defgroup hnsw_cpp_search_params Build CAGRA index and search with hnswlib
+ * @defgroup hnsw_cpp_index_params hnswlib index wrapper params
  * @{
  */

-struct search_params : cuvs::neighbors::search_params {
-  int ef;               // size of the candidate list
-  int num_threads = 0;  // number of host threads to use for concurrent searches. Value of 0
-                        // automatically maximizes parallelism
+/**
+ * @brief Hierarchy for HNSW index when converting from CAGRA index
+ *
+ * NOTE: When the value is `NONE`, the HNSW index is built as a base-layer-only index.
+ */ +enum class HnswHierarchy { + NONE, // base-layer-only index + CPU // full index with CPU-built hierarchy +}; + +struct index_params : cuvs::neighbors::index_params { + /** Hierarchy build type for HNSW index when converting from CAGRA index */ + HnswHierarchy hierarchy = HnswHierarchy::NONE; + /** Size of the candidate list during hierarchy construction when hierarchy is `CPU`*/ + int ef_construction = 200; + /** Number of host threads to use to construct hierarchy when hierarchy is `CPU` + NOTE: Constructing the hierarchy when converting from a CAGRA graph is highly sensitive + to parallelism, and increasing the number of threads can reduce the quality of the index. + */ + int num_threads = 2; }; /**@}*/ @@ -62,8 +78,12 @@ struct index : cuvs::neighbors::index { * * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. Supported metrics ("L2Expanded", "InnerProduct") + * @param[in] hierarchy hierarchy used for upper HNSW layers */ - index(int dim, cuvs::distance::DistanceType metric) : dim_{dim}, metric_{metric} {} + index(int dim, cuvs::distance::DistanceType metric, HnswHierarchy hierarchy = HnswHierarchy::NONE) + : dim_{dim}, metric_{metric}, hierarchy_{hierarchy} + { + } virtual ~index() {} @@ -76,6 +96,8 @@ struct index : cuvs::neighbors::index { auto metric() const -> cuvs::distance::DistanceType { return metric_; } + auto hierarchy() const -> HnswHierarchy { return hierarchy_; } + /** @brief Set ef for search */ @@ -84,24 +106,41 @@ struct index : cuvs::neighbors::index { private: int dim_; cuvs::distance::DistanceType metric_; + HnswHierarchy hierarchy_; }; /**@}*/ +/** + * @defgroup hnsw_cpp_extend_params HNSW index extend parameters + * @{ + */ + +struct extend_params { + /** Number of host threads to use to add additional vectors to the index. + Value of 0 automatically maximizes parallelism. */ + int num_threads = 0; +}; + /** * @defgroup hnsw_cpp_index_load Load CAGRA index as hnswlib index * @{ */ /** - * @brief Construct an immutable hnswlib base-layer-only index from a CAGRA index - * NOTE: This method uses the filesystem to write the CAGRA index in `/tmp/.bin` - * before reading it as an hnswlib index, then deleting the temporary file. The returned index - * is immutable and can only be searched by the hnswlib wrapper in cuVS, as the format is not - * compatible with the original hnswlib. + * @brief Construct an hnswlib index from a CAGRA index + * NOTE: When `hnsw::index_params.hierarchy` is: + * 1. `NONE`: This method uses the filesystem to write the CAGRA index in + * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as + * the format is not compatible with the original hnswlib. + * 2. `CPU`: The returned index is mutable and can be extended with additional vectors. The + * serialized index is also compatible with the original hnswlib library. 
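// Illustrative sketch of the parameters introduced above (assumes `res`, a CAGRA index
// `cagra_index`, and `using namespace cuvs::neighbors;`): converting with a CPU-built hierarchy
// so the resulting index stays mutable and hnswlib-compatible.
hnsw::index_params hnsw_params;
hnsw_params.hierarchy       = hnsw::HnswHierarchy::CPU;
hnsw_params.ef_construction = 200;  // candidate-list size while building the upper layers
hnsw_params.num_threads     = 2;    // kept small: hierarchy quality is sensitive to parallelism
auto hnsw_index = hnsw::from_cagra(res, hnsw_params, cagra_index);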
* * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] cagra_index cagra index + * @param[in] dataset optional dataset to avoid extra memory copy when hierarchy is `CPU` * * Usage example: * @code{.cpp} @@ -110,24 +149,34 @@ struct index : cuvs::neighbors::index { * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as base-layer-only hnswlib index - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * @endcode */ std::unique_ptr> from_cagra( - raft::resources const& res, const cuvs::neighbors::cagra::index& cagra_index); + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset = + std::nullopt); /** - * @brief Construct an immutable hnswlib base-layer-only index from a CAGRA index - * NOTE: This method uses the filesystem to write the CAGRA index in `/tmp/.bin` - * before reading it as an hnswlib index, then deleting the temporary file. The returned index - * is immutable and can only be searched by the hnswlib wrapper in cuVS, as the format is not - * compatible with the original hnswlib. + * @brief Construct an hnswlib index from a CAGRA index + * NOTE: When `hnsw::index_params.hierarchy` is: + * 1. `NONE`: This method uses the filesystem to write the CAGRA index in + * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as + * the format is not compatible with the original hnswlib. + * 2. `CPU`: The returned index is mutable and can be extended with additional vectors. The + * serialized index is also compatible with the original hnswlib library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] cagra_index cagra index + * @param[in] dataset optional dataset to avoid extra memory copy when hierarchy is `CPU` * * Usage example: * @code{.cpp} @@ -136,24 +185,34 @@ std::unique_ptr> from_cagra( * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as base-layer-only hnswlib index - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * @endcode */ std::unique_ptr> from_cagra( - raft::resources const& res, const cuvs::neighbors::cagra::index& cagra_index); + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset = + std::nullopt); /** - * @brief Construct an immutable hnswlib base-layer-only index from a CAGRA index - * NOTE: This method uses the filesystem to write the CAGRA index in `/tmp/.bin` - * before reading it as an hnswlib index, then deleting the temporary file. The returned index - * is immutable and can only be searched by the hnswlib wrapper in cuVS, as the format is not - * compatible with the original hnswlib. 
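// Illustrative sketch of the optional `dataset` argument documented above: passing the original
// host dataset avoids an extra copy when the hierarchy is built on the CPU. Assumes `data` points
// to the [N, D] row-major float host buffer the CAGRA index was built from (names are placeholders).
hnsw::index_params hnsw_params;
hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU;
auto host_view  = raft::make_host_matrix_view<const float, int64_t>(data, N, D);
auto hnsw_index = hnsw::from_cagra(res, hnsw_params, cagra_index, host_view);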
+ * @brief Construct an hnswlib index from a CAGRA index + * NOTE: When `hnsw::index_params.hierarchy` is: + * 1. `NONE`: This method uses the filesystem to write the CAGRA index in + * `/tmp/.bin` before reading it as an hnswlib index, then deleting the temporary + * file. The returned index is immutable and can only be searched by the hnswlib wrapper in cuVS, as + * the format is not compatible with the original hnswlib. + * 2. `CPU`: The returned index is mutable and can be extended with additional vectors. The + * serialized index is also compatible with the original hnswlib library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] cagra_index cagra index + * @param[in] dataset optional dataset to avoid extra memory copy when hierarchy is `CPU` * * Usage example: * @code{.cpp} @@ -162,14 +221,138 @@ std::unique_ptr> from_cagra( * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as base-layer-only hnswlib index - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * @endcode */ std::unique_ptr> from_cagra( - raft::resources const& res, const cuvs::neighbors::cagra::index& cagra_index); + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset = + std::nullopt); + +/**@}*/ + +/** + * @defgroup hnsw_cpp_index_extend Extend HNSW index with additional vectors + * @{ + */ + +/** + * @brief Add new vectors to an HNSW index + * NOTE: The HNSW index can only be extended when the `hnsw::index_params.hierarchy` is `CPU` + * when converting from a CAGRA index. + * + * @param[in] res raft resources + * @param[in] params configure the extend + * @param[in] additional_dataset a host matrix view to a row-major matrix [n_rows, index->dim()] + * @param[inout] idx HNSW index to extend + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * + * // Extend the HNSW index with additional vectors + * auto additional_dataset = raft::make_host_matrix(res, add_size, index->dim()); + * hnsw::extend_params extend_params; + * hnsw::extend(res, extend_params, additional_dataset, *hnsw_index.get()); + */ +void extend(raft::resources const& res, + const extend_params& params, + raft::host_matrix_view additional_dataset, + index& idx); + +/** + * @brief Add new vectors to an HNSW index + * NOTE: The HNSW index can only be extended when the `hnsw::index_params.hierarchy` is `CPU` + * when converting from a CAGRA index. 
+ * + * @param[in] res raft resources + * @param[in] params configure the extend + * @param[in] additional_dataset a host matrix view to a row-major matrix [n_rows, index->dim()] + * @param[inout] idx HNSW index to extend + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * + * // Extend the HNSW index with additional vectors + * auto additional_dataset = raft::make_host_matrix(res, add_size, index->dim()); + * hnsw::extend_params extend_params; + * hnsw::extend(res, extend_params, additional_dataset, *hnsw_index.get()); + */ +void extend(raft::resources const& res, + const extend_params& params, + raft::host_matrix_view additional_dataset, + index& idx); + +/** + * @brief Add new vectors to an HNSW index + * NOTE: The HNSW index can only be extended when the `hnsw::index_params.hierarchy` is `CPU` + * when converting from a CAGRA index. + * + * @param[in] res raft resources + * @param[in] params configure the extend + * @param[in] additional_dataset a host matrix view to a row-major matrix [n_rows, index->dim()] + * @param[inout] idx HNSW index to extend + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * hnsw_params.hierarchy = hnsw::HnswHierarchy::CPU; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * + * // Extend the HNSW index with additional vectors + * auto additional_dataset = raft::make_host_matrix(res, add_size, index->dim()); + * hnsw::extend_params extend_params; + * hnsw::extend(res, extend_params, additional_dataset, *hnsw_index.get()); + */ +void extend(raft::resources const& res, + const extend_params& params, + raft::host_matrix_view additional_dataset, + index& idx); + +/**@} */ + +/** + * @defgroup hnsw_cpp_search_params Build CAGRA index and search with hnswlib + * @{ + */ + +struct search_params : cuvs::neighbors::search_params { + int ef; // size of the candidate list + int num_threads = 0; // number of host threads to use for concurrent searches. Value of 0 + // automatically maximizes parallelism +}; /**@}*/ @@ -181,9 +364,9 @@ std::unique_ptr> from_cagra( */ /** - * @brief Search hnswlib base-layer-only index constructed from a CAGRA index - * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, - * as the format is not compatible with the original hnswlib. + * @brief Search HNSW index constructed from a CAGRA index + * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS when the hierarchy is + * `NONE`, as the format is not compatible with the original hnswlib. 
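// Illustrative sketch tying the extend API above to serialization (assumes a CPU-hierarchy
// `hnsw_index` returned by hnsw::from_cagra and an [n_new, dim] row-major float host matrix
// `new_vectors`; names are placeholders).
hnsw::extend_params ext_params;  // num_threads = 0 -> use all available host threads
hnsw::extend(res, ext_params, raft::make_const_mdspan(new_vectors.view()), *hnsw_index);
hnsw::serialize(res, "extended_index.bin", *hnsw_index);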
* * @param[in] res raft resources * @param[in] params configure the search * @@ -201,10 +384,11 @@ std::unique_ptr> from_cagra( * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as a base-layer HNSW index using the filesystem - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * * // Search K nearest neighbors as an hnswlib index * // using host threads for concurrency @@ -224,9 +408,9 @@ void search(raft::resources const& res, raft::host_matrix_view distances); /** - * @brief Search hnswlib base-layer-only index constructed from a CAGRA index - * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, - * as the format is not compatible with the original hnswlib. + * @brief Search HNSW index constructed from a CAGRA index + * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS when the hierarchy is + * `NONE`, as the format is not compatible with the original hnswlib. * * @param[in] res raft resources * @param[in] params configure the search * @@ -244,10 +428,11 @@ void search(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as a base-layer HNSW index using the filesystem - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * * // Search K nearest neighbors as an hnswlib index * // using host threads for concurrency @@ -267,9 +452,9 @@ void search(raft::resources const& res, raft::host_matrix_view distances); /** - * @brief Search hnswlib base-layer-only index constructed from a CAGRA index - * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS, - * as the format is not compatible with the original hnswlib. + * @brief Search HNSW index constructed from a CAGRA index + * NOTE: The HNSW index can only be searched by the hnswlib wrapper in cuVS when the hierarchy is + * `NONE`, as the format is not compatible with the original hnswlib.
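// Illustrative end-to-end sketch of the search path documented above (assumes a converted float
// `hnsw_index`, dimensionality `dim`, and caller-chosen `n_queries`/`k`; names are placeholders).
hnsw::search_params s_params;
s_params.ef          = 64;  // candidate-list size; must be >= k
s_params.num_threads = 0;   // 0 -> maximize host-side parallelism
auto queries   = raft::make_host_matrix<float, int64_t>(n_queries, dim);
auto neighbors = raft::make_host_matrix<uint64_t, int64_t>(n_queries, k);
auto distances = raft::make_host_matrix<float, int64_t>(n_queries, k);
// ... fill `queries` with the vectors to look up ...
hnsw::search(res, s_params, *hnsw_index,
             raft::make_const_mdspan(queries.view()),
             neighbors.view(), distances.view());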
* * @param[in] res raft resources * @param[in] params configure the search @@ -287,10 +472,11 @@ void search(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // Load CAGRA index as a base-layer HNSW index using the filesystem - * auto hnsw_index = hnsw::from_cagra(res, index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); * * // Search K nearest neighbors as an hnswlib index * // using host threads for concurrency @@ -312,16 +498,106 @@ void search(raft::resources const& res, /**@}*/ /** - * @defgroup hnsw_cpp_index_deserialize Deserialize CAGRA index as hnswlib index + * @defgroup hnsw_cpp_index_serialize Deserialize CAGRA index as hnswlib index * @{ */ +/** + * @brief Serialize a CAGRA index to a file as an hnswlib index + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. + * + * @param[in] res raft resources + * @param[in] filename path to the file to save the serialized CAGRA index + * @param[in] idx cagra index + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * // use default index parameters + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // Save the index + * hnsw::serialize(res, "index.bin", index); + * @endcode + */ +void serialize(raft::resources const& res, const std::string& filename, const index& idx); + +/** + * @brief Serialize a CAGRA index to a file as an hnswlib index + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. 
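// Illustrative sketch of the compatibility note above: an index serialized with hierarchy = CPU
// can be opened by the stock hnswlib library. The hnswlib names below come from its public API;
// `dim`, `query_ptr`, and `k` are placeholders supplied by the caller.
#include <hnswlib/hnswlib.h>
hnswlib::L2Space space(dim);
hnswlib::HierarchicalNSW<float> appr_alg(&space, "index.bin");  // load the serialized index
auto result = appr_alg.searchKnn(query_ptr, k);  // priority queue of (distance, label) pairs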
+ * + * @param[in] res raft resources + * @param[in] filename path to the file to save the serialized CAGRA index + * @param[in] idx cagra index + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * // use default index parameters + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // Save the index + * hnsw::serialize(res, "index.bin", index); + * @endcode + */ +void serialize(raft::resources const& res, const std::string& filename, const index& idx); + +/** + * @brief Serialize a CAGRA index to a file as an hnswlib index + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the + * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. + * + * @param[in] res raft resources + * @param[in] filename path to the file to save the serialized CAGRA index + * @param[in] idx cagra index + * + * Usage example: + * @code{.cpp} + * // Build a CAGRA index + * using namespace cuvs::neighbors; + * // use default index parameters + * cagra::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = cagra::build(res, index_params, dataset); + * + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // Save the index + * hnsw::serialize(res, "index.bin", index); + * @endcode + */ +void serialize(raft::resources const& res, const std::string& filename, const index& idx); + /** * @brief De-serialize a CAGRA index saved to a file as an hnswlib index - * NOTE: The loaded hnswlib index is immutable, and only be read by the + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] filename path to the file containing the serialized CAGRA index * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. 
Supported metrics ("L2Expanded", "InnerProduct") @@ -334,19 +610,23 @@ void search(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // save a CAGRA index to a file - * cagra::serialize(res, index, "index.bin"); - * // De-serialize a CAGRA index as a base-layer HNSW index using the filesystem - * index* hnsw_index = nullptr; - * hnsw::deserialize(res, "index.bin", index->dim(), index->metric(), &hnsw_index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // save HNSW index to a file + * hnsw::serialize(res, "index.bin", hnsw_index); + * // De-serialize the HNSW index + * index* hnsw_index = nullptr; + * hnsw::deserialize(res, hnsw_params, "index.bin", index->dim(), index->metric(), &hnsw_index); * * // Delete index after use * delete hnsw_index; * @endcode */ void deserialize(raft::resources const& res, + const index_params& params, const std::string& filename, int dim, cuvs::distance::DistanceType metric, @@ -354,10 +634,13 @@ void deserialize(raft::resources const& res, /** * @brief De-serialize a CAGRA index saved to a file as an hnswlib index - * NOTE: The loaded hnswlib index is immutable, and only be read by the + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] filename path to the file containing the serialized CAGRA index * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. 
Supported metrics ("L2Expanded", "InnerProduct") @@ -370,19 +653,23 @@ void deserialize(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // save a CAGRA index to a file - * cagra::serialize(res, index, "index.bin"); - * // De-serialize a CAGRA index as a base-layer HNSW index using the filesystem - * index* hnsw_index = nullptr; - * hnsw::deserialize(res, "index.bin", index->dim(), index->metric(), &hnsw_index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // save HNSW index to a file + * hnsw::serialize(res, "index.bin", hnsw_index); + * // De-serialize the HNSW index + * index* hnsw_index = nullptr; + * hnsw::deserialize(res, hnsw_params, "index.bin", index->dim(), index->metric(), &hnsw_index); * * // Delete index after use * delete hnsw_index; * @endcode */ void deserialize(raft::resources const& res, + const index_params& params, const std::string& filename, int dim, cuvs::distance::DistanceType metric, @@ -390,10 +677,13 @@ void deserialize(raft::resources const& res, /** * @brief De-serialize a CAGRA index saved to a file as an hnswlib index - * NOTE: The loaded hnswlib index is immutable, and only be read by the + * NOTE: When hierarchy is `NONE`, the saved hnswlib index is immutable and can only be read by the * hnswlib wrapper in cuVS, as the serialization format is not compatible with the original hnswlib. + * However, when hierarchy is `CPU`, the saved hnswlib index is compatible with the original hnswlib + * library. * * @param[in] res raft resources + * @param[in] params hnsw index parameters * @param[in] filename path to the file containing the serialized CAGRA index * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. 
Supported metrics ("L2Expanded", "InnerProduct") @@ -406,19 +696,23 @@ void deserialize(raft::resources const& res, * // use default index parameters * cagra::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = cagra::build(res, index_params, dataset); + * auto index = cagra::build(res, index_params, dataset); * - * // save a CAGRA index to a file - * cagra::serialize(res, index, "index.bin"); - * // De-serialize a CAGRA index as a base-layer HNSW index using the filesystem - * index* hnsw_index = nullptr; - * hnsw::deserialize(res, "index.bin", index->dim(), index->metric(), &hnsw_index); + * // Load CAGRA index as an HNSW index + * hnsw::index_params hnsw_params; + * auto hnsw_index = hnsw::from_cagra(res, hnsw_params, index); + * // save HNSW index to a file + * hnsw::serialize(res, "index.bin", hnsw_index); + * // De-serialize the HNSW index + * index* hnsw_index = nullptr; + * hnsw::deserialize(res, hnsw_params, "index.bin", index->dim(), index->metric(), &hnsw_index); * * // Delete index after use * delete hnsw_index; * @endcode */ void deserialize(raft::resources const& res, + const index_params& params, const std::string& filename, int dim, cuvs::distance::DistanceType metric, diff --git a/cpp/include/cuvs/neighbors/ivf_flat.hpp b/cpp/include/cuvs/neighbors/ivf_flat.hpp index 7f852d635..e017946d9 100644 --- a/cpp/include/cuvs/neighbors/ivf_flat.hpp +++ b/cpp/include/cuvs/neighbors/ivf_flat.hpp @@ -138,6 +138,10 @@ using list_data = ivf::list; */ template struct index : cuvs::neighbors::index { + using index_params_type = ivf_flat::index_params; + using search_params_type = ivf_flat::search_params; + using index_type = IdxT; + using value_type = T; static_assert(!raft::is_narrowing_v, "IdxT must be able to represent all values of uint32_t"); diff --git a/cpp/include/cuvs/neighbors/ivf_pq.hpp b/cpp/include/cuvs/neighbors/ivf_pq.hpp index ae543c9e9..d85753b7f 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq.hpp +++ b/cpp/include/cuvs/neighbors/ivf_pq.hpp @@ -319,6 +319,9 @@ using list_data = ivf::list; */ template struct index : cuvs::neighbors::index { + using index_params_type = ivf_pq::index_params; + using search_params_type = ivf_pq::search_params; + using index_type = IdxT; static_assert(!raft::is_narrowing_v, "IdxT must be able to represent all values of uint32_t"); diff --git a/cpp/src/core/c_api.cpp b/cpp/src/core/c_api.cpp index cfbeed2d5..4333bff0c 100644 --- a/cpp/src/core/c_api.cpp +++ b/cpp/src/core/c_api.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include extern "C" cuvsError_t cuvsResourcesCreate(cuvsResources_t* res) @@ -130,6 +131,21 @@ extern "C" cuvsError_t cuvsRMMMemoryResourceReset() }); } +thread_local std::unique_ptr pinned_mr; + +extern "C" cuvsError_t cuvsRMMHostAlloc(void** ptr, size_t bytes) +{ + return cuvs::core::translate_exceptions([=] { + if (pinned_mr == nullptr) { pinned_mr = std::make_unique(); } + *ptr = pinned_mr->allocate(bytes); + }); +} + +extern "C" cuvsError_t cuvsRMMHostFree(void* ptr, size_t bytes) +{ + return cuvs::core::translate_exceptions([=] { pinned_mr->deallocate(ptr, bytes); }); +} + thread_local std::string last_error_text = ""; extern "C" const char* cuvsGetLastErrorText() diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh index 2bed19009..fa71dbaf9 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh @@ -129,17 +129,27 @@ struct 
search : search_plan_impl { (sizeof(INDEX_T) + sizeof(DISTANCE_T)) * result_buffer_size_32 + sizeof(INDEX_T) * hashmap::get_size(small_hash_bitlen) + sizeof(INDEX_T) * search_width + sizeof(std::uint32_t) * topk_ws_size + sizeof(std::uint32_t); - smem_size = base_smem_size; + + std::uint32_t additional_smem_size = 0; if (num_itopk_candidates > 256) { // Tentatively calculate the required share memory size when radix // sort based topk is used, assuming the block size is the maximum. if (itopk_size <= 256) { - smem_size += topk_by_radix_sort<256, INDEX_T>::smem_size * sizeof(std::uint32_t); + additional_smem_size += topk_by_radix_sort<256, INDEX_T>::smem_size * sizeof(std::uint32_t); } else { - smem_size += topk_by_radix_sort<512, INDEX_T>::smem_size * sizeof(std::uint32_t); + additional_smem_size += topk_by_radix_sort<512, INDEX_T>::smem_size * sizeof(std::uint32_t); } } + if (!std::is_same_v) { + // For filtering postprocess + using scan_op_t = cub::WarpScan; + additional_smem_size = + std::max(additional_smem_size, sizeof(scan_op_t::TempStorage)); + } + + smem_size = base_smem_size + additional_smem_size; + uint32_t block_size = thread_block_size; if (block_size == 0) { block_size = min_block_size; diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 79cb6bc10..678ed0cb4 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -111,7 +111,7 @@ RAFT_DEVICE_INLINE_FUNCTION void pickup_next_parents(std::uint32_t* const termin } template -RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_1st( +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_and_full( float* candidate_distances, // [num_candidates] IdxT* candidate_indices, // [num_candidates] const std::uint32_t num_candidates, @@ -215,7 +215,7 @@ RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_1st( } template -RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_2nd( +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_and_merge( float* itopk_distances, // [num_itopk] IdxT* itopk_indices, // [num_itopk] const std::uint32_t num_itopk, @@ -424,7 +424,7 @@ RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_2nd( template -RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort( +RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort_and_merge( float* itopk_distances, // [num_itopk] IdxT* itopk_indices, // [num_itopk] const std::uint32_t num_itopk, @@ -437,20 +437,62 @@ RAFT_DEVICE_INLINE_FUNCTION void topk_by_bitonic_sort( const unsigned MULTI_WARPS_2) { // The results in candidate_distances/indices are sorted by bitonic sort. - topk_by_bitonic_sort_1st( + topk_by_bitonic_sort_and_full( candidate_distances, candidate_indices, num_candidates, num_itopk, MULTI_WARPS_1); // The results sorted above are merged with the internal intermediate top-k // results so far using bitonic merge. - topk_by_bitonic_sort_2nd(itopk_distances, - itopk_indices, - num_itopk, - candidate_distances, - candidate_indices, - num_candidates, - work_buf, - first, - MULTI_WARPS_2); + topk_by_bitonic_sort_and_merge(itopk_distances, + itopk_indices, + num_itopk, + candidate_distances, + candidate_indices, + num_candidates, + work_buf, + first, + MULTI_WARPS_2); +} + +// This function move the invalid index element to the end of the itopk list. +// Require : array_length % 32 == 0 && The invalid entry is only one. 
+template +RAFT_DEVICE_INLINE_FUNCTION void move_invalid_to_end_of_list(IdxT* const index_array, + float* const distance_array, + const std::uint32_t array_length) +{ + constexpr std::uint32_t warp_size = 32; + constexpr std::uint32_t invalid_index = utils::get_max_value(); + const std::uint32_t lane_id = threadIdx.x % warp_size; + + if (threadIdx.x >= warp_size) { return; } + + bool found_invalid = false; + if (array_length % warp_size == 0) { + for (std::uint32_t i = lane_id; i < array_length; i += warp_size) { + const auto index = index_array[i]; + const auto distance = distance_array[i]; + + if (found_invalid) { + index_array[i - 1] = index; + distance_array[i - 1] = distance; + } else { + // Check if the index is invalid + const auto I_found_invalid = (index == invalid_index); + const auto who_has_invalid = raft::ballot(I_found_invalid); + // if a value that is loaded by a smaller lane id thread, shift the array + if (who_has_invalid << (warp_size - lane_id)) { + index_array[i - 1] = index; + distance_array[i - 1] = distance; + } + + found_invalid = who_has_invalid; + } + } + } + if (lane_id == 0) { + index_array[array_length - 1] = invalid_index; + distance_array[array_length - 1] = utils::get_max_value(); + } } template @@ -589,10 +631,10 @@ __device__ void search_core( // sort if constexpr (TOPK_BY_BITONIC_SORT) { // [Notice] - // It is good to use multiple warps in topk_by_bitonic_sort() when + // It is good to use multiple warps in topk_by_bitonic_sort_and_merge() when // batch size is small (short-latency), but it might not be always good // when batch size is large (high-throughput). - // topk_by_bitonic_sort() consists of two operations: + // topk_by_bitonic_sort_and_merge() consists of two operations: // if MAX_CANDIDATES is greater than 128, the first operation uses two warps; // if MAX_ITOPK is greater than 256, the second operation used two warps. const unsigned multi_warps_1 = ((blockDim.x >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0; @@ -601,9 +643,9 @@ __device__ void search_core( // reset small-hash table. if ((iter + 1) % small_hash_reset_interval == 0) { // Depending on the block size and the number of warps used in - // topk_by_bitonic_sort(), determine which warps are used to reset + // topk_by_bitonic_sort_and_merge(), determine which warps are used to reset // the small hash and whether they are performed in overlap with - // topk_by_bitonic_sort(). + // topk_by_bitonic_sort_and_merge(). 
_CLK_START(); unsigned hash_start_tid; if (blockDim.x == 32) { @@ -627,28 +669,28 @@ __device__ void search_core( // topk with bitonic sort _CLK_START(); - if (std::is_same::value || - *filter_flag == 0) { - topk_by_bitonic_sort(result_distances_buffer, - result_indices_buffer, - internal_topk, - result_distances_buffer + internal_topk, - result_indices_buffer + internal_topk, - search_width * graph_degree, - topk_ws, - (iter == 0), - multi_warps_1, - multi_warps_2); - __syncthreads(); - } else { - topk_by_bitonic_sort_1st( - result_distances_buffer, - result_indices_buffer, - internal_topk + search_width * graph_degree, - internal_topk, - false); + if (!(std::is_same::value || + *filter_flag == 0)) { + // Move the filtered out index to the end of the itopk list + for (unsigned i = 0; i < search_width; i++) { + move_invalid_to_end_of_list( + result_indices_buffer, result_distances_buffer, internal_topk); + } + if (threadIdx.x == 0) { *terminate_flag = 0; } } + topk_by_bitonic_sort_and_merge( + result_distances_buffer, + result_indices_buffer, + internal_topk, + result_distances_buffer + internal_topk, + result_indices_buffer + internal_topk, + search_width * graph_degree, + topk_ws, + (iter == 0), + multi_warps_1, + multi_warps_2); + __syncthreads(); _CLK_REC(clk_topk); } else { _CLK_START(); @@ -755,12 +797,66 @@ __device__ void search_core( } __syncthreads(); - topk_by_bitonic_sort_1st( - result_distances_buffer, - result_indices_buffer, - internal_topk + search_width * graph_degree, - top_k, - false); + // Move invalid index items to the end of the buffer without sorting the entire buffer + using scan_op_t = cub::WarpScan; + auto& temp_storage = *reinterpret_cast(smem_work_ptr); + + constexpr std::uint32_t warp_size = 32; + if (threadIdx.x < warp_size) { + std::uint32_t num_found_valid = 0; + for (std::uint32_t buffer_offset = 0; buffer_offset < internal_topk; + buffer_offset += warp_size) { + // Calculate the new buffer index + const auto src_position = buffer_offset + threadIdx.x; + const std::uint32_t is_valid_index = + (result_indices_buffer[src_position] & (~index_msb_1_mask)) == invalid_index ? 0 : 1; + std::uint32_t new_position; + scan_op_t(temp_storage).InclusiveSum(is_valid_index, new_position); + if (is_valid_index) { + const auto dst_position = num_found_valid + (new_position - 1); + result_indices_buffer[dst_position] = result_indices_buffer[src_position]; + result_distances_buffer[dst_position] = result_distances_buffer[src_position]; + } + + // Calculate the largest valid position within a warp and bcast it for the next iteration + num_found_valid += new_position; + for (std::uint32_t offset = (warp_size >> 1); offset > 0; offset >>= 1) { + const auto v = raft::shfl_xor(num_found_valid, offset); + if ((threadIdx.x & offset) == 0) { num_found_valid = v; } + } + + // If the enough number of items are found, do early termination + if (num_found_valid >= top_k) { break; } + } + + if (num_found_valid < top_k) { + // Fill the remaining buffer with invalid values so that `topk_by_bitonic_sort_and_merge` is + // usable in the next step + for (std::uint32_t i = num_found_valid + threadIdx.x; i < internal_topk; i += warp_size) { + result_indices_buffer[i] = invalid_index; + result_distances_buffer[i] = utils::get_max_value(); + } + } + } + + // If the sufficient number of valid indexes are not in the internal topk, pick up from the + // candidate list. 
+ if (top_k > internal_topk || result_indices_buffer[top_k - 1] == invalid_index) { + __syncthreads(); + const unsigned multi_warps_1 = ((blockDim.x >= 64) && (MAX_CANDIDATES > 128)) ? 1 : 0; + const unsigned multi_warps_2 = ((blockDim.x >= 64) && (MAX_ITOPK > 256)) ? 1 : 0; + topk_by_bitonic_sort_and_merge( + result_distances_buffer, + result_indices_buffer, + internal_topk, + result_distances_buffer + internal_topk, + result_indices_buffer + internal_topk, + search_width * graph_degree, + topk_ws, + (iter == 0), + multi_warps_1, + multi_warps_2); + } __syncthreads(); } diff --git a/cpp/src/neighbors/detail/dynamic_batching.cuh b/cpp/src/neighbors/detail/dynamic_batching.cuh new file mode 100644 index 000000000..5c6b1654e --- /dev/null +++ b/cpp/src/neighbors/detail/dynamic_batching.cuh @@ -0,0 +1,1197 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../sample_filter.cuh" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef CUVS_SYSTEM_LITTLE_ENDIAN +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define CUVS_SYSTEM_LITTLE_ENDIAN 0 +#else +#define CUVS_SYSTEM_LITTLE_ENDIAN 1 +#endif +#endif + +namespace cuvs::neighbors::dynamic_batching::detail { + +using raft::RAFT_NAME; // TODO: a workaround for RAFT_LOG_XXX macros + +/** + * A helper to make the requester threads more cooperative when busy-spinning. + * It is used in the wait loops across this file to reduce the CPU usage. + * + * Ideally, we should be using atomics notify/wait feature, but that is not always possible + * (e.g. waiting on multiple things or waiting on GPU volatile stores). 
+ */ +struct local_waiter { + static constexpr inline int64_t kNonSleepIterations = 10; + + explicit local_waiter(std::chrono::nanoseconds base_sleep_time, + int64_t start_iteration = 0) noexcept + : base_sleep_time_{base_sleep_time}, iteration_{start_iteration} + { + } + + inline void wait() noexcept + { + if (iteration_ < 2) { + // Don't wait for the first few iterations: + // maybe there's a weak CAS op in the loop, or something else that could return quickly + } else if (iteration_ < kNonSleepIterations) { + std::this_thread::yield(); + } else { + auto k = iteration_ + 1 - kNonSleepIterations; + std::this_thread::sleep_for(base_sleep_time_ * k); + } + ++iteration_; + } + + inline void reset(int64_t start_iteration = 0) noexcept { iteration_ = start_iteration; } + + private: + std::chrono::nanoseconds base_sleep_time_; + int64_t iteration_; +}; + +class cuda_event { + public: + cuda_event(cuda_event&&) = default; + cuda_event& operator=(cuda_event&&) = default; + ~cuda_event() = default; + cuda_event(cuda_event const&) = delete; // Copying disallowed: one event one owner + cuda_event& operator=(cuda_event&) = delete; + + cuda_event() + : event_{[]() { + cudaEvent_t* e = new cudaEvent_t; + RAFT_CUDA_TRY(cudaEventCreateWithFlags(e, cudaEventDisableTiming)); + return e; + }(), + [](cudaEvent_t* e) { + RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(*e)); + delete e; + }} + { + } + + cudaEvent_t value() const { return *event_; } + + private: + std::unique_ptr> event_; +}; + +template +struct get_accessor_type_t { + using type = typename MdSpanOrArray::accessor_type; +}; + +template +struct get_accessor_type_t> { + using mdarray_type = raft::mdarray; + using view_type = typename mdarray_type::view_type; + using type = typename view_type::accessor_type; +}; + +template +using get_accessor_type = typename get_accessor_type_t::type; + +template +constexpr inline auto slice_3d(typename Source3DT::index_type i, + const Source3DT& source3d, + typename Source3DT::index_type n_rows = 0) +{ + using element_type = typename Source3DT::element_type; + using index_type = typename Source3DT::index_type; + using layout_type = typename Source3DT::layout_type; + using accessor_type = get_accessor_type; + auto extent2d = + raft::make_extents(n_rows == 0 ? 
source3d.extent(1) : n_rows, source3d.extent(2)); + auto stride = uint64_t(source3d.extent(1)) * uint64_t(source3d.extent(2)); + return raft::mdspan{ + const_cast(source3d.data_handle()) + stride * i, extent2d}; +} + +template +constexpr inline auto slice_2d(typename Source2DT::index_type i, const Source2DT& source2d) +{ + using element_type = typename Source2DT::element_type; + using index_type = typename Source2DT::index_type; + using layout_type = typename Source2DT::layout_type; + using accessor_type = get_accessor_type; + auto extent1d = raft::make_extents(source2d.extent(1)); + auto stride = uint64_t(extent1d.extent(0)); + return raft::mdspan{ + const_cast(source2d.data_handle()) + stride * i, extent1d}; +} + +// --------------------------------------------- + +constexpr size_t kCacheLineBytes = 64; + +template +using upstream_search_type_const = void(raft::resources const&, + typename Upstream::search_params_type const&, + Upstream const&, + raft::device_matrix_view, + raft::device_matrix_view, + raft::device_matrix_view, + const cuvs::neighbors::filtering::base_filter&); + +template +using upstream_search_type = void(raft::resources const&, + typename Upstream::search_params_type const&, + Upstream&, + raft::device_matrix_view, + raft::device_matrix_view, + raft::device_matrix_view, + const cuvs::neighbors::filtering::base_filter&); + +template +using function_search_type = void(raft::resources const&, + raft::device_matrix_view, + raft::device_matrix_view, + raft::device_matrix_view); + +/** + * State of the batch token slot. + * + * In a nutshell, there are only two batch slot states that matter: empty or full. + * Initially, all slots are empty. The host threads can commit (i.e. subscribe) to a batch slot even + * if it's empty (when they know it will be filled-in at some point in future). With this logic, we + * smooth out the bottleneck that occurs when many threads try to submit their work using a single + * atomic counter (the batch queue head). + * + * Once a GPU IO buffer is available, its owner returns the buffer to the queue by marking a slot as + * full. By that time, it may be partially or fully committed (i.e. several host threads are + * committed to submit a certain number of queries). + * + * If we had an infinite buffer, these two states would suffice. However, we have a finite ring + * buffer, so the used-up slots must be emptied again, so that they are usable in the following + * rounds through the ring buffer. + * + * The slot state depends not only on the value stored in it, but on the accessing thread as well + * (see `batch_queue_t::batch_status` below). The accessing thread may be ahead or behind the others + * (as defined by the sequential order id below). Depending on the accessor state, it may view the + * slot as being emptied/filled in the future, current, or previous rounds. This affects the + * decision whether the slot can be used and whether the thread has the right to advance tail or + * head counters of the batch queue. + * + */ +enum struct slot_state : int32_t { + /** The slot is empty, cleared-up in this round (hence the head should be past it). */ + kEmptyPast = 1025, + /** The slot is empty, cleared-up in previous round. */ + kEmpty = 1024, + /** The slot is empty, cleared-up two round ago and cannot be used yet (due to be filled). */ + kEmptyBusy = 1023, + /** The current thread has been sleeping for too long and is way behind the others. */ + kFullPast = 1, + /** The slot is full, filled-in in this round. 
*/ + kFull = 0, + /** This state is considered full, filled-in in previous round. */ + kFullBusy = -1 + /** The rest of the values are impossible states indicating an error in the algo. */ +}; + +/** + * Identifies the batch and its job-commit state. + * Should be in the pinned memory for fast shared access on CPU and GPU side. + * + * The batch token packs the IO buffer address (id) and a number of committed queries in a single + * 64-bit atomic. This is to allow conflict-free atomic updates of both values. + * + */ +struct batch_token { + uint64_t value = 0; + + constexpr inline batch_token() {} + explicit constexpr inline batch_token(uint32_t buffer_id) { id() = buffer_id; } + + /** + * Sequential id of the batch in the array of batches. + * + * The `id` field, in practice, stores not only the IO buffer address, but also an extra + * sequential "round" id. The latter identifies how many rounds through the batch ring buffer has + * already been done (computed from the the `seq_order_id` counter in the batch queue) and is used + * by `batch_queue_t::batch_status` below to compute the `slot_state`. This is to avoid the ABA + * atomic updates problem when using the ring buffer. + * + * There cannot be more IO buffers than the size of the ring buffer. The size of the ring buffer + * is always a power-of-two. Hence the IO buffer address needs only `log2(Size)` bits, and the + * rest is used for the ring buffer round id (see `batch_queue_t::make_seq_batch_id`). + * + */ + RAFT_INLINE_FUNCTION auto id() noexcept -> uint32_t& + { + return *(reinterpret_cast(&value) + kOffsetOfId); + } + /** + * How many queries are promised by the participating CPU threads (requesters). + * + * The CPU threads atomically increment this counter until its size reaches `max_batch_size`. + * + * Any (CPU or GPU thread) may atomically write to the highest byte of this value, which indicates + * that no one can commit to this batch anymore (e.g. the wait timeout is exceeded). + * Hence, the actual number of committed queries is `size_committed % 0x00ffffff`. + * + * The gather kernel cannot finish while `size_committed < max_batch_size`. + * + * NB: we use the trick of writing to the highest byte to allow GPU write atomically to the pinned + * host memory. This way, we don't need to use device RMW atomics on host memory, which are not + * available on a broad class of GPUs. If not this workaround, we could simply do atomic add/or + * with value 0x01000000. + */ + RAFT_INLINE_FUNCTION auto size_committed() noexcept -> uint32_t& + { + return *(reinterpret_cast(&value) + kOffsetOfSC); + } + + private: + /** Offset of the `id()` value in the token if it's interpreted as uint32_t[2]. */ + static constexpr inline uint32_t kOffsetOfId = CUVS_SYSTEM_LITTLE_ENDIAN; + /** Offset of the `size_committed()` value in the token if it's interpreted as uint32_t[2]. */ + static constexpr inline uint32_t kOffsetOfSC = 1 - kOffsetOfId; +}; +static_assert(sizeof(batch_token) == sizeof(uint64_t)); +static_assert(cuda::std::atomic::is_always_lock_free); + +/** + * The batch queue consists of several ring buffers and two counters determining where are the head + * and the tail of the queue in those buffers. + * + * There is an internal sequentially consistent order in the queue, defined by `seq_order_id` + * counter. The head and tail members define where the participants should look for full and + * empty slots in the queue respectively. + * + * The slots in the queue have their own states (see `slot_state` above). 
The states are updated + * concurrently in many threads, so the head and tail counters do not always accurately represent + * the actual compound state of the queue. + * + * `.head()` is where a host thread starts looking for a batch token. All slots earlier than + * returned by this method are not usable anymore (they batches are either "fully committed", + * dispatched, or emptied earlier). If a host thread determines that the current slot is not usable + * anymore, it increments the counter by calling `.pop()`. + * + * The tail is where a host thread reserves an empty slot to be filled-in by a GPU worker thread + * once it releases the owned IO buffer. There's no `.tail()` method, but `.push()` method returns + * the tail position (before advancing it). `.push()` blocks the host thread until it knows the slot + * isn't used by any other threads anymore (i.e. cleaned-up from the previous round). + * + * There's no strict relation between the head and the tail. + * Normally there is a single batch in the ring buffer being partially filled. It is followed by + * contiguous list of empty idle batches and reserved empty slots. The head and the tail loosely + * correspond to the beginning and the end of this sequence. + * + * Sometimes, the head can go further than the tail. This means all batches are busy and there are + * more threads committed to the slots that are not populated with the batches (and not even + * reserved for filling-in yet). + * + * + */ +template +struct batch_queue_t { + static constexpr uint32_t kSize = Size; + static constexpr uint32_t kMinElemSize = sizeof(uint32_t); + static_assert(cuda::std::atomic::is_always_lock_free, + "The value type must be lock-free."); + static_assert(cuda::std::atomic::is_always_lock_free, + "The value type must be lock-free."); + static_assert(cuda::std::atomic::is_always_lock_free, + "The value type must be lock-free."); + static_assert(raft::is_a_power_of_two(kSize), "The size must be a power-of-two for efficiency."); + + static constexpr auto kMemOrder = cuda::std::memory_order_relaxed; + + /** Type-safe synonym for the internal head & tail counters. */ + struct seq_order_id { + uint32_t value; + }; + + explicit batch_queue_t(const raft::resources& res, bool use_batch_sizes) noexcept + : tokens_{raft::make_pinned_vector, + uint32_t>(res, kSize)}, + rem_time_us_{ + raft::make_pinned_vector, uint32_t>( + res, kSize)}, + dispatch_sequence_id_(kSize), + batch_sizes_{ + use_batch_sizes + ? std::make_optional( + raft::make_pinned_vector, uint32_t>( + res, kSize)) + : std::nullopt} + { + tail_.store(0, kMemOrder); + head_.store(0, kMemOrder); + auto past_seq_id = seq_order_id{static_cast(-1)}; + for (uint32_t i = 0; i < kSize; i++) { + rem_time_us_(i).store(std::numeric_limits::max(), kMemOrder); + if (batch_sizes_.has_value()) { batch_sizes_.value()(i).store(0, kMemOrder); } + dispatch_sequence_id_[i].store(past_seq_id.value, kMemOrder); + tokens_(i).store(make_empty_token(past_seq_id), kMemOrder); + } + } + + /** + * Advance the tail position, ensure the slot is empty, and return the reference to the new slot. + * The calling side is responsible for filling-in the slot with an actual value at a later time. + * + * Conceptually, this method reserves a ring buffer slot on the host side, so that the GPU worker + * thread can return the IO buffer (filling the token slot) asynchronously. 
+ */ + inline auto push() -> seq_order_id + { + seq_order_id seq_id{tail_.fetch_add(1, kMemOrder)}; + auto& loc = token(seq_id); + auto ss = batch_status(loc.load(kMemOrder), seq_id); + /* [Note: very small waiting time] + + Only a few (dispatcher) threads are going to call this function at the same time as opposed to + potentially any number of threads waiting on new batches to arrive. + This is a performance-critical code path. + + Hence the small base sleep time. + */ + local_waiter till_empty{std::chrono::nanoseconds{1000}}; + while (ss == slot_state::kFull || ss == slot_state::kFullBusy || ss == slot_state::kEmptyBusy) { + // Wait till the slot becomes empty (doesn't matter future or past). + // The batch id is only ever updated in the scatter/gather kernels, which are the only source + // of truth whether a batch buffer is currently used by the GPU. + till_empty.wait(); + ss = batch_status(loc.load(kMemOrder), seq_id); + } + return seq_id; + } + + /** + * Return the offset of the given w.r.t. the tail of the queue. + * Negative value means the given slot is in the body of the queue and should be dispatched soon. + * Positive value means the given slot is ahead of the queue and should wait longer. + * + * That is the lower the value the higher the priority. + */ + [[nodiscard]] inline auto niceness(seq_order_id id) const noexcept -> int32_t + { + return static_cast(id.value - tail_.load(kMemOrder)); + } + + /** Get the reference to the first element in the queue. */ + inline auto head() noexcept -> seq_order_id + { + auto h = head_.load(kMemOrder); + // The head cannot go ahead of the tail by more than the queue buffer size. + // If the head is ahead by not more than kSize elements though, everything is fine; + // the slots too far ahead are protected by busy tokens. + local_waiter for_tail(std::chrono::nanoseconds{100000}); + while (static_cast(h - tail_.load(kMemOrder)) >= static_cast(kSize)) { + for_tail.wait(); + h = head_.load(kMemOrder); + } + return seq_order_id{h}; + } + + /** Batch commit state and IO buffer id (see `batch_token`) */ + inline auto token(seq_order_id id) -> cuda::atomic& + { + return tokens_(cache_friendly_idx(id.value)); + } + + /** + * How much time has this batch left for waiting. + * It is an approximate value by design - to minimize the synchronization between CPU and GPU. + * + * The clocks on GPU and CPU may have different values, so the running kernel and the CPU thread + * have different ideas on how much time is left. Rather than trying to synchronize the clocks, we + * maintain independent timers and accept the uncertainty. + * + * Access pattern: CPU write-only (producer); GPU read-only (consumer). + */ + inline auto rem_time_us(seq_order_id id) -> cuda::atomic& + { + return rem_time_us_(cache_friendly_idx(id.value)); + } + + /** + * The actual batch size - the final number of committed queries. + * This is only used if `conservative_dispatch = true`. + */ + inline auto batch_size(seq_order_id id) noexcept + -> cuda::atomic* + { + if (batch_sizes_.has_value()) { return &batch_sizes_.value()(cache_friendly_idx(id.value)); } + return nullptr; + } + + /** + * This value is updated by the host thread after it submits the job completion event to indicate + * to other threads can wait on the event to get the results back. + * Other threads get the value from the batch queue and compare that value against this atomic. + * + * Access pattern: CPU-only; dispatching thread writes the id once, other threads wait on it. 
+ */ + inline auto dispatch_sequence_id(seq_order_id id) -> cuda::std::atomic& + { + return dispatch_sequence_id_[cache_friendly_idx(id.value)]; + } + + /** + * An `atomicMax` on the queue head in disguise. + * This makes the given batch slot and all prior slots unreachable (not possible to commit). + */ + inline void pop(seq_order_id id) noexcept + { + const auto desired = id.value + 1; + auto observed = id.value; + while (observed < desired && + !head_.compare_exchange_weak(observed, desired, kMemOrder, kMemOrder)) {} + } + + static constexpr inline auto batch_id(batch_token token) noexcept -> uint32_t + { + return token.id() & kCounterLocMask; + } + + /** + * Construct a token that is interpreted as having been emptied in the current round + * (the round is derived from seq_id). + * + * NB: "round" is the number of times the queue counters went over the whole ring buffer. + * It's used to avoid the ABA problem for atomic token updates. + */ + static constexpr inline auto make_empty_token(seq_order_id seq_id) noexcept -> batch_token + { + // Modify the seq_id to identify that the token slot is empty + auto empty_round = static_cast(slot_state::kEmptyPast) * kSize; + auto empty_round_id = seq_order_id{seq_id.value + empty_round}; + // Id of empty slot is ignored and can be anything + auto empty_id = kCounterLocMask; + return batch_token{make_seq_batch_id(empty_round_id, empty_id)}; + } + + /** + * Construct a sequential batch id by combining the current round and the real batch id. + * + * The "round" part gives a hint when the token slot was filled-in to avoid the ABA problem + * (see above). + */ + static constexpr inline auto make_seq_batch_id(seq_order_id seq_id, uint32_t batch_id) noexcept + -> uint32_t + { + return seq_round(seq_id) | batch_id; + } + + /** + * Get the state of the batch slot w.r.t. the given seq_order_id counter. + * This gives the information whether the slot is emptied/filled by another thread and whether + * that thread is ahead or behind the current thread. + * By introducing these future/past flavours of states we solve the ABA problem for atomic updates + * of the ring buffer slots. + */ + static inline auto batch_status(batch_token token, seq_order_id seq_id) -> slot_state + { + /* + The "round" part of the id is just a seq_id without the low bits. + Essentially, we comparing here seq_ids of two threads: the one that wrote to the slot in the + past and the one reads from it now. + + `kSize` determines the number of bits we use for the IO buffer id and for the round id. + */ + auto v = + static_cast(seq_round(token) - seq_round(seq_id)) / static_cast(kSize); + if (v < static_cast(slot_state::kFullBusy)) { RAFT_FAIL("Invalid batch state %d", v); } + if (v < static_cast(slot_state::kEmptyBusy)) { + return static_cast(std::min(v, static_cast(slot_state::kFullPast))); + } + return static_cast(std::min(v, static_cast(slot_state::kEmptyPast))); + } + + private: + alignas(kCacheLineBytes) cuda::std::atomic tail_{}; + alignas(kCacheLineBytes) cuda::std::atomic head_{}; + + alignas(kCacheLineBytes) + raft::pinned_vector, uint32_t> tokens_; + raft::pinned_vector, uint32_t> rem_time_us_; + std::vector> dispatch_sequence_id_; + std::optional, uint32_t>> + batch_sizes_; + + /* [Note: cache-friendly indexing] + To avoid false sharing, the queue pushes and pops values not sequentially, but with an + increment that is larger than the cache line size. + Hence we introduce the `kCounterIncrement > kCacheLineBytes`. 
+     However, to make sure all indices are used, we choose the increment to be coprime with the
+     buffer size. We also require that the buffer size is a power-of-two for two reasons:
+       1) Fast modulus operation - reduces to binary `and` (with `kCounterLocMask`).
+       2) Easy to ensure GCD(kCounterIncrement, kSize) == 1 by construction
+          (see the definition below).
+  */
+  static constexpr uint32_t kElemsPerCacheLine =
+    raft::div_rounding_up_safe(kCacheLineBytes, kMinElemSize);
+  static constexpr uint32_t kCounterIncrement = raft::bound_by_power_of_two(kElemsPerCacheLine) + 1;
+  static constexpr uint32_t kCounterLocMask = kSize - 1;
+  // These props hold by design, but we add them here as documentation and a sanity check.
+  static_assert(
+    kCounterIncrement * kMinElemSize >= kCacheLineBytes,
+    "The counter increment should be larger than the cache line size to avoid false sharing.");
+  static_assert(
+    std::gcd(kCounterIncrement, kSize) == 1,
+    "The counter increment and the size must be coprime to allow using all of the queue slots.");
+  /** Map the sequential index onto a cache-friendly strided index. */
+  static constexpr inline auto cache_friendly_idx(uint32_t source_idx) noexcept -> uint32_t
+  {
+    return (source_idx * kCounterIncrement) & kCounterLocMask;
+  }
+
+  /** The "round": the number of times the queue counter went over the whole ring buffer. */
+  static constexpr inline auto seq_round(seq_order_id id) noexcept -> uint32_t
+  {
+    return id.value & ~kCounterLocMask;
+  }
+
+  /** The "round": the number of times the queue counter went over the whole ring buffer. */
+  static constexpr inline auto seq_round(batch_token token) noexcept -> uint32_t
+  {
+    return token.id() & ~kCounterLocMask;
+  }
+};
+
+template
+struct alignas(kCacheLineBytes) request_pointers {
+  /**
+   * A pointer to `dim` values of a single query (input).
+   *
+   * Serves as a synchronization point between the CPU thread (producer) and a GPU block in the
+   * `gather_inputs` kernel (consumer).
+   */
+  cuda::atomic query{nullptr};
+  /** A pointer to `k` nearest neighbors (output) */
+  IdxT* neighbors{nullptr};
+  /** A pointer to distances of `k` nearest neighbors (output) */
+  float* distances{nullptr};
+};
+
+/**
+ * Check the current timestamp at the moment of construction and repeatedly compare the elapsed time
+ * to the timeout value provided by the host (passed via an atomic).
+ *
+ * This is used in the gather inputs kernel to make it stop waiting for new queries in a batch
+ * once the deadline is reached.
+ */
+struct gpu_time_keeper {
+  /**
+   * @param[in] cpu_provided_remaining_time_us
+   *   a pointer to a shared atomic representing the remaining waiting time in microseconds.
+   *   Note, the remaining time is updated atomically by each participating host thread in their
+   *   "private coordinate systems". That's ok, we don't expect a single reference time for all host
+   *   and device threads.
+   *   We tolerate the errors coming from the time difference between a host thread writing its
+   *   remaining waiting time and the GPU thread reading that value.
+   */
+  RAFT_DEVICE_INLINE_FUNCTION explicit gpu_time_keeper(
+    cuda::atomic* cpu_provided_remaining_time_us)
+    : cpu_provided_remaining_time_us_{cpu_provided_remaining_time_us}
+  {
+    update_timestamp();
+  }
+
+  /**
+   * Check whether the deadline has not been reached yet:
+   * 1) Compare the internal clock against the last-read deadline value
+   * 2) Read the deadline value from the host-visible atomic and check the internal clock again.
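+   * Once the time budget is exhausted, the `timeout` flag latches: all subsequent calls return
+   * false without reading the clock or the host atomic again.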
+ */ + RAFT_DEVICE_INLINE_FUNCTION auto has_time() noexcept -> bool + { + if (timeout) { return false; } + update_local_remaining_time(); + if (local_remaining_time_us_ <= 0) { + timeout = true; + return false; + } + update_cpu_provided_remaining_time(); + if (local_remaining_time_us_ <= 0) { + timeout = true; + return false; + } + return true; + } + + private: + cuda::atomic* cpu_provided_remaining_time_us_; + uint64_t timestamp_ns_ = 0; + int32_t local_remaining_time_us_ = std::numeric_limits::max(); + bool timeout = false; + + RAFT_DEVICE_INLINE_FUNCTION void update_timestamp() noexcept + { + asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(timestamp_ns_)); + } + + RAFT_DEVICE_INLINE_FUNCTION void update_local_remaining_time() noexcept + { + auto prev_timestamp = timestamp_ns_; + update_timestamp(); + // subtract the time passed since the last check + // (assuming local time is updated every time timestamp is read) + local_remaining_time_us_ -= static_cast((timestamp_ns_ - prev_timestamp) / 1000ull); + } + + RAFT_DEVICE_INLINE_FUNCTION void update_cpu_provided_remaining_time() noexcept + { + local_remaining_time_us_ = + std::min(local_remaining_time_us_, + cpu_provided_remaining_time_us_->load(cuda::std::memory_order_relaxed)); + } +}; + +/** + * Copy the queries from the submitted pointers to the batch store, one query per block. + * Upon completion of this kernel, the submitted queries are all in the contiguous buffer + * `batch_queries`. + * + * Block size: (n, 1, 1) any number of threads copying a single row of data. + * Grid size: (max_batch_size, 1, 1) - one block per query + * + * Note, we view the incoming queries and the batch as going through multiple stages: + * 1) A host thread "commits" a query: it reserves a slot for the query in the batch and promises + * to fill-in the corresponding query pointer. + * 2) A host thread "submits" the query: it fills-in the pointer to the query data in the reserved + * slot. + * 3) This kernel copies the query data to the contiguous query buffer owned by the batch. + * + * The batch is "fully committed" when the number of committed queries reaches the maximum batch + * size (all slots are reserved). Committing, submitting, and copying of the queries is somewhat + * overlapped among multiple host and device threads. Only the copying happens in a CUDA stream in + * this kernel, and the upstream search is dispatched right after this kernel (in the same stream). + * + */ +template +RAFT_KERNEL gather_inputs( + raft::device_matrix_view batch_queries, + raft::pinned_vector_view, uint32_t> request_ptrs, + /* The remaining time may be updated on the host side: a thread with a tighter deadline may reduce + it (but not increase). */ + cuda::atomic* remaining_time_us, + /* The token contains the current number of queries committed and is cleared in this kernel. */ + cuda::atomic* batch_token_ptr, + /* The host-visible batch size counter (used in `conservative_dispatch`). */ + cuda::atomic* batch_size_out, + /** + * The token value considered empty depends on the round over the ring buffer + * (which is defined by the seq_order_id) + */ + batch_token empty_token_value, + /** + * The counter is used to find the last CTA to finish and to share the batch size with the + * scatter_inputs kernel. 
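+   * (The last block to finish incrementing this counter stores the final committed batch size in
+   * it; scatter_outputs later exchanges the value back to zero.)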
+   */
+  cuda::atomic* kernel_progress_counter)
+{
+  const uint32_t query_id = blockIdx.x;
+  __shared__ const T* query_ptr;
+
+  if (threadIdx.x == 0) {
+    query_ptr = nullptr;
+
+    // NB: we have to read/write to `batch_token_ptr`, `bs_committed`, and `batch_fully_committed`
+    // using volatile assembly ops, because otherwise the compiler seems to fail to understand that
+    // this is the same location in memory. The order of reads and writes here is extremely
+    // important, as it involves multiple host and device threads (the host threads do RMW atomic
+    // increments on the commit counter).
+    volatile uint32_t* bs_committed =
+      reinterpret_cast(batch_token_ptr) + 1 - CUVS_SYSTEM_LITTLE_ENDIAN;
+    volatile uint8_t* batch_fully_committed =
+      reinterpret_cast(bs_committed) + (CUVS_SYSTEM_LITTLE_ENDIAN * 3);
+
+    gpu_time_keeper runtime{remaining_time_us};
+    bool committed = false;  // if the query is committed, we have to wait for it to arrive
+    auto& request_query_ptr = request_ptrs(query_id).query;
+    while (true) {
+      query_ptr = request_query_ptr.load(cuda::std::memory_order_acquire);
+      if (query_ptr != nullptr) {
+        // The query is submitted to this block's slot; erase the pointer buffer for future use and
+        // exit the loop.
+        request_query_ptr.store(nullptr, cuda::std::memory_order_relaxed);
+        break;
+      }
+      // The query hasn't been submitted, but is already committed; other checks may be skipped
+      if (committed) { continue; }
+      // Check if the query is committed
+      uint32_t committed_count;
+      asm volatile("ld.volatile.global.u32 %0, [%1];"
+                   : "=r"(committed_count)
+                   : "l"(bs_committed)
+                   : "memory");
+      committed = (committed_count & 0x00ffffff) > query_id;
+      if (committed) { continue; }
+      // If the query is not committed, but the batch is past the deadline, we exit without copying
+      // the query
+      if (committed_count > 0x00ffffff) { break; }
+      // The query hasn't been submitted yet; check if we're past the deadline
+      if (runtime.has_time()) { continue; }
+      // Otherwise, let the others know the time is out
+      // Set the highest byte of the commit counter to 1 (thus avoiding an RMW atomic)
+      // This prevents any more CPU threads from committing to this batch.
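+      // After setting the flag, the counter is read once more below: a host thread may have
+      // committed this very slot concurrently, in which case we still have to wait for its query
+      // pointer instead of leaving.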
+ asm volatile("st.volatile.global.u8 [%0], %1;" + : + : "l"(batch_fully_committed), "r"(1) + : "memory"); + asm volatile("ld.volatile.global.u32 %0, [%1];" + : "=r"(committed_count) + : "l"(bs_committed) + : "memory"); + committed = (committed_count & 0x00ffffff) > query_id; + if (committed) { continue; } + break; + } + auto progress = kernel_progress_counter->fetch_add(1, cuda::std::memory_order_acq_rel) + 1; + if (progress >= gridDim.x) { + // read the last value of the committed count to know the batch size for sure + uint32_t committed_count; + asm volatile("ld.volatile.global.u32 %0, [%1];" + : "=r"(committed_count) + : "l"(bs_committed) + : "memory"); + committed_count &= 0x00ffffff; // Clear the timeout bit + if (batch_size_out != nullptr) { + // Inform the dispatcher about the final batch size if `conservative_dispatch` is enabled + batch_size_out->store(committed_count, cuda::std::memory_order_relaxed); + } + // store the batch size in the progress counter, so we can read it in the scatter kernel + kernel_progress_counter->store(committed_count, cuda::std::memory_order_relaxed); + // Clear the batch token slot, so it can be re-used by others + asm volatile("st.volatile.global.u64 [%0], %1;" + : + : "l"(reinterpret_cast(batch_token_ptr)), + "l"(reinterpret_cast(empty_token_value)) + : "memory"); + } + } + // The block waits till the leading thread gets the query pointer + cooperative_groups::this_thread_block().sync(); + auto query_ptr_local = query_ptr; + if (query_ptr_local == nullptr) { return; } + // block-wide copy input query + auto dim = batch_queries.extent(1); + for (uint32_t i = threadIdx.x; i < dim; i += blockDim.x) { + batch_queries(query_id, i) = query_ptr_local[i]; + } +} + +/** Copy the results of the search back to the requesters. */ +template +RAFT_KERNEL scatter_outputs( + raft::pinned_vector_view, uint32_t> request_ptrs, + raft::device_matrix_view batch_neighbors, + raft::device_matrix_view batch_distances, + cuda::atomic* kernel_progress_counter, + cuda::atomic* next_token, + uint32_t batch_id) +{ + __shared__ uint32_t batch_size; + if (threadIdx.x == 0 && threadIdx.y == 0) { + batch_size = kernel_progress_counter->exchange(0, cuda::std::memory_order_relaxed); + } + // Copy output + cooperative_groups::this_thread_block().sync(); + auto k = batch_neighbors.extent(1); + for (uint32_t i = threadIdx.y; i < batch_size; i += blockDim.y) { + auto* request_neighbors = request_ptrs(i).neighbors; + auto* request_distances = request_ptrs(i).distances; + for (uint32_t j = threadIdx.x; j < k; j += blockDim.x) { + request_neighbors[j] = batch_neighbors(i, j); + request_distances[j] = batch_distances(i, j); + } + } + // Clear the batch state after all threads copied the data, so the batch can be reused + cuda::atomic_thread_fence(cuda::std::memory_order_release, cuda::thread_scope_system); + cooperative_groups::this_thread_block().sync(); + if (threadIdx.x != 0 || threadIdx.y != 0) { return; } + reinterpret_cast*>( + &reinterpret_cast(next_token)->id()) + ->store(batch_id, cuda::std::memory_order_relaxed); +} + +/** + * Batch runner is shared among the users of the `dynamic_batching::index` (i.e. the index can be + * copied, but the copies hold shared pointers to a single batch runner). + * + * Constructor and destructor of this class do not need to be thread-safe, as their execution is + * guaranteed to happen in one thread by the holding shared pointer. + * + * The search function must be thread-safe. 
We only have to pay attention to the `mutable` members + * though, because the function is marked const. + */ +template +class batch_runner { + public: + constexpr static uint32_t kMaxNumQueues = 256; + + using batch_queue = batch_queue_t; + using seq_order_id = typename batch_queue::seq_order_id; + + // Save the parameters and the upstream batched search function to invoke + template + batch_runner(const raft::resources& res, + const dynamic_batching::index_params& params, + const Upstream& upstream_index, + const typename Upstream::search_params_type& upstream_params, + upstream_search_type_const* upstream_search, + const cuvs::neighbors::filtering::base_filter* sample_filter) + : res_{res}, + upstream_search_{[&upstream_index, upstream_search, upstream_params, sample_filter]( + raft::resources const& res, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { + /* Note: passing sample_filter by pointer + + Ideally, dynamic batching would capture the filter by value. Unfortunately, one cannot use + the copy constructor of the `base_filter` (it would erase the actual filter type). + Therefore, we can only pass the filter by pointer or reference and require the user to keep + the filter alive for the lifetime of the dynamic batching index. + This, however, may lead to a segfault when the user doesn't provide the filter argument and + the argument is passed by reference: the lifetime of the none_sample_filter default argument + is limited to the search function call, so it is destroyed while the dynamic batching index + is still alive. + Hence the solution is to pass the filter by pointer and default it to nullptr. + */ + if (sample_filter == nullptr) { + using base_filter_type = cuvs::neighbors::filtering::base_filter; + const auto none_filter = cuvs::neighbors::filtering::none_sample_filter{}; + return upstream_search(res, + upstream_params, + upstream_index, + queries, + neighbors, + distances, + static_cast(none_filter)); + + } else { + return upstream_search( + res, upstream_params, upstream_index, queries, neighbors, distances, *sample_filter); + } + }}, + k_{uint32_t(params.k)}, + dim_{uint32_t(upstream_index.dim())}, + max_batch_size_{uint32_t(params.max_batch_size)}, + n_queues_{uint32_t(params.n_queues)}, + batch_queue_{res_, params.conservative_dispatch}, + completion_events_(n_queues_), + input_extents_{n_queues_, max_batch_size_, dim_}, + output_extents_{n_queues_, max_batch_size_, k_}, + queries_{raft::make_device_mdarray(res_, input_extents_)}, + neighbors_{raft::make_device_mdarray(res_, output_extents_)}, + distances_{raft::make_device_mdarray(res_, output_extents_)}, + kernel_progress_counters_{ + raft::make_device_vector>( + res_, n_queues_)}, + request_ptrs_{raft::make_pinned_matrix, uint32_t>( + res_, n_queues_, max_batch_size_)} + { + RAFT_CUDA_TRY(cudaMemsetAsync( + kernel_progress_counters_.data_handle(), + 0, + sizeof(*kernel_progress_counters_.data_handle()) * kernel_progress_counters_.size(), + raft::resource::get_cuda_stream(res_))); + // Make sure to initialize the atomic values in the batch_state structs. 
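+    // Each IO buffer is pushed into the ring once here, so the queue initially contains exactly
+    // n_queues_ batches open for commits, with batch ids equal to the buffer indices.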
+ for (uint32_t i = 0; i < n_queues_; i++) { + auto seq_id = batch_queue_.push(); + batch_queue_.token(seq_id).store(batch_token{batch_queue::make_seq_batch_id(seq_id, i)}); + // Make sure to initialize query pointers, because they are used for synchronization + for (uint32_t j = 0; j < max_batch_size_; j++) { + new (&request_ptrs_(i, j)) request_pointers{}; + } + } + } + + // A workaround for algos, which have non-const `index` type in their arguments + template + batch_runner(const raft::resources& res, + const dynamic_batching::index_params& params, + const Upstream& upstream_index, + const typename Upstream::search_params_type& upstream_params, + upstream_search_type* upstream_search, + const cuvs::neighbors::filtering::base_filter* sample_filter) + : batch_runner{ + res, + params, + upstream_index, + upstream_params, + reinterpret_cast*>(upstream_search), + sample_filter} + { + } + + void search(raft::resources const& res, + cuvs::neighbors::dynamic_batching::search_params const& params, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) const + { + uint32_t n_queries = queries.extent(0); + if (n_queries >= max_batch_size_) { + return upstream_search_(res, queries, neighbors, distances); + } + + if (neighbors.extent(1) != int64_t(k_)) { + // TODO: the check can be relaxed to `neighbors.extent(1) > int64_t(k_)`; + // this, however, would require an extra bounds check per-query in the scatter kernel. + RAFT_LOG_WARN( + "The requested number of neighbors (%zd) doesn't match the configured " + "dynamic_batching::index_params::k (%u); dynamic batching is disabled for the request.", + neighbors.extent(1), + k_); + return upstream_search_(res, queries, neighbors, distances); + } + + auto deadline = std::chrono::system_clock::now() + + std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 1000000.0)); + + int64_t local_io_offset = 0; + batch_token batch_token_observed{0}; + local_waiter to_commit{std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 3e5)), + local_waiter::kNonSleepIterations}; + while (true) { + const auto seq_id = batch_queue_.head(); + const auto commit_result = try_commit(seq_id, n_queries); + // The bool (busy or not) returned if no queries were committed: + if (std::holds_alternative(commit_result)) { + // Pause if the system is busy + // (otherwise the progress is guaranteed due to update of the head counter) + if (std::get(commit_result)) { to_commit.wait(); } + continue; // Try to get a new batch token + } + batch_token_observed = std::get(std::get<0>(commit_result)); + const auto queries_committed = std::get(std::get<0>(commit_result)); + const auto batch_offset = batch_token_observed.size_committed(); + auto& batch_token_ref = batch_queue_.token(seq_id); + auto& rem_time_us_ref = batch_queue_.rem_time_us(seq_id); + auto& dispatch_sequence_id_ref = batch_queue_.dispatch_sequence_id(seq_id); + auto* batch_size_ptr = batch_queue_.batch_size(seq_id); + // sleep for 1/10 of deadline time or more + // (if couldn't get the value in the first few iterations). + local_waiter till_full{std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 1e5)), + batch_queue_.niceness(seq_id)}; + while (batch_queue::batch_status(batch_token_observed, seq_id) != slot_state::kFull) { + /* Note: waiting for batch IO buffers + The CPU threads can commit to the incoming batches in the queue in advance (this happens in + try_commit). 
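+        Committing, however, only reserves slots in a batch; the batch's IO buffers may still be
+        owned by an earlier search that has not been scattered back yet.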
+        In this loop, a thread waits for the batch IO buffer to be released by a running search on
+        the GPU side (scatter_outputs kernel). Hence, this loop is engaged only if all buffers are
+        currently used, which suggests that the GPU is busy (or there are not enough IO buffers).
+        This also means the current search is not likely to meet the deadline set by the user.
+
+        The scatter kernel returns its buffer id into an acquired slot in the batch queue; in this
+        loop we wait for that id to arrive.
+
+        Generally, we want to waste as few CPU cycles as possible here to let other threads wait
+        on dispatch_sequence_id_ref below more efficiently. At the same time, we shouldn't use
+        `.wait()` here, because `.notify_all()` would have to come from the GPU.
+        */
+        till_full.wait();
+        batch_token_observed = batch_token_ref.load(cuda::std::memory_order_acquire);
+      }
+      // Whether this thread is responsible for dispatching the batch.
+      bool is_dispatcher = batch_offset == 0;
+      auto stream = raft::resource::get_cuda_stream(res);
+      auto batch_id = batch_queue::batch_id(batch_token_observed);
+      auto request_ptrs = slice_2d(batch_id, request_ptrs_);
+
+      if (is_dispatcher) {
+        // Conservatively initialize the remaining time
+        // TODO (achirkin): this initialization may happen after the other requesters update the
+        // time and thus erase their deadlines.
+        rem_time_us_ref.store(static_cast(params.dispatch_timeout_ms * 1000),
+                              cuda::std::memory_order_relaxed);
+        // run the gather kernel before submitting the data to reduce the latency
+        gather_inputs<<>>(
+          slice_3d(batch_id, queries_),
+          request_ptrs,
+          &rem_time_us_ref,
+          &batch_token_ref,
+          batch_size_ptr,
+          // This indicates the empty token slot, which can only be used in the following round
+          batch_queue::make_empty_token(seq_id),
+          kernel_progress_counters_.data_handle() + batch_id);
+      }
+
+      // *** Set the pointers to queries, neighbors, distances - query-by-query
+      for (uint32_t i = 0; i < queries_committed; i++) {
+        const auto o = local_io_offset + i;
+        auto& ptrs = request_ptrs(batch_offset + i);
+        ptrs.neighbors = neighbors.data_handle() + o * k_;
+        ptrs.distances = distances.data_handle() + o * k_;
+        ptrs.query.store(queries.data_handle() + o * dim_, cuda::std::memory_order_release);
+      }
+
+      // Submit estimated remaining time
+      {
+        auto rem_time_us = static_cast(
+          std::max(0, (deadline - std::chrono::system_clock::now()).count()) / 1000);
+        rem_time_us_ref.fetch_min(rem_time_us, cuda::std::memory_order_relaxed);
+      }
+
+      if (is_dispatcher) {
+        uint32_t batch_size = max_batch_size_;
+        if (batch_size_ptr != nullptr) {
+          // Block until the real batch size is available if conservative dispatch is used.
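+          // (gather_inputs publishes the final committed count through batch_size_ptr; a value of
+          // zero means the kernel has not written it yet.)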
+ local_waiter for_dispatch{ + std::chrono::nanoseconds(size_t(params.dispatch_timeout_ms * 1e5))}; + batch_size = batch_size_ptr->load(cuda::std::memory_order_relaxed); + while (batch_size == 0) { + for_dispatch.wait(); + batch_size = batch_size_ptr->load(cuda::std::memory_order_relaxed); + } + batch_size_ptr->store(0, cuda::std::memory_order_relaxed); + } + auto batch_neighbors = slice_3d(batch_id, neighbors_, batch_size); + auto batch_distances = slice_3d(batch_id, distances_, batch_size); + upstream_search_( + res, slice_3d(batch_id, queries_, batch_size), batch_neighbors, batch_distances); + auto next_seq_id = batch_queue_.push(); + auto& next_token_ref = batch_queue_.token(next_seq_id); + // next_batch_token); + auto bs = dim3(128, 8, 1); + scatter_outputs + <<<1, bs, 0, stream>>>(request_ptrs, + batch_neighbors, + batch_distances, + kernel_progress_counters_.data_handle() + batch_id, + &next_token_ref, + batch_queue::make_seq_batch_id(next_seq_id, batch_id)); + RAFT_CUDA_TRY(cudaEventRecord(completion_events_[batch_id].value(), stream)); + dispatch_sequence_id_ref.store(seq_id.value, cuda::std::memory_order_release); + dispatch_sequence_id_ref.notify_all(); + + } else { + // Wait till the dispatch_sequence_id counter is updated, which means the event is recorded + auto dispatched_id_observed = + dispatch_sequence_id_ref.load(cuda::std::memory_order_acquire); + while (static_cast(seq_id.value - dispatched_id_observed) > 0) { + dispatch_sequence_id_ref.wait(dispatched_id_observed, cuda::std::memory_order_relaxed); + dispatched_id_observed = dispatch_sequence_id_ref.load(cuda::std::memory_order_acquire); + } + // Now we can safely record the event + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, completion_events_[batch_id].value())); + } + + n_queries -= queries_committed; + + if (n_queries == 0) { return; } + // If not all queries were committed, continue in the loop. + // TODO: it could potentially be more efficient to first commit everything and only then + // submit the work/wait for the event + local_io_offset += queries_committed; + to_commit.reset( + local_waiter::kNonSleepIterations); // reset the waiter for the next iteration. + } + } + + private: + raft::resources res_; // Sic! Store by value to copy the resource. + std::function> upstream_search_; + uint32_t k_; + uint32_t dim_; + uint32_t max_batch_size_; + uint32_t n_queues_; + + mutable batch_queue batch_queue_; + std::vector completion_events_; + + using batch_extents = raft::extent_3d; + batch_extents input_extents_; + batch_extents output_extents_; + + mutable raft::device_mdarray queries_; + mutable raft::device_mdarray neighbors_; + mutable raft::device_mdarray distances_; + mutable raft::device_vector> + kernel_progress_counters_; + + mutable raft::pinned_matrix, uint32_t, raft::row_major> request_ptrs_; + + /** + * Try to commit n_queries at most; returns the last observed batch_token (where `size_committed` + * represents offset at which new queries are committed if successful), the number of committed + * queries, or whether the ring buffer appears to be busy (on unsuccessful commit). + */ + auto try_commit(seq_order_id seq_id, uint32_t n_queries) const + -> std::variant, bool> + { + auto& batch_token_ref = batch_queue_.token(seq_id); + batch_token batch_token_observed = batch_token_ref.load(cuda::std::memory_order_relaxed); + batch_token batch_token_updated; + slot_state token_status; + do { + // The interpretation of the token status depends on the current seq_order_id and a similar + // counter in the token. 
This is to prevent conflicts when too many parallel requests wrap + // over the whole ring buffer (batch_queue_t). + token_status = batch_queue::batch_status(batch_token_observed, seq_id); + // Busy status means the current thread is a whole ring buffer ahead of the token. + // The thread should wait for the rest of the system. + if (token_status == slot_state::kFullBusy || token_status == slot_state::kEmptyBusy) { + return true; + } + // This branch checks if the token was recently filled or dispatched. + // This means the head counter of the ring buffer is slightly outdated. + if (token_status == slot_state::kEmptyPast || token_status == slot_state::kFullPast || + batch_token_observed.size_committed() >= max_batch_size_) { + batch_queue_.pop(seq_id); + return false; + } + batch_token_updated = batch_token_observed; + batch_token_updated.size_committed() = + std::min(batch_token_observed.size_committed() + n_queries, max_batch_size_); + } while (!batch_token_ref.compare_exchange_weak(batch_token_observed, + batch_token_updated, + cuda::std::memory_order_acq_rel, + cuda::std::memory_order_relaxed)); + if (batch_token_updated.size_committed() >= max_batch_size_) { + // The batch is already full, let's try to pop it from the queue + // (if nobody has done so already) + batch_queue_.pop(seq_id); + } + return std::make_tuple( + batch_token_observed, + batch_token_updated.size_committed() - batch_token_observed.size_committed()); + } +}; + +} // namespace cuvs::neighbors::dynamic_batching::detail diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index ce1e03264..e129d23e8 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -22,9 +22,63 @@ #include #include #include +#include namespace cuvs::neighbors::hnsw::detail { +// Multithreaded executor +// The helper function is copied from the hnswlib repository +// as for some reason, adding vectors to the hnswlib index does not +// work well with omp parallel for +template +inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) +{ + if (numThreads <= 0) { numThreads = std::thread::hardware_concurrency(); } + + if (numThreads == 1) { + for (size_t id = start; id < end; id++) { + fn(id, 0); + } + } else { + std::vector threads; + std::atomic current(start); + + // keep track of exceptions in threads + // https://stackoverflow.com/a/32428427/1713196 + std::exception_ptr lastException = nullptr; + std::mutex lastExceptMutex; + + for (size_t threadId = 0; threadId < numThreads; ++threadId) { + threads.push_back(std::thread([&, threadId] { + while (true) { + size_t id = current.fetch_add(1); + + if (id >= end) { break; } + + try { + fn(id, threadId); + } catch (...) { + std::unique_lock lastExcepLock(lastExceptMutex); + lastException = std::current_exception(); + /* + * This will work even when current is the largest value that + * size_t can fit, because fetch_add returns the previous value + * before the increment (what will result in overflow + * and produce 0 instead of current + 1). + */ + current = end; + break; + } + } + })); + } + for (auto& thread : threads) { + thread.join(); + } + if (lastException) { std::rethrow_exception(lastException); } + } +} + template struct hnsw_dist_t { using type = void; @@ -54,9 +108,10 @@ struct index_impl : index { * @param[in] filepath path to the index * @param[in] dim dimensions of the training dataset * @param[in] metric distance metric to search. 
Supported metrics ("L2Expanded", "InnerProduct") + * @param[in] hierarchy hierarchy used for upper HNSW layers */ - index_impl(const std::string& filepath, int dim, cuvs::distance::DistanceType metric) - : index{dim, metric} + index_impl(int dim, cuvs::distance::DistanceType metric, HnswHierarchy hierarchy) + : index{dim, metric, hierarchy} { if constexpr (std::is_same_v) { if (metric == cuvs::distance::DistanceType::L2Expanded) { @@ -71,11 +126,6 @@ struct index_impl : index { } RAFT_EXPECTS(space_ != nullptr, "Unsupported metric type was used"); - - appr_alg_ = std::make_unique::type>>( - space_.get(), filepath); - - appr_alg_->base_layer_only = true; } /** @@ -88,14 +138,32 @@ struct index_impl : index { */ void set_ef(int ef) const override { appr_alg_->ef_ = ef; } + /** + @brief Set index + */ + void set_index(std::unique_ptr::type>>&& index) + { + appr_alg_ = std::move(index); + } + + /** + @brief Get space + */ + auto get_space() const -> hnswlib::SpaceInterface::type>* + { + return space_.get(); + } + private: std::unique_ptr::type>> appr_alg_; std::unique_ptr::type>> space_; }; -template -std::unique_ptr> from_cagra(raft::resources const& res, - const cuvs::neighbors::cagra::index& cagra_index) +template +std::enable_if_t>> from_cagra( + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index) { std::random_device dev; std::mt19937 rng(dev()); @@ -103,13 +171,125 @@ std::unique_ptr> from_cagra(raft::resources const& res, auto uuid = std::to_string(dist(rng)); std::string filepath = "/tmp/" + uuid + ".bin"; cuvs::neighbors::cagra::serialize_to_hnswlib(res, filepath, cagra_index); + index* hnsw_index = nullptr; cuvs::neighbors::hnsw::deserialize( - res, filepath, cagra_index.dim(), cagra_index.metric(), &hnsw_index); + res, params, filepath, cagra_index.dim(), cagra_index.metric(), &hnsw_index); std::filesystem::remove(filepath); return std::unique_ptr>(hnsw_index); } +template +std::enable_if_t>> from_cagra( + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset) +{ + // auto host_dataset = raft::make_host_matrix(dataset.extent(0), dataset.extent(1)); + auto host_dataset = raft::make_host_matrix(0, 0); + raft::host_matrix_view host_dataset_view( + host_dataset.data_handle(), host_dataset.extent(0), host_dataset.extent(1)); + if (dataset.has_value()) { + host_dataset_view = dataset.value(); + } else { + // move dataset to host, remove padding + auto cagra_dataset = cagra_index.dataset(); + host_dataset = + raft::make_host_matrix(cagra_dataset.extent(0), cagra_dataset.extent(1)); + RAFT_CUDA_TRY(cudaMemcpy2DAsync(host_dataset.data_handle(), + sizeof(T) * host_dataset.extent(1), + cagra_dataset.data_handle(), + sizeof(T) * cagra_dataset.stride(0), + sizeof(T) * host_dataset.extent(1), + cagra_dataset.extent(0), + cudaMemcpyDefault, + raft::resource::get_cuda_stream(res))); + raft::resource::sync_stream(res); + host_dataset_view = host_dataset.view(); + } + // build upper layers of hnsw index + auto hnsw_index = + std::make_unique>(cagra_index.dim(), cagra_index.metric(), hierarchy); + auto appr_algo = std::make_unique::type>>( + hnsw_index->get_space(), + host_dataset_view.extent(0), + cagra_index.graph().extent(1) / 2, + params.ef_construction); + appr_algo->base_layer_init = false; // tell hnswlib to build upper layers only + ParallelFor(0, host_dataset_view.extent(0), params.num_threads, [&](size_t i, size_t threadId) { + 
appr_algo->addPoint((void*)(host_dataset_view.data_handle() + i * host_dataset_view.extent(1)), + i); + }); + appr_algo->base_layer_init = true; // reset to true to allow addition of new points + + // move cagra graph to host + auto graph = cagra_index.graph(); + auto host_graph = + raft::make_host_matrix(graph.extent(0), graph.extent(1)); + raft::copy(host_graph.data_handle(), + graph.data_handle(), + graph.size(), + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); + +// copy cagra graph to hnswlib base layer +#pragma omp parallel for + for (size_t i = 0; i < static_cast(host_graph.extent(0)); ++i) { + auto ll_i = appr_algo->get_linklist0(i); + appr_algo->setListCount(ll_i, host_graph.extent(1)); + auto* data = (uint32_t*)(ll_i + 1); + for (size_t j = 0; j < static_cast(host_graph.extent(1)); ++j) { + data[j] = host_graph(i, j); + } + } + + hnsw_index->set_index(std::move(appr_algo)); + return hnsw_index; +} + +template +std::unique_ptr> from_cagra( + raft::resources const& res, + const index_params& params, + const cuvs::neighbors::cagra::index& cagra_index, + std::optional> dataset) +{ + if (params.hierarchy == HnswHierarchy::NONE) { + return from_cagra(res, params, cagra_index); + } else if (params.hierarchy == HnswHierarchy::CPU) { + return from_cagra(res, params, cagra_index, dataset); + } + { + RAFT_FAIL("Unsupported hierarchy type"); + } +} + +template +void extend(raft::resources const& res, + const extend_params& params, + raft::host_matrix_view additional_dataset, + index& idx) +{ + auto* hnswlib_index = reinterpret_cast::type>*>( + const_cast(idx.get_index())); + auto current_element_count = hnswlib_index->getCurrentElementCount(); + auto new_element_count = additional_dataset.extent(0); + auto num_threads = params.num_threads == 0 ? std::thread::hardware_concurrency() + : static_cast(params.num_threads); + + hnswlib_index->resizeIndex(current_element_count + new_element_count); + ParallelFor(current_element_count, + current_element_count + new_element_count, + num_threads, + [&](size_t i, size_t threadId) { + hnswlib_index->addPoint( + (void*)(additional_dataset.data_handle() + + (i - current_element_count) * additional_dataset.extent(1)), + i); + }); +} + template void get_search_knn_results(hnswlib::HierarchicalNSW::type> const* idx, const T* query, @@ -171,14 +351,28 @@ void search(raft::resources const& res, } } +template +void serialize(raft::resources const& res, const std::string& filename, const index& idx) +{ + auto* hnswlib_index = reinterpret_cast::type>*>( + const_cast(idx.get_index())); + hnswlib_index->saveIndex(filename); +} + template void deserialize(raft::resources const& res, + const index_params& params, const std::string& filename, int dim, cuvs::distance::DistanceType metric, index** idx) { - *idx = new detail::index_impl(filename, dim, metric); + auto hnsw_index = std::make_unique>(dim, metric, params.hierarchy); + auto appr_algo = std::make_unique::type>>( + hnsw_index->get_space(), filename); + if (params.hierarchy == HnswHierarchy::NONE) { appr_algo->base_layer_only = true; } + hnsw_index->set_index(std::move(appr_algo)); + *idx = hnsw_index.release(); } } // namespace cuvs::neighbors::hnsw::detail diff --git a/cpp/src/neighbors/dynamic_batching.cu b/cpp/src/neighbors/dynamic_batching.cu new file mode 100644 index 000000000..6be70353b --- /dev/null +++ b/cpp/src/neighbors/dynamic_batching.cu @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "detail/dynamic_batching.cuh" + +#include +#include +#include + +#include +#include + +namespace cuvs::neighbors::dynamic_batching { + +// NB: the (template) index parameter should be the last; it may contain the spaces and so split +// into multiple preprocessor token. Then it is consumed as __VA_ARGS__ +// +#define CUVS_INST_DYNAMIC_BATCHING_INDEX(T, IdxT, Namespace, ...) \ + template <> \ + template <> \ + index::index( \ + const raft::resources& res, \ + const cuvs::neighbors::dynamic_batching::index_params& params, \ + const Namespace ::__VA_ARGS__& upstream_index, \ + const typename Namespace ::__VA_ARGS__::search_params_type& upstream_params, \ + const cuvs::neighbors::filtering::base_filter* sample_filter) \ + : runner{new detail::batch_runner( \ + res, params, upstream_index, upstream_params, Namespace ::search, sample_filter)} \ + { \ + } + +#define CUVS_INST_DYNAMIC_BATCHING_SEARCH(T, IdxT) \ + void search(raft::resources const& res, \ + cuvs::neighbors::dynamic_batching::search_params const& params, \ + cuvs::neighbors::dynamic_batching::index const& index, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances) \ + { \ + return index.runner->search(res, params, queries, neighbors, distances); \ + } + +CUVS_INST_DYNAMIC_BATCHING_INDEX(float, uint32_t, cuvs::neighbors::cagra, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(half, uint32_t, cuvs::neighbors::cagra, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t, uint32_t, cuvs::neighbors::cagra, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(uint8_t, + uint32_t, + cuvs::neighbors::cagra, + index); + +CUVS_INST_DYNAMIC_BATCHING_INDEX(float, int64_t, cuvs::neighbors::ivf_pq, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(half, int64_t, cuvs::neighbors::ivf_pq, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t, int64_t, cuvs::neighbors::ivf_pq, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(uint8_t, int64_t, cuvs::neighbors::ivf_pq, index); + +CUVS_INST_DYNAMIC_BATCHING_INDEX(float, int64_t, cuvs::neighbors::ivf_flat, index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t, + int64_t, + cuvs::neighbors::ivf_flat, + index); +CUVS_INST_DYNAMIC_BATCHING_INDEX(uint8_t, + int64_t, + cuvs::neighbors::ivf_flat, + index); + +CUVS_INST_DYNAMIC_BATCHING_SEARCH(float, int64_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(half, int64_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(int8_t, int64_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(uint8_t, int64_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(float, uint32_t); // uint32_t index type is needed for CAGRA +CUVS_INST_DYNAMIC_BATCHING_SEARCH(half, uint32_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(int8_t, uint32_t); +CUVS_INST_DYNAMIC_BATCHING_SEARCH(uint8_t, uint32_t); + +#undef CUVS_INST_DYNAMIC_BATCHING_INDEX +#undef CUVS_INST_DYNAMIC_BATCHING_SEARCH + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/src/neighbors/hnsw.cpp b/cpp/src/neighbors/hnsw.cpp index e6f3fbcc7..f165176ec 100644 --- 
a/cpp/src/neighbors/hnsw.cpp +++ b/cpp/src/neighbors/hnsw.cpp @@ -21,11 +21,14 @@ namespace cuvs::neighbors::hnsw { -#define CUVS_INST_HNSW_FROM_CAGRA(T) \ - std::unique_ptr> from_cagra( \ - raft::resources const& res, const cuvs::neighbors::cagra::index& cagra_index) \ - { \ - return detail::from_cagra(res, cagra_index); \ +#define CUVS_INST_HNSW_FROM_CAGRA(T) \ + std::unique_ptr> from_cagra( \ + raft::resources const& res, \ + const index_params& params, \ + const cuvs::neighbors::cagra::index& cagra_index, \ + std::optional> dataset) \ + { \ + return detail::from_cagra(res, params, cagra_index, dataset); \ } CUVS_INST_HNSW_FROM_CAGRA(float); @@ -34,6 +37,21 @@ CUVS_INST_HNSW_FROM_CAGRA(int8_t); #undef CUVS_INST_HNSW_FROM_CAGRA +#define CUVS_INST_HNSW_EXTEND(T) \ + void extend(raft::resources const& res, \ + const extend_params& params, \ + raft::host_matrix_view additional_dataset, \ + index& idx) \ + { \ + detail::extend(res, params, additional_dataset, idx); \ + } + +CUVS_INST_HNSW_EXTEND(float); +CUVS_INST_HNSW_EXTEND(uint8_t); +CUVS_INST_HNSW_EXTEND(int8_t); + +#undef CUVS_INST_HNSW_EXTEND + #define CUVS_INST_HNSW_SEARCH(T) \ void search(raft::resources const& res, \ const search_params& params, \ @@ -51,20 +69,25 @@ CUVS_INST_HNSW_SEARCH(int8_t); #undef CUVS_INST_HNSW_SEARCH -#define CUVS_INST_HNSW_DESERIALIZE(T) \ - void deserialize(raft::resources const& res, \ - const std::string& filename, \ - int dim, \ - cuvs::distance::DistanceType metric, \ - index** idx) \ - { \ - detail::deserialize(res, filename, dim, metric, idx); \ +#define CUVS_INST_HNSW_SERIALIZE(T) \ + void serialize(raft::resources const& res, const std::string& filename, const index& idx) \ + { \ + detail::serialize(res, filename, idx); \ + } \ + void deserialize(raft::resources const& res, \ + const index_params& params, \ + const std::string& filename, \ + int dim, \ + cuvs::distance::DistanceType metric, \ + index** idx) \ + { \ + detail::deserialize(res, params, filename, dim, metric, idx); \ } -CUVS_INST_HNSW_DESERIALIZE(float); -CUVS_INST_HNSW_DESERIALIZE(uint8_t); -CUVS_INST_HNSW_DESERIALIZE(int8_t); +CUVS_INST_HNSW_SERIALIZE(float); +CUVS_INST_HNSW_SERIALIZE(uint8_t); +CUVS_INST_HNSW_SERIALIZE(int8_t); -#undef CUVS_INST_HNSW_DESERIALIZE +#undef CUVS_INST_HNSW_SERIALIZE } // namespace cuvs::neighbors::hnsw diff --git a/cpp/src/neighbors/hnsw_c.cpp b/cpp/src/neighbors/hnsw_c.cpp index a19875641..0233a510a 100644 --- a/cpp/src/neighbors/hnsw_c.cpp +++ b/cpp/src/neighbors/hnsw_c.cpp @@ -31,6 +31,44 @@ #include namespace { + +template +void _from_cagra(cuvsResources_t res, + cuvsHnswIndexParams_t params, + cuvsCagraIndex_t cagra_index, + cuvsHnswIndex_t hnsw_index) +{ + auto res_ptr = reinterpret_cast(res); + auto index = reinterpret_cast*>(cagra_index->addr); + auto cpp_params = cuvs::neighbors::hnsw::index_params(); + cpp_params.hierarchy = static_cast(params->hierarchy); + cpp_params.ef_construction = params->ef_construction; + cpp_params.num_threads = params->num_threads; + std::optional> dataset = std::nullopt; + + auto hnsw_index_unique_ptr = + cuvs::neighbors::hnsw::from_cagra(*res_ptr, cpp_params, *index, dataset); + auto hnsw_index_ptr = hnsw_index_unique_ptr.release(); + hnsw_index->addr = reinterpret_cast(hnsw_index_ptr); +} + +template +void _extend(cuvsResources_t res, + cuvsHnswExtendParams_t params, + DLManagedTensor* additional_dataset, + cuvsHnswIndex index) +{ + auto res_ptr = reinterpret_cast(res); + auto index_ptr = reinterpret_cast*>(index.addr); + auto cpp_params = 
cuvs::neighbors::hnsw::extend_params(); + cpp_params.num_threads = params->num_threads; + + using additional_dataset_mdspan_type = raft::host_matrix_view; + auto additional_dataset_mds = + cuvs::core::from_dlpack(additional_dataset); + cuvs::neighbors::hnsw::extend(*res_ptr, cpp_params, additional_dataset_mds, *index_ptr); +} + template void _search(cuvsResources_t res, cuvsHnswSearchParams params, @@ -44,7 +82,7 @@ void _search(cuvsResources_t res, auto search_params = cuvs::neighbors::hnsw::search_params(); search_params.ef = params.ef; - search_params.num_threads = params.numThreads; + search_params.num_threads = params.num_threads; using queries_mdspan_type = raft::host_matrix_view; using neighbors_mdspan_type = raft::host_matrix_view; @@ -57,26 +95,42 @@ void _search(cuvsResources_t res, } template -void* _deserialize(cuvsResources_t res, const char* filename, int dim, cuvsDistanceType metric) +void _serialize(cuvsResources_t res, const char* filename, cuvsHnswIndex index) +{ + auto res_ptr = reinterpret_cast(res); + auto index_ptr = reinterpret_cast*>(index.addr); + cuvs::neighbors::hnsw::serialize(*res_ptr, std::string(filename), *index_ptr); +} + +template +void* _deserialize(cuvsResources_t res, + cuvsHnswIndexParams_t params, + const char* filename, + int dim, + cuvsDistanceType metric) { auto res_ptr = reinterpret_cast(res); cuvs::neighbors::hnsw::index* index = nullptr; - cuvs::neighbors::hnsw::deserialize(*res_ptr, std::string(filename), dim, metric, &index); + auto cpp_params = cuvs::neighbors::hnsw::index_params(); + cpp_params.hierarchy = static_cast(params->hierarchy); + cuvs::neighbors::hnsw::deserialize( + *res_ptr, cpp_params, std::string(filename), dim, metric, &index); return index; } } // namespace -extern "C" cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params) +extern "C" cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params) { - return cuvs::core::translate_exceptions( - [=] { *params = new cuvsHnswSearchParams{.ef = 200, .numThreads = 0}; }); + return cuvs::core::translate_exceptions([=] { + *params = new cuvsHnswIndexParams{ + .hierarchy = cuvsHnswHierarchy::NONE, .ef_construction = 200, .num_threads = 2}; + }); } -extern "C" cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params) +extern "C" cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params) { return cuvs::core::translate_exceptions([=] { delete params; }); } - extern "C" cuvsError_t cuvsHnswIndexCreate(cuvsHnswIndex_t* index) { return cuvs::core::translate_exceptions([=] { *index = new cuvsHnswIndex{}; }); @@ -101,6 +155,66 @@ extern "C" cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index_c_ptr) }); } +extern "C" cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params) +{ + return cuvs::core::translate_exceptions( + [=] { *params = new cuvsHnswExtendParams{.num_threads = 0}; }); +} + +extern "C" cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params) +{ + return cuvs::core::translate_exceptions([=] { delete params; }); +} + +extern "C" cuvsError_t cuvsHnswFromCagra(cuvsResources_t res, + cuvsHnswIndexParams_t params, + cuvsCagraIndex_t cagra_index, + cuvsHnswIndex_t hnsw_index) +{ + return cuvs::core::translate_exceptions([=] { + auto index = *cagra_index; + hnsw_index->dtype = index.dtype; + if (index.dtype.code == kDLFloat) { + _from_cagra(res, params, cagra_index, hnsw_index); + } else if (index.dtype.code == kDLUInt) { + _from_cagra(res, params, cagra_index, hnsw_index); + } else if (index.dtype.code == kDLInt) 
{ + _from_cagra(res, params, cagra_index, hnsw_index); + } else { + RAFT_FAIL("Unsupported dtype: %d", index.dtype.code); + } + }); +} + +extern "C" cuvsError_t cuvsHnswExtend(cuvsResources_t res, + cuvsHnswExtendParams_t params, + DLManagedTensor* additional_dataset, + cuvsHnswIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + if (index->dtype.code == kDLFloat) { + _extend(res, params, additional_dataset, *index); + } else if (index->dtype.code == kDLUInt) { + _extend(res, params, additional_dataset, *index); + } else if (index->dtype.code == kDLInt) { + _extend(res, params, additional_dataset, *index); + } else { + RAFT_FAIL("Unsupported dtype: %d", index->dtype.code); + } + }); +} + +extern "C" cuvsError_t cuvsHnswSearchParamsCreate(cuvsHnswSearchParams_t* params) +{ + return cuvs::core::translate_exceptions( + [=] { *params = new cuvsHnswSearchParams{.ef = 200, .num_threads = 0}; }); +} + +extern "C" cuvsError_t cuvsHnswSearchParamsDestroy(cuvsHnswSearchParams_t params) +{ + return cuvs::core::translate_exceptions([=] { delete params; }); +} + extern "C" cuvsError_t cuvsHnswSearch(cuvsResources_t res, cuvsHnswSearchParams_t params, cuvsHnswIndex_t index_c_ptr, @@ -140,7 +254,25 @@ extern "C" cuvsError_t cuvsHnswSearch(cuvsResources_t res, }); } +extern "C" cuvsError_t cuvsHnswSerialize(cuvsResources_t res, + const char* filename, + cuvsHnswIndex_t index) +{ + return cuvs::core::translate_exceptions([=] { + if (index->dtype.code == kDLFloat) { + _serialize(res, filename, *index); + } else if (index->dtype.code == kDLInt) { + _serialize(res, filename, *index); + } else if (index->dtype.code == kDLUInt) { + _serialize(res, filename, *index); + } else { + RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits); + } + }); +} + extern "C" cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, + cuvsHnswIndexParams_t params, const char* filename, int dim, cuvsDistanceType metric, @@ -148,11 +280,14 @@ extern "C" cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, { return cuvs::core::translate_exceptions([=] { if (index->dtype.code == kDLFloat && index->dtype.bits == 32) { - index->addr = reinterpret_cast(_deserialize(res, filename, dim, metric)); + index->addr = + reinterpret_cast(_deserialize(res, params, filename, dim, metric)); } else if (index->dtype.code == kDLUInt && index->dtype.bits == 8) { - index->addr = reinterpret_cast(_deserialize(res, filename, dim, metric)); + index->addr = + reinterpret_cast(_deserialize(res, params, filename, dim, metric)); } else if (index->dtype.code == kDLInt && index->dtype.bits == 8) { - index->addr = reinterpret_cast(_deserialize(res, filename, dim, metric)); + index->addr = + reinterpret_cast(_deserialize(res, params, filename, dim, metric)); } else { RAFT_FAIL("Unsupported dtype in file %s", filename); } diff --git a/cpp/src/neighbors/iface/iface.hpp b/cpp/src/neighbors/iface/iface.hpp index 9b3da75a4..98ef3fdd3 100644 --- a/cpp/src/neighbors/iface/iface.hpp +++ b/cpp/src/neighbors/iface/iface.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 286d721d7..1c8de2ad0 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -175,6 +175,19 @@ if(BUILD_TESTS) 100 ) + ConfigureTest( + NAME + NEIGHBORS_DYNAMIC_BATCHING_TEST + PATH + neighbors/dynamic_batching/test_cagra.cu + neighbors/dynamic_batching/test_ivf_flat.cu + neighbors/dynamic_batching/test_ivf_pq.cu + GPUS + 1 + PERCENT + 100 + ) + 
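+  # (The NEIGHBORS_DYNAMIC_BATCHING_TEST above exercises dynamic batching over CAGRA, IVF-Flat and
+  # IVF-PQ upstream indexes.)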
if(BUILD_CAGRA_HNSWLIB) ConfigureTest(NAME NEIGHBORS_HNSW_TEST PATH neighbors/hnsw.cu GPUS 1 PERCENT 100) target_link_libraries(NEIGHBORS_HNSW_TEST PRIVATE hnswlib::hnswlib) diff --git a/cpp/test/core/c_api.c b/cpp/test/core/c_api.c index a3dae6004..a51824d2b 100644 --- a/cpp/test/core/c_api.c +++ b/cpp/test/core/c_api.c @@ -73,6 +73,15 @@ int main() error = cuvsRMMMemoryResourceReset(); if (error == CUVS_ERROR) { exit(EXIT_FAILURE); } + // Alloc memory on host (pinned) + void* ptr3; + cuvsError_t alloc_error_pinned = cuvsRMMHostAlloc(&ptr3, 1024); + if (alloc_error_pinned == CUVS_ERROR) { exit(EXIT_FAILURE); } + + // Free memory + cuvsError_t free_error_pinned = cuvsRMMHostFree(ptr3, 1024); + if (free_error_pinned == CUVS_ERROR) { exit(EXIT_FAILURE); } + // Destroy resources error = cuvsResourcesDestroy(res); if (error == CUVS_ERROR) { exit(EXIT_FAILURE); } diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 660246c67..8d5701439 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -758,11 +758,7 @@ class AnnCagraFilterTest : public ::testing::TestWithParam { search_params.algo = ps.algo; search_params.max_queries = ps.max_queries; search_params.team_size = ps.team_size; - - // TODO: setting search_params.itopk_size here breaks the filter tests, but is required for - // k>1024 skip these tests until fixed - if (ps.k >= 1024) { GTEST_SKIP(); } - // search_params.itopk_size = ps.itopk_size; + search_params.itopk_size = ps.itopk_size; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); diff --git a/cpp/test/neighbors/ann_hnsw_c.cu b/cpp/test/neighbors/ann_hnsw_c.cu index fc740b924..2a6401b1d 100644 --- a/cpp/test/neighbors/ann_hnsw_c.cu +++ b/cpp/test/neighbors/ann_hnsw_c.cu @@ -111,7 +111,9 @@ TEST(CagraHnswC, BuildSearch) cuvsHnswIndex_t hnsw_index; cuvsHnswIndexCreate(&hnsw_index); hnsw_index->dtype = index->dtype; - cuvsHnswDeserialize(res, "/tmp/cagra_hnswlib.index", 2, L2Expanded, hnsw_index); + cuvsHnswIndexParams_t hnsw_params; + cuvsHnswIndexParamsCreate(&hnsw_params); + cuvsHnswDeserialize(res, hnsw_params, "/tmp/cagra_hnswlib.index", 2, L2Expanded, hnsw_index); // search index cuvsHnswSearchParams_t search_params; diff --git a/cpp/test/neighbors/dynamic_batching.cuh b/cpp/test/neighbors/dynamic_batching.cuh new file mode 100644 index 000000000..b64c5b01e --- /dev/null +++ b/cpp/test/neighbors/dynamic_batching.cuh @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "ann_utils.cuh" + +#include + +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cuvs::neighbors::dynamic_batching { + +struct dynamic_batching_spec { + int64_t n_queries = 1000; + int64_t n_rows = 100000; + int64_t dim = 128; + int64_t k = 10; + int64_t max_batch_size = 64; + size_t n_queues = 3; + bool conservative_dispatch = false; + cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Expanded; + int64_t max_concurrent_threads = 128; +}; + +inline ::std::ostream& operator<<(::std::ostream& os, const dynamic_batching_spec& p) +{ + os << "{n_queries=" << p.n_queries; + os << ", dataset shape=" << p.n_rows << "x" << p.dim; + os << ", metric=" << print_metric{p.metric}; + os << ", k=" << p.k; + os << ", max_batch_size=" << p.max_batch_size; + os << ", n_queues=" << p.n_queues; + os << ", conservative_dispatch=" << p.conservative_dispatch; + os << '}' << std::endl; + return os; +} + +template +using build_function = UpstreamT(const raft::resources&, + const typename UpstreamT::index_params_type&, + raft::device_matrix_view); + +template +using search_function = void(const raft::resources&, + const typename UpstreamT::search_params_type& params, + const UpstreamT& index, + raft::device_matrix_view, + raft::device_matrix_view, + raft::device_matrix_view, + const cuvs::neighbors::filtering::base_filter&); + +template UpstreamBuildF, + search_function UpstreamSearchF> +struct dynamic_batching_test : public ::testing::TestWithParam { + using distance_type = float; + using data_type = DataT; + using index_type = IdxT; + using upstream_type = UpstreamT; + + dynamic_batching_spec ps = ::testing::TestWithParam::GetParam(); + raft::resources res; + + // input data + std::optional> dataset = std::nullopt; + std::optional> queries = std::nullopt; + std::optional> neighbors_upsm = std::nullopt; + std::optional> neighbors_dynb = std::nullopt; + std::optional> distances_upsm = std::nullopt; + std::optional> distances_dynb = std::nullopt; + + // build parameters + cuvs::neighbors::index_params build_params_base{ps.metric}; + typename upstream_type::index_params_type build_params_upsm{build_params_base}; + dynamic_batching::index_params build_params_dynb{ + build_params_base, ps.k, ps.max_batch_size, ps.n_queues, ps.conservative_dispatch}; + + // search parameters + typename upstream_type::search_params_type search_params_upsm{}; + dynamic_batching::search_params search_params_dynb{}; + + // indexes + std::optional index_upsm = std::nullopt; + std::optional> index_dynb = std::nullopt; + + void build_all() + { + index_dynb.reset(); + index_upsm.reset(); + index_upsm = UpstreamBuildF(res, build_params_upsm, dataset->view()); + index_dynb.emplace(res, build_params_dynb, index_upsm.value(), search_params_upsm); + } + + void search_all() + { + // Search using upstream index - all queries at once + UpstreamSearchF(res, + search_params_upsm, + index_upsm.value(), + queries->view(), + neighbors_upsm->view(), + distances_upsm->view(), + filtering::none_sample_filter{}); + raft::resource::sync_stream(res); + + // Search with dynamic batching + // Streaming scenario: prepare concurrent resources + rmm::cuda_stream_pool worker_streams(ps.max_concurrent_threads); + std::vector> futures(ps.max_concurrent_threads); + std::vector resource_pool(0); + for (int64_t i = 0; i < ps.max_concurrent_threads; i++) { + resource_pool.push_back(res); // copies the resource + raft::resource::set_cuda_stream(resource_pool[i], 
worker_streams.get_stream(i)); + } + + // Try multiple batch sizes in a round-robin to improve test coverage + std::vector minibatch_sizes{1, 3, 7, 10}; + auto get_bs = [&minibatch_sizes](auto i) { + return minibatch_sizes[i % minibatch_sizes.size()]; + }; + int64_t i = 0; + for (int64_t offset = 0; offset < ps.n_queries; offset += get_bs(i++)) { + auto bs = std::min(get_bs(i), ps.n_queries - offset); + auto j = i % ps.max_concurrent_threads; + // wait for previous job in the same slot to finish + if (i >= ps.max_concurrent_threads) { futures[j].wait(); } + // submit a new job + futures[j] = std::async( + std::launch::async, + [&res = resource_pool[j], + ¶ms = search_params_dynb, + index = index_dynb.value(), + query_view = raft::make_device_matrix_view( + queries->data_handle() + offset * ps.dim, bs, ps.dim), + neighbors_view = raft::make_device_matrix_view( + neighbors_dynb->data_handle() + offset * ps.k, bs, ps.k), + distances_view = raft::make_device_matrix_view( + distances_dynb->data_handle() + offset * ps.k, bs, ps.k)]() { + dynamic_batching::search(res, params, index, query_view, neighbors_view, distances_view); + }); + } + + // finalize all resources + for (int64_t j = 0; j < ps.max_concurrent_threads && j < i; j++) { + futures[j].wait(); + raft::resource::sync_stream(resource_pool[j]); + } + raft::resource::sync_stream(res); + } + + /* + Check the dynamic batching generated neighbors against the upstream index. They both may be + imperfect w.r.t. the ground truth, but they shouldn't differ too much. + */ + void check_neighbors() + { + auto stream = raft::resource::get_cuda_stream(res); + size_t queries_size = ps.n_queries * ps.k; + std::vector neighbors_upsm_host(queries_size); + std::vector neighbors_dynb_host(queries_size); + std::vector distances_upsm_host(queries_size); + std::vector distances_dynb_host(queries_size); + raft::copy(neighbors_upsm_host.data(), neighbors_upsm->data_handle(), queries_size, stream); + raft::copy(neighbors_dynb_host.data(), neighbors_dynb->data_handle(), queries_size, stream); + raft::copy(distances_upsm_host.data(), distances_upsm->data_handle(), queries_size, stream); + raft::copy(distances_dynb_host.data(), distances_dynb->data_handle(), queries_size, stream); + raft::resource::sync_stream(res); + ASSERT_TRUE(eval_neighbours(neighbors_upsm_host, + neighbors_dynb_host, + distances_upsm_host, + distances_dynb_host, + ps.n_queries, + ps.k, + 0.001, + 0.9)) + << ps; + } + + void SetUp() override + { + dataset.emplace(raft::make_device_matrix(res, ps.n_rows, ps.dim)); + queries.emplace(raft::make_device_matrix(res, ps.n_queries, ps.dim)); + neighbors_upsm.emplace(raft::make_device_matrix(res, ps.n_queries, ps.k)); + neighbors_dynb.emplace(raft::make_device_matrix(res, ps.n_queries, ps.k)); + distances_upsm.emplace( + raft::make_device_matrix(res, ps.n_queries, ps.k)); + distances_dynb.emplace( + raft::make_device_matrix(res, ps.n_queries, ps.k)); + + raft::random::RngState rng(666ULL); + if constexpr (std::is_same_v || std::is_same_v) { + raft::random::uniform( + res, rng, dataset->data_handle(), dataset->size(), data_type(0.1), data_type(2.0)); + raft::random::uniform( + res, rng, queries->data_handle(), queries->size(), data_type(0.1), data_type(2.0)); + } else { + raft::random::uniformInt( + res, rng, dataset->data_handle(), dataset->size(), data_type(1), data_type(20)); + raft::random::uniformInt( + res, rng, queries->data_handle(), queries->size(), data_type(1), data_type(20)); + } + raft::resource::sync_stream(res); + } + + void TearDown() 
override + { + index_dynb.reset(); + index_upsm.reset(); + dataset.reset(); + queries.reset(); + neighbors_upsm.reset(); + neighbors_dynb.reset(); + distances_upsm.reset(); + distances_dynb.reset(); + raft::resource::sync_stream(res); + } +}; + +inline std::vector generate_inputs() +{ + std::vector inputs{dynamic_batching_spec{}}; + + for (auto alt_n_queries : {10, 50, 100}) { + dynamic_batching_spec input{}; + input.n_queries = alt_n_queries; + inputs.push_back(input); + } + + for (auto alt_k : {100, 200}) { + dynamic_batching_spec input{}; + input.k = alt_k; + inputs.push_back(input); + } + + for (auto alt_max_batch_size : {4, 16, 128, 256, 512, 1024}) { + dynamic_batching_spec input{}; + input.max_batch_size = alt_max_batch_size; + inputs.push_back(input); + } + + for (auto alt_n_queues : {1, 2, 16, 32}) { + dynamic_batching_spec input{}; + input.n_queues = alt_n_queues; + inputs.push_back(input); + } + + for (auto alt_max_concurrent_threads : {1, 2, 16, 32}) { + dynamic_batching_spec input{}; + input.max_concurrent_threads = alt_max_concurrent_threads; + inputs.push_back(input); + } + + { + auto n = inputs.size(); + for (size_t i = 0; i < n; i++) { + auto input = inputs[i]; + input.conservative_dispatch = !input.conservative_dispatch; + inputs.push_back(input); + } + } + + return inputs; +} + +const std::vector inputs = generate_inputs(); + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/test/neighbors/dynamic_batching/test_cagra.cu b/cpp/test/neighbors/dynamic_batching/test_cagra.cu new file mode 100644 index 000000000..604fc29cf --- /dev/null +++ b/cpp/test/neighbors/dynamic_batching/test_cagra.cu @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "../dynamic_batching.cuh" + +#include + +namespace cuvs::neighbors::dynamic_batching { + +using cagra_F32 = dynamic_batching_test, + cagra::build, + cagra::search>; + +using cagra_U8 = dynamic_batching_test, + cagra::build, + cagra::search>; + +template +static void set_default_cagra_params(fixture& that) +{ + that.build_params_upsm.intermediate_graph_degree = 128; + that.build_params_upsm.graph_degree = 64; + that.search_params_upsm.itopk_size = + std::clamp(raft::bound_by_power_of_two(that.ps.k) * 16, 128, 512); +} + +TEST_P(cagra_F32, single_cta) +{ + set_default_cagra_params(*this); + search_params_upsm.algo = cagra::search_algo::SINGLE_CTA; + build_all(); + search_all(); + check_neighbors(); +} + +TEST_P(cagra_F32, multi_cta) +{ + set_default_cagra_params(*this); + search_params_upsm.algo = cagra::search_algo::MULTI_CTA; + build_all(); + search_all(); + check_neighbors(); +} + +TEST_P(cagra_F32, multi_kernel) +{ + set_default_cagra_params(*this); + search_params_upsm.algo = cagra::search_algo::MULTI_KERNEL; + build_all(); + search_all(); + check_neighbors(); +} + +TEST_P(cagra_U8, defaults) +{ + set_default_cagra_params(*this); + build_all(); + search_all(); + check_neighbors(); +} + +INSTANTIATE_TEST_CASE_P(dynamic_batching, cagra_F32, ::testing::ValuesIn(inputs)); +INSTANTIATE_TEST_CASE_P(dynamic_batching, cagra_U8, ::testing::ValuesIn(inputs)); + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu b/cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu new file mode 100644 index 000000000..4922cffa3 --- /dev/null +++ b/cpp/test/neighbors/dynamic_batching/test_ivf_flat.cu @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../dynamic_batching.cuh" + +#include + +namespace cuvs::neighbors::dynamic_batching { + +using ivf_flat_i8 = dynamic_batching_test, + ivf_flat::build, + ivf_flat::search>; + +TEST_P(ivf_flat_i8, defaults) +{ + build_params_upsm.n_lists = std::round(std::sqrt(ps.n_rows)); + search_params_upsm.n_probes = + std::max(std::min(build_params_upsm.n_lists, 10), + raft::div_rounding_up_safe(build_params_upsm.n_lists, 50)); + build_all(); + search_all(); + check_neighbors(); +} + +INSTANTIATE_TEST_CASE_P(dynamic_batching, ivf_flat_i8, ::testing::ValuesIn(inputs)); + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu b/cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu new file mode 100644 index 000000000..ec57e0b57 --- /dev/null +++ b/cpp/test/neighbors/dynamic_batching/test_ivf_pq.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "../dynamic_batching.cuh" + +#include + +namespace cuvs::neighbors::dynamic_batching { + +using ivf_pq_f16 = + dynamic_batching_test, ivf_pq::build, ivf_pq::search>; + +TEST_P(ivf_pq_f16, defaults) +{ + build_params_upsm.n_lists = std::round(std::sqrt(ps.n_rows)); + search_params_upsm.n_probes = + std::max(std::min(build_params_upsm.n_lists, 10), + raft::div_rounding_up_safe(build_params_upsm.n_lists, 50)); + build_all(); + search_all(); + check_neighbors(); +} + +INSTANTIATE_TEST_CASE_P(dynamic_batching, ivf_pq_f16, ::testing::ValuesIn(inputs)); + +} // namespace cuvs::neighbors::dynamic_batching diff --git a/cpp/test/neighbors/hnsw.cu b/cpp/test/neighbors/hnsw.cu index 9fb88be05..20ee83a11 100644 --- a/cpp/test/neighbors/hnsw.cu +++ b/cpp/test/neighbors/hnsw.cu @@ -108,7 +108,8 @@ class AnnHNSWTest : public ::testing::TestWithParam { cuvs::neighbors::hnsw::search_params search_params; search_params.ef = ps.ef; - auto hnsw_index = cuvs::neighbors::hnsw::from_cagra(handle_, index); + cuvs::neighbors::hnsw::index_params hnsw_params; + auto hnsw_index = cuvs::neighbors::hnsw::from_cagra(handle_, hnsw_params, index); auto queries_HNSW_view = raft::make_host_matrix_view(queries_h.data(), ps.n_queries, ps.dim); auto indices_HNSW_view = diff --git a/dependencies.yaml b/dependencies.yaml index e909ad0dc..80a7d2024 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -470,7 +470,6 @@ dependencies: common: - output_types: [conda, pyproject, requirements] packages: - - hnswlib=0.6.2 - nlohmann_json>=3.11.2 - glog>=0.6.0 - h5py>=3.8.0 diff --git a/docs/source/c_api/neighbors_hnsw_c.rst b/docs/source/c_api/neighbors_hnsw_c.rst index 988e5b6f3..22ffc236d 100644 --- a/docs/source/c_api/neighbors_hnsw_c.rst +++ b/docs/source/c_api/neighbors_hnsw_c.rst @@ -26,6 +26,28 @@ Index :members: :content-only: +Index extend parameters +----------------------- + +.. doxygengroup:: hnsw_c_extend_params + :project: cuvs + :members: + :content-only: + +Index extend +------------ +.. doxygengroup:: hnsw_c_index_extend + :project: cuvs + :members: + :content-only: + +Index load +---------- +.. doxygengroup:: hnsw_c_index_load + :project: cuvs + :members: + :content-only: + Index search ------------ diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst index d55d58eb0..ab810ab53 100644 --- a/docs/source/cpp_api/neighbors.rst +++ b/docs/source/cpp_api/neighbors.rst @@ -11,6 +11,7 @@ Nearest Neighbors neighbors_bruteforce.rst neighbors_cagra.rst + neighbors_dynamic_batching.rst neighbors_hnsw.rst neighbors_ivf_flat.rst neighbors_ivf_pq.rst diff --git a/docs/source/cpp_api/neighbors_dynamic_batching.rst b/docs/source/cpp_api/neighbors_dynamic_batching.rst new file mode 100644 index 000000000..adc5cb56a --- /dev/null +++ b/docs/source/cpp_api/neighbors_dynamic_batching.rst @@ -0,0 +1,45 @@ +Dynamic Batching +================ + +Dynamic Batching allows grouping small search requests into batches to increase the device occupancy and throughput while keeping the latency within limits. + +.. 
role:: py(code) + :language: c++ + :class: highlight + +``#include `` + +namespace *cuvs::neighbors::dynamic_batching* + +Index build parameters +---------------------- + +.. doxygengroup:: dynamic_batching_cpp_index_params + :project: cuvs + :members: + :content-only: + +Index search parameters +----------------------- + +.. doxygengroup:: dynamic_batching_cpp_search_params + :project: cuvs + :members: + :content-only: + +Index +----- + +.. doxygengroup:: dynamic_batching_cpp_index + :project: cuvs + :members: + :content-only: + + +Index search +------------ + +.. doxygengroup:: dynamic_batching_cpp_search + :project: cuvs + :members: + :content-only: diff --git a/docs/source/cpp_api/neighbors_hnsw.rst b/docs/source/cpp_api/neighbors_hnsw.rst index b0af88af0..00dd3a213 100644 --- a/docs/source/cpp_api/neighbors_hnsw.rst +++ b/docs/source/cpp_api/neighbors_hnsw.rst @@ -27,10 +27,25 @@ Index :members: :content-only: -Index load +Index extend parameters +----------------------- + +.. doxygengroup:: hnsw_cpp_extend_params + :project: cuvs + :members: + :content-only: + +Index extend ------------ +.. doxygengroup:: hnsw_cpp_index_extend + :project: cuvs + :members: + :content-only: -.. doxygengroup:: hnsw_cpp_index_search +Index load +---------- + +.. doxygengroup:: hnsw_cpp_index_load :project: cuvs :members: :content-only: @@ -43,10 +58,10 @@ Index search :members: :content-only: -Index deserialize +Index serialize --------------- -.. doxygengroup:: hnsw_cpp_index_deserialize +.. doxygengroup:: hnsw_cpp_index_serialize :project: cuvs :members: :content-only: diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 092b65ed9..951e0ad0c 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -38,6 +38,7 @@ include(../cmake/thirdparty/get_cuvs.cmake) # -------------- compile tasks ----------------- # add_executable(CAGRA_EXAMPLE src/cagra_example.cu) add_executable(CAGRA_PERSISTENT_EXAMPLE src/cagra_persistent_example.cu) +add_executable(DYNAMIC_BATCHING_EXAMPLE src/dynamic_batching_example.cu) add_executable(IVF_FLAT_EXAMPLE src/ivf_flat_example.cu) add_executable(IVF_PQ_EXAMPLE src/ivf_pq_example.cu) add_executable(VAMANA_EXAMPLE src/vamana_example.cu) @@ -48,6 +49,9 @@ target_link_libraries(CAGRA_EXAMPLE PRIVATE cuvs::cuvs $ Threads::Threads ) +target_link_libraries( + DYNAMIC_BATCHING_EXAMPLE PRIVATE cuvs::cuvs $ Threads::Threads +) target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $) target_link_libraries(VAMANA_EXAMPLE PRIVATE cuvs::cuvs $) diff --git a/examples/cpp/src/dynamic_batching_example.cu b/examples/cpp/src/dynamic_batching_example.cu new file mode 100644 index 000000000..95f66a454 --- /dev/null +++ b/examples/cpp/src/dynamic_batching_example.cu @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "common.cuh" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// A helper to split the dataset into chunks +template +auto slice_matrix(const DeviceMatrixOrView &source, + typename DeviceMatrixOrView::index_type offset_rows, + typename DeviceMatrixOrView::index_type count_rows) { + auto n_cols = source.extent(1); + return raft::make_device_matrix_view< + typename DeviceMatrixOrView::element_type, + typename DeviceMatrixOrView::index_type>( + const_cast( + source.data_handle()) + + offset_rows * n_cols, + count_rows, n_cols); +} + +// A helper to measure the execution time of a function +template +void time_it(std::string label, F f, Args &&...xs) { + auto start = std::chrono::system_clock::now(); + f(std::forward(xs)...); + auto end = std::chrono::system_clock::now(); + auto t = std::chrono::duration_cast(end - start); + auto t_ms = double(t.count()) / 1000.0; + std::cout << "[" << label << "] execution time: " << t_ms << " ms" + << std::endl; +} + +/** + * Wrap waiting on a stream work into an async C++ future object. + * This is similar to recording and waiting on CUDA events, but in C++11 API. + */ +struct cuda_work_completion_promise { + + cuda_work_completion_promise(const raft::resources &res) { + auto *promise = new std::promise; + RAFT_CUDA_TRY(cudaLaunchHostFunc(raft::resource::get_cuda_stream(res), + completion_callback, + reinterpret_cast(promise))); + value_ = promise->get_future(); + } + + /** + * Waiting on the produced `future` object has the same effect as + * cudaEventSynchronize if an event was recorded at the time of creation of + * this promise object. + */ + auto get_future() -> std::future && { return std::move(value_); } + +private: + std::future value_; + + static void completion_callback(void *ptr) { + auto *promise = reinterpret_cast *>(ptr); + promise->set_value(); + delete promise; + } +}; + +void dynamic_batching_example( + raft::resources const &res, + raft::device_matrix_view dataset, + raft::device_matrix_view queries) { + using namespace cuvs::neighbors; + + // Number of neighbors to search + int64_t topk = 100; + + // Streaming scenario: maximum number of requests in-flight + constexpr int64_t kMaxJobs = 1000; + // Streaming scenario: number of concurrent CUDA streams + constexpr int64_t kNumWorkerStreams = 5; + + // Split the queries into two subsets to run every experiment twice and thus + // surface any initialization overheads. 
+ int64_t n_queries_a = queries.extent(0) / 2; + int64_t n_queries_b = queries.extent(0) - n_queries_a; + + auto queries_a = slice_matrix(queries, 0, n_queries_a); + auto queries_b = slice_matrix(queries, n_queries_a, n_queries_b); + + // create output arrays + auto neighbors = + raft::make_device_matrix(res, queries.extent(0), topk); + auto distances = + raft::make_device_matrix(res, queries.extent(0), topk); + // slice them same as queries + auto neighbors_a = slice_matrix(neighbors, 0, n_queries_a); + auto distances_a = slice_matrix(distances, 0, n_queries_a); + auto neighbors_b = slice_matrix(neighbors, n_queries_a, n_queries_b); + auto distances_b = slice_matrix(distances, n_queries_a, n_queries_b); + + // use default index parameters + cagra::index_params orig_index_params; + + std::cout << "Building CAGRA index (search graph)" << std::endl; + auto orig_index = cagra::build(res, orig_index_params, dataset); + + std::cout << "CAGRA index has " << orig_index.size() << " vectors" + << std::endl; + std::cout << "CAGRA graph has degree " << orig_index.graph_degree() + << ", graph size [" << orig_index.graph().extent(0) << ", " + << orig_index.graph().extent(1) << "]" << std::endl; + + // use default search parameters + cagra::search_params orig_search_params; + // get a decent recall by increasing the internal topk list + orig_search_params.itopk_size = 512; + orig_search_params.algo = cagra::search_algo::SINGLE_CTA; + + // Set up dynamic batching parameters + dynamic_batching::index_params dynb_index_params{ + /* default-initializing the parent `neighbors::index_params` + (not used anyway) */ + {}, + /* Set the K in advance (the batcher needs to allocate buffers) */ + topk, + /* Configure the number and the size of IO buffers */ + 64, + kNumWorkerStreams}; + + // "build" the index (it's a low-cost index wrapping), + // that is we need to pass the original index and its search params here + dynamic_batching::index dynb_index( + res, dynb_index_params, orig_index, orig_search_params); + + // You can implement job priorities by varying the deadlines of individual + // requests + dynamic_batching::search_params dynb_search_params; + dynb_search_params.dispatch_timeout_ms = 0.1; + + // Define the big-batch setting as a baseline for measuring the throughput. 
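+  // (The lambda below synchronizes the stream before returning, so time_it
+  //  measures the complete search rather than just the kernel launch; the
+  //  big-batch number serves as a throughput reference for the streaming
+  //  scenarios further down.)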
+ auto search_batch_orig = + [&res, &orig_index, &orig_search_params]( + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { + cagra::search(res, orig_search_params, orig_index, queries, neighbors, + distances); + raft::resource::sync_stream(res); + }; + + // Launch the baseline search: check the big-batch performance + time_it("standard/batch A", search_batch_orig, queries_a, neighbors_a, + distances_a); + time_it("standard/batch B", search_batch_orig, queries_b, neighbors_b, + distances_b); + + // Streaming scenario: prepare concurrent resources + rmm::cuda_stream_pool worker_streams{kNumWorkerStreams}; + std::vector resource_pool(0); + for (int64_t i = 0; i < kNumWorkerStreams; i++) { + resource_pool.push_back(res); + raft::resource::set_cuda_stream(resource_pool[i], + worker_streams.get_stream(i)); + } + + // Streaming scenario: + // send queries one-by-one, with a maximum kMaxJobs in-flight + auto search_async_orig = + [&resource_pool, &orig_index, &orig_search_params]( + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { + auto work_size = queries.extent(0); + std::array, kMaxJobs> futures; + for (int64_t i = 0; i < work_size + kMaxJobs; i++) { + // wait for previous job in the same slot to finish + if (i >= kMaxJobs) { + futures[i % kMaxJobs].wait(); + } + // submit a new job + if (i < work_size) { + auto &res = resource_pool[i % kNumWorkerStreams]; + cagra::search(res, orig_search_params, orig_index, + slice_matrix(queries, i, 1), + slice_matrix(neighbors, i, 1), + slice_matrix(distances, i, 1)); + futures[i % kMaxJobs] = + cuda_work_completion_promise(res).get_future(); + } + } + }; + + // Streaming scenario with dynamic batching: + // send queries one-by-one, with a maximum kMaxJobs in-flight, + // yet allow grouping the sequential requests (subject to deadlines) + auto search_async_dynb = + [&resource_pool, &dynb_index, &dynb_search_params]( + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances) { + auto work_size = queries.extent(0); + std::array, kMaxJobs> futures; + for (int64_t i = 0; i < work_size + kMaxJobs; i++) { + // wait for previous job in the same slot to finish + if (i >= kMaxJobs) { + futures[i % kMaxJobs].wait(); + } + // submit a new job + if (i < work_size) { + auto &res = resource_pool[i % kNumWorkerStreams]; + dynamic_batching::search(res, dynb_search_params, dynb_index, + slice_matrix(queries, i, 1), + slice_matrix(neighbors, i, 1), + slice_matrix(distances, i, 1)); + futures[i % kMaxJobs] = + cuda_work_completion_promise(res).get_future(); + } + } + }; + + // Try to handle the same amount of work in the async setting using the + // standard implementation. + time_it("standard/async A", search_async_orig, queries_a, neighbors_a, + distances_a); + time_it("standard/async B", search_async_orig, queries_b, neighbors_b, + distances_b); + + // Do the same using dynamic batching + time_it("dynamic_batching/async A", search_async_dynb, queries_a, neighbors_a, + distances_a); + time_it("dynamic_batching/async B", search_async_dynb, queries_b, neighbors_b, + distances_b); +} + +int main() { + raft::device_resources res; + + // Set the raft resource to use a pool for internal memory allocations + // (workspace) and limit the available workspace size. + raft::resource::set_workspace_to_pool_resource(res, + 12ull * 1024 * 1024 * 1024ull); + + // Create input arrays. 
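+  // (The sizes below are illustrative: 1,000,000 x 128 float32 vectors occupy
+  //  roughly 0.5 GB of device memory, on top of the workspace pool configured
+  //  above; shrink them if the GPU has less memory available.)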
+ int64_t n_samples = 1000000; + int64_t n_dim = 128; + int64_t n_queries = 10000; + auto dataset = + raft::make_device_matrix(res, n_samples, n_dim); + auto queries = + raft::make_device_matrix(res, n_queries, n_dim); + generate_dataset(res, dataset.view(), queries.view()); + + // run the interesting part of the program + dynamic_batching_example(res, raft::make_const_mdspan(dataset.view()), + raft::make_const_mdspan(queries.view())); +} diff --git a/notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb b/notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb new file mode 100644 index 000000000..09a6cca43 --- /dev/null +++ b/notebooks/VectorSearch_QuestionRetrieval_Milvus.ipynb @@ -0,0 +1,732 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f5499b54", + "metadata": {}, + "source": [ + "\n", + "# Similar Questions Retrieval - Milvus - CAGRA-HNSW\n", + "\n", + "This notebook is inspired by the [similar search example of Sentence-Transformers](https://www.sbert.net/examples/applications/semantic-search/README.html#similar-questions-retrieval), and adapted to be used with [Milvus](https://milvus.io) and [cuVS](https://rapids.ai/cuvs/).\n", + "\n", + "The model was pre-trained on the [Natural Questions dataset](https://ai.google.com/research/NaturalQuestions). It consists of about 100k real Google search queries, together with an annotated passage from Wikipedia that provides the answer. It is an example of an asymmetric search task. As corpus, we use the smaller [Simple English Wikipedia](http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz) so that it fits easily into memory.\n", + "\n", + "The steps to install the latest Milvus package are available in the [Milvus documentation](https://milvus.io/docs/quickstart.md)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8d55ede", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:21.149465Z", + "iopub.status.busy": "2024-11-08T14:47:21.149218Z", + "iopub.status.idle": "2024-11-08T14:47:23.440275Z", + "shell.execute_reply": "2024-11-08T14:47:23.439436Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "!pip install sentence_transformers torch pymilvus pymilvus[bulk_writer] dask dask[distributed]\n", + "\n", + "# Note: if you have a Hopper based GPU, like an H100, use these to install:\n", + "# pip install torch --index-url https://download.pytorch.org/whl/cu118\n", + "# pip install sentence_transformers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb1e81c3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:23.444058Z", + "iopub.status.busy": "2024-11-08T14:47:23.443683Z", + "iopub.status.idle": "2024-11-08T14:47:24.219903Z", + "shell.execute_reply": "2024-11-08T14:47:24.219228Z" + } + }, + "outputs": [], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee4c5cc0", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:24.223131Z", + "iopub.status.busy": "2024-11-08T14:47:24.222874Z", + "iopub.status.idle": "2024-11-08T14:47:34.024085Z", + "shell.execute_reply": "2024-11-08T14:47:34.023435Z" + } + }, + "outputs": [], + "source": [ + "import dask.array as da\n", + "import gzip\n", + "import json\n", + "import math\n", + "import numpy as np\n", + "import os\n", + "import pymilvus\n", + "import time\n", + "import torch\n", + "\n", + "from minio import Minio\n", + "from multiprocessing import Process\n", + "from sentence_transformers import 
SentenceTransformer, CrossEncoder, util\n", + "from typing import List\n", + "\n", + "\n", + "from pymilvus import (\n", + " connections, utility\n", + ")\n", + "from pymilvus.bulk_writer import LocalBulkWriter, BulkFileType # pip install pymilvus[bulk_writer]\n", + "\n", + "if not torch.cuda.is_available():\n", + " print(\"Warning: No GPU found. Please add GPU to your notebook\")" + ] + }, + { + "cell_type": "markdown", + "id": "47cabaca", + "metadata": {}, + "source": [ + "# Setup Milvus Collection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fcd259c", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:34.027677Z", + "iopub.status.busy": "2024-11-08T14:47:34.027288Z", + "iopub.status.idle": "2024-11-08T14:47:34.109212Z", + "shell.execute_reply": "2024-11-08T14:47:34.108609Z" + } + }, + "outputs": [], + "source": [ + "DIM = 768\n", + "MILVUS_PORT = 30004\n", + "MILVUS_HOST = f\"http://localhost:{MILVUS_PORT}\"\n", + "ID_FIELD=\"id\"\n", + "EMBEDDING_FIELD=\"embedding\"\n", + "\n", + "collection_name = \"simple_wiki\"\n", + "\n", + "def get_milvus_client():\n", + " return pymilvus.MilvusClient(uri=MILVUS_HOST)\n", + "\n", + "client = get_milvus_client()\n", + "\n", + "fields = [\n", + " pymilvus.FieldSchema(name=ID_FIELD, dtype=pymilvus.DataType.INT64, is_primary=True),\n", + " pymilvus.FieldSchema(name=EMBEDDING_FIELD, dtype=pymilvus.DataType.FLOAT_VECTOR, dim=DIM)\n", + "]\n", + "\n", + "schema = pymilvus.CollectionSchema(fields)\n", + "schema.verify()\n", + "\n", + "if collection_name in client.list_collections():\n", + " print(f\"Collection '{collection_name}' already exists. Deleting collection...\")\n", + " client.drop_collection(collection_name)\n", + "\n", + "client.create_collection(collection_name, schema=schema, dimension=DIM, vector_field_name=EMBEDDING_FIELD)\n", + "collection = pymilvus.Collection(name=collection_name, using=client._using)\n", + "collection.release()\n", + "collection.drop_index()\n" + ] + }, + { + "cell_type": "markdown", + "id": "00bd20f5", + "metadata": {}, + "source": [ + "# Setup Sentence Transformer model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a1a6307", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:34.111782Z", + "iopub.status.busy": "2024-11-08T14:47:34.111556Z", + "iopub.status.idle": "2024-11-08T14:47:39.654323Z", + "shell.execute_reply": "2024-11-08T14:47:39.653386Z" + } + }, + "outputs": [], + "source": [ + "# We use the Bi-Encoder to encode all passages, so that we can use it with semantic search\n", + "model_name = 'nq-distilbert-base-v1'\n", + "bi_encoder = SentenceTransformer(model_name)\n", + "\n", + "# As dataset, we use Simple English Wikipedia. Compared to the full English wikipedia, it has only\n", + "# about 170k articles. 
We split these articles into paragraphs and encode them with the bi-encoder\n", + "\n", + "wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'\n", + "\n", + "if not os.path.exists(wikipedia_filepath):\n", + " util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz', wikipedia_filepath)\n", + "\n", + "passages = []\n", + "with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:\n", + " for line in fIn:\n", + " data = json.loads(line.strip())\n", + " for paragraph in data['paragraphs']:\n", + " # We encode the passages as [title, text]\n", + " passages.append([data['title'], paragraph])\n", + "\n", + "# If you like, you can also limit the number of passages you want to use\n", + "print(\"Passages:\", len(passages))\n", + "\n", + "# To speed things up, pre-computed embeddings are downloaded.\n", + "# The provided file encoded the passages with the model 'nq-distilbert-base-v1'\n", + "if model_name == 'nq-distilbert-base-v1':\n", + " embeddings_filepath = 'simplewiki-2020-11-01-nq-distilbert-base-v1.pt'\n", + " if not os.path.exists(embeddings_filepath):\n", + " util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01-nq-distilbert-base-v1.pt', embeddings_filepath)\n", + "\n", + " corpus_embeddings = torch.load(embeddings_filepath, map_location='cpu', weights_only=True).float() # Convert embedding file to float\n", + " #if torch.cuda.is_available():\n", + " # corpus_embeddings = corpus_embeddings.to('cuda')\n", + "else: # Here, we compute the corpus_embeddings from scratch (which can take a while depending on the GPU)\n", + " corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True).to('cpu')" + ] + }, + { + "cell_type": "markdown", + "id": "1f4e9b9d", + "metadata": {}, + "source": [ + "# Vector Search using Milvus and RAPIDS cuVS \n", + "Now that our embeddings are ready to be indexed and that the model has been loaded, we can use Milvus and RAPIDS cuVS to do our vector search.\n", + "\n", + "This is done in 3 steps: First we ingest all the vectors in the Milvus collection, then we build the Milvus index, to finally search it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "563751c1", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:47:39.658832Z", + "iopub.status.busy": "2024-11-08T14:47:39.658374Z", + "iopub.status.idle": "2024-11-08T14:49:47.244768Z", + "shell.execute_reply": "2024-11-08T14:49:47.244162Z" + } + }, + "outputs": [], + "source": [ + "# minio\n", + "MINIO_PORT = 30009\n", + "MINIO_URL = f\"localhost:{MINIO_PORT}\"\n", + "MINIO_SECRET_KEY = \"minioadmin\"\n", + "MINIO_ACCESS_KEY = \"minioadmin\"\n", + "\n", + "def upload_to_minio(file_paths: List[List[str]], remote_paths: List[List[str]], bucket_name=\"milvus-bucket\"):\n", + " minio_client = Minio(endpoint=MINIO_URL, access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=False)\n", + " if not minio_client.bucket_exists(bucket_name):\n", + " minio_client.make_bucket(bucket_name)\n", + "\n", + " for local_batch, remote_batch in zip(file_paths, remote_paths):\n", + " for local_file, remote_file in zip(local_batch, remote_batch):\n", + " minio_client.fput_object(bucket_name, \n", + " object_name=remote_file,\n", + " file_path=local_file,\n", + " part_size=512 * 1024 * 1024,\n", + " num_parallel_uploads=5)\n", + " \n", + " \n", + "def ingest_data_bulk(collection_name, vectors, schema: pymilvus.CollectionSchema, log_times=True, bulk_writer_type=\"milvus\", debug=False):\n", + " print(f\"- Ingesting {len(vectors) // 1000}k vectors, Bulk\")\n", + " tic = time.perf_counter()\n", + " collection = pymilvus.Collection(collection_name, using=get_milvus_client()._using)\n", + " remote_path = None\n", + "\n", + " if bulk_writer_type == 'milvus':\n", + " # # Prepare source data for faster ingestion\n", + " writer = LocalBulkWriter(\n", + " schema=schema,\n", + " local_path='bulk_data',\n", + " segment_size=512 * 1024 * 1024, # Default value\n", + " file_type=BulkFileType.NPY\n", + " )\n", + " for id, vec in enumerate(vectors):\n", + " writer.append_row({ID_FIELD: id, EMBEDDING_FIELD: vec})\n", + "\n", + " if debug:\n", + " print(writer.batch_files)\n", + " def callback(file_list):\n", + " if debug:\n", + " print(f\" - Commit successful\")\n", + " print(file_list)\n", + " writer.commit(call_back=callback)\n", + " files_to_upload = writer.batch_files\n", + " elif bulk_writer_type == 'dask':\n", + " # Prepare source data for faster ingestion\n", + " if not os.path.isdir(\"bulk_data\"):\n", + " os.mkdir(\"bulk_data\")\n", + "\n", + " from dask.distributed import Client, LocalCluster\n", + " cluster = LocalCluster(n_workers=1, threads_per_worker=1)\n", + " client = Client(cluster)\n", + "\n", + " chunk_size = 100000\n", + " da_vectors = da.from_array(vectors, chunks=(chunk_size, vectors.shape[1]))\n", + " da_ids = da.arange(len(vectors), chunks=(chunk_size,))\n", + " da.to_npy_stack(\"bulk_data/da_embedding/\", da_vectors)\n", + " da.to_npy_stack(\"bulk_data/da_id/\", da_ids)\n", + " files_to_upload = []\n", + " remote_path = []\n", + " for chunk_nb in range(math.ceil(len(vectors) / chunk_size)):\n", + " files_to_upload.append([f\"bulk_data/da_embedding/{chunk_nb}.npy\", f\"bulk_data/da_id/{chunk_nb}.npy\"])\n", + " remote_path.append([f\"bulk_data/da_{chunk_nb}/embedding.npy\", f\"bulk_data/da__{chunk_nb}/id.npy\"])\n", + "\n", + " elif bulk_writer_type == 'numpy':\n", + " # Directly save NPY files\n", + " np.save(\"bulk_data/embedding.npy\", vectors)\n", + " np.save(\"bulk_data/id.npy\", np.arange(len(vectors)))\n", + " files_to_upload = [[\"bulk_data/embedding.npy\", \"bulk_data/id.npy\"]]\n", + " else:\n", 
+ " raise ValueError(\"Invalid bulk writer type\")\n", + " \n", + " toc = time.perf_counter()\n", + " if log_times:\n", + " print(f\" - File save time: {toc - tic:.2f} seconds\")\n", + " # Import data\n", + " if remote_path is None:\n", + " remote_path = files_to_upload\n", + " upload_to_minio(files_to_upload, remote_path)\n", + " \n", + " job_ids = [utility.do_bulk_insert(collection_name, batch, using=get_milvus_client()._using) for batch in remote_path]\n", + "\n", + " while True:\n", + " tasks = [utility.get_bulk_insert_state(job_id, using=get_milvus_client()._using) for job_id in job_ids]\n", + " success = all(task.state_name == \"Completed\" for task in tasks)\n", + " failure = any(task.state_name == \"Failed\" for task in tasks)\n", + " for i in range(len(tasks)):\n", + " task = tasks[i]\n", + " if debug:\n", + " print(f\" - Task {i}/{len(tasks)} state: {task.state_name}, Progress percent: {task.infos['progress_percent']}, Imported row count: {task.row_count}\")\n", + " if task.state_name == \"Failed\":\n", + " print(task)\n", + " if success or failure:\n", + " break\n", + " time.sleep(2)\n", + "\n", + " added_entities = str(sum([task.row_count for task in tasks]))\n", + " failure = failure or added_entities != str(len(vectors))\n", + " if failure:\n", + " print(f\"- Ingestion failed. Added entities: {added_entities}\")\n", + " toc = time.perf_counter()\n", + " if log_times:\n", + " datasize = vectors.nbytes / 1024 / 1024\n", + " print(f\"- Ingestion time: {toc - tic:.2f} seconds. ({(datasize / (toc-tic)):.2f}MB/s)\")\n", + "\n", + "ingest_data_bulk(collection_name, np.array(corpus_embeddings), schema, bulk_writer_type='dask', log_times=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad90b4be", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:49:47.247498Z", + "iopub.status.busy": "2024-11-08T14:49:47.247268Z", + "iopub.status.idle": "2024-11-08T14:50:00.737502Z", + "shell.execute_reply": "2024-11-08T14:50:00.736808Z" + } + }, + "outputs": [], + "source": [ + "# Setups the IVFPQ index\n", + "\n", + "index_params = dict(\n", + " index_type=\"GPU_IVF_PQ\",\n", + " metric_type=\"L2\",\n", + " params={\"nlist\": 150, # Number of clusters\n", + " \"m\": 96}) # Product Quantization dimension\n", + "\n", + "# Drop the index if it exists\n", + "if collection.has_index():\n", + " collection.release()\n", + " collection.drop_index()\n", + "\n", + "# Create the index\n", + "tic = time.perf_counter()\n", + "collection.create_index(field_name=EMBEDDING_FIELD, index_params=index_params)\n", + "collection.load()\n", + "toc = time.perf_counter()\n", + "print(f\"- Index creation time: {toc - tic:.4f} seconds. 
({index_params})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c75acea7", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:00.740443Z", + "iopub.status.busy": "2024-11-08T14:50:00.740142Z", + "iopub.status.idle": "2024-11-08T14:50:00.745403Z", + "shell.execute_reply": "2024-11-08T14:50:00.744672Z" + } + }, + "outputs": [], + "source": [ + "# Search the index\n", + "def search_cuvs_pq(query, top_k = 5, n_probe = 30):\n", + " # Encode the query using the bi-encoder and find potentially relevant passages\n", + " question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n", + "\n", + " search_params = {\"nprobe\": n_probe}\n", + " tic = time.perf_counter()\n", + " hits = collection.search(\n", + " data=np.array(question_embedding[None].cpu()), anns_field=EMBEDDING_FIELD, param=search_params, limit=top_k\n", + " )\n", + " toc = time.perf_counter()\n", + "\n", + " # Output of top-k hits\n", + " print(\"Input question:\", query)\n", + " print(\"Results (after {:.3f} ms):\".format((toc - tic)*1000))\n", + " for k in range(top_k):\n", + " print(\"\\t{:.3f}\\t{}\".format(hits[0][k].distance, passages[hits[0][k].id]))" + ] + }, + { + "cell_type": "markdown", + "id": "07935bca", + "metadata": {}, + "source": [ + "The ideal use-case for the IVF-PQ algorithm is when there is a need to reduce the memory footprint while keeping a good accuracy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c27d4715", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:00.748001Z", + "iopub.status.busy": "2024-11-08T14:50:00.747783Z", + "iopub.status.idle": "2024-11-08T14:50:01.785914Z", + "shell.execute_reply": "2024-11-08T14:50:01.785223Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_pq(query=\"Who was Grace Hopper?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc375518", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:01.788877Z", + "iopub.status.busy": "2024-11-08T14:50:01.788640Z", + "iopub.status.idle": "2024-11-08T14:50:01.813820Z", + "shell.execute_reply": "2024-11-08T14:50:01.813153Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_pq(query=\"Who was Alan Turing?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab154181", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:01.816625Z", + "iopub.status.busy": "2024-11-08T14:50:01.816362Z", + "iopub.status.idle": "2024-11-08T14:50:01.839593Z", + "shell.execute_reply": "2024-11-08T14:50:01.838986Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_pq(query = \"What is creating tides?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "836344ec", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:01.842319Z", + "iopub.status.busy": "2024-11-08T14:50:01.842022Z", + "iopub.status.idle": "2024-11-08T14:50:15.969324Z", + "shell.execute_reply": "2024-11-08T14:50:15.968562Z" + } + }, + "outputs": [], + "source": [ + "# Drop the current index if it exists\n", + "if collection.has_index():\n", + " collection.release()\n", + " collection.drop_index()\n", + "\n", + "# Create the IVF Flat index\n", + "index_params = dict(\n", + " index_type=\"GPU_IVF_FLAT\",\n", + " metric_type=\"L2\",\n", + " params={\"nlist\": 150}) # Number of clusters)\n", + "tic = time.perf_counter()\n", + "collection.create_index(field_name=EMBEDDING_FIELD, index_params=index_params)\n", + "collection.load()\n", + 
"toc = time.perf_counter()\n", + "print(f\"- Index creation time: {toc - tic:.4f} seconds. ({index_params})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "2d6017ed", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:15.972764Z", + "iopub.status.busy": "2024-11-08T14:50:15.972368Z", + "iopub.status.idle": "2024-11-08T14:50:15.977806Z", + "shell.execute_reply": "2024-11-08T14:50:15.977064Z" + } + }, + "outputs": [], + "source": [ + "def search_cuvs_flat(query, top_k = 5, n_probe = 30):\n", + " # Encode the query using the bi-encoder and find potentially relevant passages\n", + " question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n", + " \n", + " search_params = {\"nprobe\": n_probe}\n", + " tic = time.perf_counter()\n", + " hits = collection.search(\n", + " data=np.array(question_embedding[None].cpu()), anns_field=EMBEDDING_FIELD, param=search_params, limit=top_k\n", + " )\n", + " toc = time.perf_counter()\n", + "\n", + " # Output of top-k hits\n", + " print(\"Input question:\", query)\n", + " print(\"Results (after {:.3f} ms):\".format((toc - tic)*1000))\n", + " for k in range(top_k):\n", + " print(\"\\t{:.3f}\\t{}\".format(hits[0][k].distance, passages[hits[0][k].id]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5cfb644", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:15.980796Z", + "iopub.status.busy": "2024-11-08T14:50:15.980408Z", + "iopub.status.idle": "2024-11-08T14:50:16.009271Z", + "shell.execute_reply": "2024-11-08T14:50:16.008579Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_flat(query=\"Who was Grace Hopper?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5694d00", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:16.012253Z", + "iopub.status.busy": "2024-11-08T14:50:16.011924Z", + "iopub.status.idle": "2024-11-08T14:50:16.043432Z", + "shell.execute_reply": "2024-11-08T14:50:16.042751Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_flat(query=\"Who was Alan Turing?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcfc3c5b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:16.046439Z", + "iopub.status.busy": "2024-11-08T14:50:16.046093Z", + "iopub.status.idle": "2024-11-08T14:50:16.071322Z", + "shell.execute_reply": "2024-11-08T14:50:16.070614Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_flat(query = \"What is creating tides?\")" + ] + }, + { + "cell_type": "markdown", + "id": "a59d7b32-0832-4c3a-864e-aeb2e6e7fe1f", + "metadata": {}, + "source": [ + "## Using CAGRA: Hybrid GPU-CPU graph-based Vector Search\n", + "\n", + "CAGRA is a graph-based nearest neighbors implementation with state-of-the art performance for both small- and large-batch sized vector searches. \n", + "\n", + "CAGRA follows the same steps as IVF-FLAT and IVF-PQ in Milvus, but is also able to be adapted for querying on CPU.\n", + "This means that CAGRA is able to profit from a high training speed on GPU, as well as a low inference time on CPU, that minimize latency even on the smallest queries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5ce4dab", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:16.074449Z", + "iopub.status.busy": "2024-11-08T14:50:16.074128Z", + "iopub.status.idle": "2024-11-08T14:50:30.479027Z", + "shell.execute_reply": "2024-11-08T14:50:30.478265Z" + } + }, + "outputs": [], + "source": [ + "# Drop the current index if it exists\n", + "if collection.has_index():\n", + " collection.release()\n", + " collection.drop_index()\n", + "\n", + "# Create the IVF Flat index\n", + "index_params = dict(\n", + " index_type=\"GPU_CAGRA\",\n", + " metric_type=\"L2\",\n", + " params={\"graph_degree\": 64, \"intermediate_graph_degree\": 128, \"build_algo\": \"NN_DESCENT\", \"adapt_for_cpu\": True})\n", + "tic = time.perf_counter()\n", + "collection.create_index(field_name=EMBEDDING_FIELD, index_params=index_params)\n", + "collection.load()\n", + "toc = time.perf_counter()\n", + "print(f\"- Index creation time: {toc - tic:.4f} seconds. ({index_params})\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "df229e21-f6b6-4d6c-ad54-2724f8738934", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:30.481748Z", + "iopub.status.busy": "2024-11-08T14:50:30.481474Z", + "iopub.status.idle": "2024-11-08T14:50:30.486324Z", + "shell.execute_reply": "2024-11-08T14:50:30.485696Z" + } + }, + "outputs": [], + "source": [ + "def search_cuvs_cagra(query, top_k = 5, itopk = 32):\n", + " # Encode the query using the bi-encoder and find potentially relevant passages\n", + " question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n", + "\n", + " search_params = {\"params\": {\"itopk\": itopk, \"ef\": 35}}\n", + " tic = time.perf_counter()\n", + " hits = collection.search(\n", + " data=np.array(question_embedding[None].cpu()), anns_field=EMBEDDING_FIELD, param=search_params, limit=top_k\n", + " )\n", + " toc = time.perf_counter()\n", + "\n", + " # Output of top-k hits\n", + " print(\"Input question:\", query)\n", + " print(\"Results (after {:.3f} ms):\".format((toc - tic)*1000))\n", + " for k in range(top_k):\n", + " print(\"\\t{:.3f}\\t{}\".format(hits[0][k].distance, passages[hits[0][k].id]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5e862fd-b7e5-4423-8fbf-36918f02c8f3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:30.489077Z", + "iopub.status.busy": "2024-11-08T14:50:30.488790Z", + "iopub.status.idle": "2024-11-08T14:50:30.513998Z", + "shell.execute_reply": "2024-11-08T14:50:30.513319Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_cagra(query=\"Who was Grace Hopper?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb8a5b7b", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:30.516748Z", + "iopub.status.busy": "2024-11-08T14:50:30.516521Z", + "iopub.status.idle": "2024-11-08T14:50:30.538982Z", + "shell.execute_reply": "2024-11-08T14:50:30.538269Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_cagra(query=\"Who was Alan Turing?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c89810a", + "metadata": { + "execution": { + "iopub.execute_input": "2024-11-08T14:50:30.541508Z", + "iopub.status.busy": "2024-11-08T14:50:30.541287Z", + "iopub.status.idle": "2024-11-08T14:50:30.562722Z", + "shell.execute_reply": "2024-11-08T14:50:30.562085Z" + } + }, + "outputs": [], + "source": [ + "search_cuvs_cagra(query=\"What is creating tides?\")" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/cuvs/README.md b/python/cuvs/README.md index e69de29bb..27b494811 100644 --- a/python/cuvs/README.md +++ b/python/cuvs/README.md @@ -0,0 +1,3 @@ +# cuVS + +cuVS contains state-of-the-art implementations of several algorithms for running approximate nearest neighbors and clustering on the GPU. It can be used directly or through the various databases and other libraries that have integrated it. The primary goal of cuVS is to simplify the use of GPUs for vector similarity search and clustering. diff --git a/python/cuvs/cuvs/neighbors/hnsw/__init__.py b/python/cuvs/cuvs/neighbors/hnsw/__init__.py index 5efcdf68b..fafff7d03 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/__init__.py +++ b/python/cuvs/cuvs/neighbors/hnsw/__init__.py @@ -13,10 +13,23 @@ # limitations under the License. -from .hnsw import Index, SearchParams, from_cagra, load, save, search +from .hnsw import ( + ExtendParams, + Index, + IndexParams, + SearchParams, + extend, + from_cagra, + load, + save, + search, +) __all__ = [ + "IndexParams", "Index", + "ExtendParams", + "extend", "SearchParams", "load", "save", diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd index 1cdc97406..e0c517933 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pxd @@ -20,14 +20,25 @@ from libc.stdint cimport int32_t, uintptr_t from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor from cuvs.distance_type cimport cuvsDistanceType +from cuvs.neighbors.cagra.cagra cimport cuvsCagraIndex_t cdef extern from "cuvs/neighbors/hnsw.h" nogil: - ctypedef struct cuvsHnswSearchParams: - int32_t ef - int32_t numThreads - ctypedef cuvsHnswSearchParams* cuvsHnswSearchParams_t + ctypedef enum cuvsHnswHierarchy: + NONE + CPU + + ctypedef struct cuvsHnswIndexParams: + cuvsHnswHierarchy hierarchy + int32_t ef_construction + int32_t num_threads + + ctypedef cuvsHnswIndexParams* cuvsHnswIndexParams_t + + cuvsError_t cuvsHnswIndexParamsCreate(cuvsHnswIndexParams_t* params) + + cuvsError_t cuvsHnswIndexParamsDestroy(cuvsHnswIndexParams_t params) ctypedef struct cuvsHnswIndex: uintptr_t addr @@ -39,6 +50,31 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil: cuvsError_t cuvsHnswIndexDestroy(cuvsHnswIndex_t index) + ctypedef struct cuvsHnswExtendParams: + int32_t num_threads + + ctypedef cuvsHnswExtendParams* cuvsHnswExtendParams_t + + cuvsError_t cuvsHnswExtendParamsCreate(cuvsHnswExtendParams_t* params) + + cuvsError_t cuvsHnswExtendParamsDestroy(cuvsHnswExtendParams_t params) + + cuvsError_t cuvsHnswFromCagra(cuvsResources_t res, + cuvsHnswIndexParams_t params, + cuvsCagraIndex_t cagra_index, + cuvsHnswIndex_t hnsw_index) except + + + cuvsError_t cuvsHnswExtend(cuvsResources_t res, + cuvsHnswExtendParams_t params, + DLManagedTensor* data, + cuvsHnswIndex_t index) except + + + ctypedef struct cuvsHnswSearchParams: + int32_t ef + int32_t num_threads + + ctypedef cuvsHnswSearchParams* cuvsHnswSearchParams_t + cuvsError_t cuvsHnswSearch(cuvsResources_t res, 
cuvsHnswSearchParams* params, cuvsHnswIndex_t index, @@ -46,7 +82,12 @@ cdef extern from "cuvs/neighbors/hnsw.h" nogil: DLManagedTensor* neighbors, DLManagedTensor* distances) except + + cuvsError_t cuvsHnswSerialize(cuvsResources_t res, + const char * filename, + cuvsHnswIndex_t index) except + + cuvsError_t cuvsHnswDeserialize(cuvsResources_t res, + cuvsHnswIndexParams_t params, const char * filename, int32_t dim, cuvsDistanceType metric, diff --git a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx index bcfaf167e..4c44350e8 100644 --- a/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx +++ b/python/cuvs/cuvs/neighbors/hnsw/hnsw.pyx @@ -39,41 +39,63 @@ from pylibraft.common.cai_wrapper import wrap_array from pylibraft.common.interruptible import cuda_interruptible -cdef class SearchParams: +cdef class IndexParams: """ - HNSW search parameters + Parameters to build index for HNSW nearest neighbor search Parameters ---------- - ef: int, default = 200 - Maximum number of candidate list size used during search. - num_threads: int, default = 0 - Number of CPU threads used to increase search parallelism. - When set to 0, the number of threads is automatically determined - using OpenMP's `omp_get_max_threads()`. + hierarchy : string, default = "none" (optional) + The hierarchy of the HNSW index. Valid values are ["none", "cpu"]. + - "none": No hierarchy is built. + - "cpu": Hierarchy is built using CPU. + ef_construction : int, default = 200 (optional) + Maximum number of candidate list size used during construction + when hierarchy is `cpu`. + num_threads : int, default = 2 (optional) + Number of CPU threads used to increase construction parallelism + when hierarchy is `cpu`. + NOTE: Constructing the hierarchy when converting from a CAGRA graph + is highly sensitive to parallelism, and increasing the number of + threads can reduce the quality of the index. """ - cdef cuvsHnswSearchParams params + cdef cuvsHnswIndexParams* params + + def __cinit__(self): + check_cuvs(cuvsHnswIndexParamsCreate(&self.params)) + + def __dealloc__(self): + check_cuvs(cuvsHnswIndexParamsDestroy(self.params)) def __init__(self, *, - ef=200, - num_threads=0): - self.params.ef = ef - self.params.numThreads = num_threads + hierarchy="none", + ef_construction=200, + num_threads=2): + if hierarchy == "none": + self.params.hierarchy = cuvsHnswHierarchy.NONE + elif hierarchy == "cpu": + self.params.hierarchy = cuvsHnswHierarchy.CPU + else: + raise ValueError("Invalid hierarchy type." 
+ " Valid values are 'none' and 'cpu'.") + self.params.ef_construction = ef_construction + self.params.num_threads = num_threads - def __repr__(self): - attr_str = [attr + "=" + str(getattr(self, attr)) - for attr in [ - "ef", "num_threads"]] - return "SearchParams(type=HNSW, " + (", ".join(attr_str)) + ")" + @property + def hierarchy(self): + if self.params.hierarchy == cuvsHnswHierarchy.NONE: + return "none" + elif self.params.hierarchy == cuvsHnswHierarchy.CPU: + return "cpu" @property - def ef(self): - return self.params.ef + def ef_construction(self): + return self.params.ef_construction @property def num_threads(self): - return self.params.numThreads + return self.params.num_threads cdef class Index: @@ -103,13 +125,44 @@ cdef class Index: return "Index(type=HNSW, metric=L2" + (", ".join(attr_str)) + ")" +cdef class ExtendParams: + """ + Parameters to extend the HNSW index with new data + + Parameters + ---------- + num_threads : int, default = 0 (optional) + Number of CPU threads used to increase construction parallelism. + When set to 0, the number of threads is automatically determined. + """ + + cdef cuvsHnswExtendParams* params + + def __cinit__(self): + check_cuvs(cuvsHnswExtendParamsCreate(&self.params)) + + def __dealloc__(self): + check_cuvs(cuvsHnswExtendParamsDestroy(self.params)) + + def __init__(self, *, + num_threads=0): + self.params.num_threads = num_threads + + @property + def num_threads(self): + return self.params.num_threads + + @auto_sync_resources -def save(filename, cagra.Index index, resources=None): +def save(filename, Index index, resources=None): """ Saves the CAGRA index to a file as an hnswlib index. - The saved index is immutable and can only be searched by the hnswlib - wrapper in cuVS, as the format is not compatible with the original - hnswlib. + If the index was constructed with `hnsw.IndexParams(hierarchy="none")`, + then the saved index is immutable and can only be searched by the hnswlib + wrapper in cuVS, as the format is not compatible with the original hnswlib. + However, if the index was constructed with + `hnsw.IndexParams(hierarchy="cpu")`, then the saved index is mutable and + compatible with the original hnswlib. Saving / loading the index is experimental. The serialization format is subject to change. @@ -119,7 +172,7 @@ def save(filename, cagra.Index index, resources=None): filename : string Name of the file. index : Index - Trained CAGRA index. + Trained HNSW index. {resources_docstring} Examples @@ -131,23 +184,28 @@ def save(filename, cagra.Index index, resources=None): >>> dataset = cp.random.random_sample((n_samples, n_features), ... 
dtype=cp.float32) >>> # Build index - >>> index = cagra.build(cagra.IndexParams(), dataset) + >>> cagra_index = cagra.build(cagra.IndexParams(), dataset) >>> # Serialize and deserialize the cagra index built - >>> hnsw.save("my_index.bin", index) + >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(), cagra_index) + >>> hnsw.save("my_index.bin", hnsw_index) """ cdef string c_filename = filename.encode('utf-8') cdef cuvsResources_t res = resources.get_c_obj() - check_cuvs(cagra.cuvsCagraSerializeToHnswlib(res, - c_filename.c_str(), - index.index)) + check_cuvs(cuvsHnswSerialize(res, + c_filename.c_str(), + index.index)) @auto_sync_resources -def load(filename, dim, dtype, metric="sqeuclidean", resources=None): +def load(IndexParams index_params, filename, dim, dtype, metric="sqeuclidean", + resources=None): """ - Loads base-layer-only hnswlib index from file, which was originally - saved as a built CAGRA index. The loaded index is immutable and can only - be searched by the hnswlib wrapper in cuVS, as the format is not + Loads an HNSW index. + If the index was constructed with `hnsw.IndexParams(hierarchy="none")`, + then the loaded index is immutable and can only be searched by the hnswlib + wrapper in cuVS, as the format is not compatible with the original hnswlib. + However, if the index was constructed with + `hnsw.IndexParams(hierarchy="cpu")`, then the loaded index is mutable and compatible with the original hnswlib. Saving / loading the index is experimental. The serialization format is @@ -156,6 +214,8 @@ def load(filename, dim, dtype, metric="sqeuclidean", resources=None): Parameters ---------- + index_params : IndexParams + Parameters that were used to convert CAGRA index to HNSW index. filename : string Name of the file. dim : int @@ -214,6 +274,7 @@ def load(filename, dim, dtype, metric="sqeuclidean", resources=None): check_cuvs(cuvsHnswDeserialize( res, + index_params.params, c_filename.c_str(), dim, distance_type, @@ -224,26 +285,30 @@ def load(filename, dim, dtype, metric="sqeuclidean", resources=None): @auto_sync_resources -def from_cagra(cagra.Index index, temporary_index_path=None, resources=None): +def from_cagra(IndexParams index_params, cagra.Index cagra_index, + temporary_index_path=None, resources=None): """ - Returns an hnsw base-layer-only index from a CAGRA index. - - NOTE: This method uses the filesystem to write the CAGRA index in - `/tmp/.bin` or the parameter `temporary_index_path` - if not None before reading it as an hnsw index, - then deleting the temporary file. The returned index is immutable - and can only be searched by the hnsw wrapper in cuVS, as the - format is not compatible with the original hnswlib library. - By `base_layer_only`, we mean that the hnsw index is created - without the additional layers that are used for the hierarchical - search in hnswlib. Instead, the base layer is used for the search. + Returns an HNSW index from a CAGRA index. + + NOTE: When `index_params.hierarchy` is: + 1. `NONE`: This method uses the filesystem to write the CAGRA index + in `/tmp/.bin` before reading it as an + hnswlib index, then deleting the temporary file. The + returned index is immutable and can only be searched by + the hnswlib wrapper in cuVS, as the format is not + compatible with the original hnswlib. + 2. `CPU`: The returned index is mutable and can be extended with + additional vectors. The serialized index is also compatible + with the original hnswlib library. Saving / loading the index is experimental. The serialization format is subject to change. 
     Parameters
     ----------
-    index : Index
+    index_params : IndexParams
+        Parameters to convert the CAGRA index to HNSW index.
+    cagra_index : cagra.Index
         Trained CAGRA index.
     temporary_index_path : string, default = None
         Path to save the temporary index file. If None, the temporary file
@@ -262,18 +327,107 @@ def from_cagra(cagra.Index index, temporary_index_path=None, resources=None):
     >>> # Build index
     >>> index = cagra.build(cagra.IndexParams(), dataset)
     >>> # Serialize the CAGRA index to hnswlib base layer only index format
-    >>> hnsw_index = hnsw.from_cagra(index)
+    >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(), index)
     """
-    uuid_num = uuid.uuid4()
-    filename = temporary_index_path if temporary_index_path else \
-        f"/tmp/{uuid_num}.bin"
-    save(filename, index, resources=resources)
-    hnsw_index = load(filename, index.dim, np.dtype(index.active_index_type),
-                      "sqeuclidean", resources=resources)
-    os.remove(filename)
+
+    cdef Index hnsw_index = Index()
+    cdef cuvsResources_t res = resources.get_c_obj()
+    check_cuvs(cuvsHnswFromCagra(
+        res,
+        index_params.params,
+        cagra_index.index,
+        hnsw_index.index
+    ))
+
+    hnsw_index.trained = True
     return hnsw_index


+@auto_sync_resources
+def extend(ExtendParams extend_params, Index index, data, resources=None):
+    """
+    Extends the HNSW index with new data.
+
+    Parameters
+    ----------
+    extend_params : ExtendParams
+    index : Index
+        Trained HNSW index.
+    data : Host array interface compliant matrix shape (n_samples, dim)
+        Supported dtype [float32, int8, uint8]
+    {resources_docstring}
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from cuvs.neighbors import hnsw, cagra
+    >>>
+    >>> n_samples = 50000
+    >>> n_features = 50
+    >>> dataset = np.random.random_sample((n_samples, n_features))
+    >>>
+    >>> # Build index
+    >>> index = cagra.build(cagra.IndexParams(), dataset)
+    >>> # Convert CAGRA index to HNSW
+    >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(hierarchy="cpu"), index)
+    >>> # Extend the index with new data
+    >>> new_data = np.random.random_sample((n_samples, n_features))
+    >>> hnsw.extend(hnsw.ExtendParams(), hnsw_index, new_data)
+    """
+
+    data_ai = wrap_array(data)
+    _check_input_array(data_ai, [np.dtype('float32'),
+                                 np.dtype('uint8'),
+                                 np.dtype('int8')])
+
+    cdef cydlpack.DLManagedTensor* data_dlpack = cydlpack.dlpack_c(data_ai)
+    cdef cuvsResources_t res = resources.get_c_obj()
+
+    check_cuvs(cuvsHnswExtend(
+        res,
+        extend_params.params,
+        data_dlpack,
+        index.index
+    ))
+
+
+cdef class SearchParams:
+    """
+    HNSW search parameters
+
+    Parameters
+    ----------
+    ef: int, default = 200
+        Maximum size of the candidate list used during search.
+    num_threads: int, default = 0
+        Number of CPU threads used to increase search parallelism.
+        When set to 0, the number of threads is automatically determined
+        using OpenMP's `omp_get_max_threads()`.
+    """
+
+    cdef cuvsHnswSearchParams params
+
+    def __init__(self, *,
+                 ef=200,
+                 num_threads=0):
+        self.params.ef = ef
+        self.params.num_threads = num_threads
+
+    def __repr__(self):
+        attr_str = [attr + "=" + str(getattr(self, attr))
+                    for attr in [
+                        "ef", "num_threads"]]
+        return "SearchParams(type=HNSW, " + (", ".join(attr_str)) + ")"
+
+    @property
+    def ef(self):
+        return self.params.ef
+
+    @property
+    def num_threads(self):
+        return self.params.num_threads
+
+
 @auto_sync_resources
 @auto_convert_output
 def search(SearchParams search_params,
@@ -290,15 +444,15 @@ def search(SearchParams search_params,
     ----------
     search_params : SearchParams
     index : Index
-        Trained CAGRA index.
-    queries : CUDA array interface compliant matrix shape (n_samples, dim)
+        Trained HNSW index.
+    queries : CPU array interface compliant matrix shape (n_samples, dim)
         Supported dtype [float, int]
     k : int
         The number of neighbors.
-    neighbors : Optional CUDA array interface compliant matrix shape
+    neighbors : Optional CPU array interface compliant matrix shape
                 (n_queries, k), dtype uint64_t. If supplied, neighbor
                 indices will be written here in-place. (default None)
-    distances : Optional CUDA array interface compliant matrix shape
+    distances : Optional CPU array interface compliant matrix shape
                 (n_queries, k) If supplied, the distances to the
                 neighbors will be written here in-place. (default None)
     {resources_docstring}
@@ -323,7 +477,7 @@ def search(SearchParams search_params,
     ...     num_threads=0
     ... )
     >>> # Convert CAGRA index to HNSW
-    >>> hnsw_index = hnsw.from_cagra(index)
+    >>> hnsw_index = hnsw.from_cagra(hnsw.IndexParams(), index)
     >>> # Using a pooling allocator reduces overhead of temporary array
     >>> # creation during search. This is useful if multiple searches
     >>> # are performed with same query size.
diff --git a/python/cuvs/cuvs/test/test_hnsw.py b/python/cuvs/cuvs/test/test_hnsw.py
index 20a35401e..20f583ae8 100644
--- a/python/cuvs/cuvs/test/test_hnsw.py
+++ b/python/cuvs/cuvs/test/test_hnsw.py
@@ -32,6 +32,7 @@ def run_hnsw_build_search_test(
     build_algo="ivf_pq",
     intermediate_graph_degree=128,
     graph_degree=64,
+    hierarchy="none",
     search_params={},
 ):
     dataset = generate_data((n_rows, n_cols), dtype)
@@ -53,7 +54,8 @@ def run_hnsw_build_search_test(

     assert index.trained

-    hnsw_index = hnsw.from_cagra(index)
+    hnsw_params = hnsw.IndexParams(hierarchy=hierarchy, num_threads=1)
+    hnsw_index = hnsw.from_cagra(hnsw_params, index)

     queries = generate_data((n_queries, n_cols), dtype)

@@ -83,10 +85,93 @@ def run_hnsw_build_search_test(
 @pytest.mark.parametrize("num_threads", [2, 4])
 @pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"])
 @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"])
-def test_hnsw(dtype, k, ef, num_threads, metric, build_algo):
+@pytest.mark.parametrize("hierarchy", ["none", "cpu"])
+def test_hnsw(dtype, k, ef, num_threads, metric, build_algo, hierarchy):
     # Note that inner_product tests use normalized input which we cannot
     # represent in int8, therefore we test only sqeuclidean metric here.
     run_hnsw_build_search_test(
+        dtype=dtype,
+        k=k,
+        metric=metric,
+        build_algo=build_algo,
+        hierarchy=hierarchy,
+        search_params={"ef": ef, "num_threads": num_threads},
+    )
+
+
+def run_hnsw_extend_test(
+    n_rows=10000,
+    add_rows=2000,
+    n_cols=10,
+    n_queries=100,
+    k=10,
+    dtype=np.float32,
+    metric="sqeuclidean",
+    build_algo="ivf_pq",
+    intermediate_graph_degree=128,
+    graph_degree=64,
+    search_params={},
+):
+    dataset = generate_data((n_rows, n_cols), dtype)
+    add_dataset = generate_data((add_rows, n_cols), dtype)
+    if metric == "inner_product":
+        dataset = normalize(dataset, norm="l2", axis=1)
+        add_dataset = normalize(add_dataset, norm="l2", axis=1)
+        if dtype in [np.int8, np.uint8]:
+            pytest.skip(
+                "inner_product metric is not supported for int8/uint8 data"
+            )
+        if build_algo == "nn_descent":
+            pytest.skip("inner_product metric is not supported for nn_descent")
+
+    build_params = cagra.IndexParams(
+        metric=metric,
+        intermediate_graph_degree=intermediate_graph_degree,
+        graph_degree=graph_degree,
+        build_algo=build_algo,
+    )
+
+    index = cagra.build(build_params, dataset)
+
+    assert index.trained
+
+    hnsw_params = hnsw.IndexParams(hierarchy="cpu", num_threads=1)
+    hnsw_index = hnsw.from_cagra(hnsw_params, index)
+    hnsw.extend(hnsw.ExtendParams(), hnsw_index, add_dataset)
+
+    queries = generate_data((n_queries, n_cols), dtype)
+
+    search_params = hnsw.SearchParams(**search_params)
+
+    out_dist, out_idx = hnsw.search(search_params, hnsw_index, queries, k)
+
+    # Calculate reference values with sklearn
+    skl_metric = {
+        "sqeuclidean": "sqeuclidean",
+        "inner_product": "cosine",
+        "euclidean": "euclidean",
+    }[metric]
+    nn_skl = NearestNeighbors(
+        n_neighbors=k, algorithm="brute", metric=skl_metric
+    )
+    nn_skl.fit(np.vstack([dataset, add_dataset]))
+    skl_dist, skl_idx = nn_skl.kneighbors(queries, return_distance=True)
+
+    recall = calc_recall(out_idx, skl_idx)
+    print(recall)
+    assert recall > 0.95
+
+
+@pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
+@pytest.mark.parametrize("k", [10, 20])
+@pytest.mark.parametrize("ef", [30, 40])
+@pytest.mark.parametrize("num_threads", [2, 4])
+@pytest.mark.parametrize("metric", ["sqeuclidean"])
+@pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"])
+def test_hnsw_extend(dtype, k, ef, num_threads, metric, build_algo):
+    # Note that inner_product tests use normalized input which we cannot
+    # represent in int8, therefore we test only sqeuclidean metric here.
+    run_hnsw_extend_test(
         dtype=dtype,
         k=k,
         metric=metric,
diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
index f1a7f272c..90a561bca 100644
--- a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
+++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_cagra_hnswlib.yaml
@@ -4,8 +4,11 @@ constraints:
 groups:
   base:
     build:
-      graph_degree: [32, 64, 128, 256]
+      graph_degree: [32, 64, 96, 128]
       intermediate_graph_degree: [32, 64, 96, 128]
       graph_build_algo: ["NN_DESCENT"]
+      hierarchy: ["none", "cpu"]
+      ef_construction: [64, 128, 256, 512]
+      num_threads: [2, 5, 10]
     search:
       ef: [10, 20, 40, 60, 80, 120, 200, 400, 600, 800]
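
Note: the Python-side changes above add `hnsw.IndexParams`, `hnsw.from_cagra`, `hnsw.extend`, and `hnsw.SearchParams`. The sketch below (not part of the diff) pieces the new API together end to end, following the docstrings and tests in this patch; the dataset shapes and parameter values are illustrative assumptions only.

    import numpy as np
    from cuvs.neighbors import cagra, hnsw

    # Build a CAGRA index on the GPU (sizes are arbitrary here).
    dataset = np.random.random_sample((10000, 64)).astype(np.float32)
    cagra_index = cagra.build(cagra.IndexParams(graph_degree=64), dataset)

    # Convert it to an HNSW index with a CPU-built hierarchy; unlike
    # hierarchy="none", the resulting index is mutable.
    hnsw_index = hnsw.from_cagra(
        hnsw.IndexParams(hierarchy="cpu", num_threads=4), cagra_index
    )

    # Extend the mutable index with additional host-resident vectors.
    extra = np.random.random_sample((1000, 64)).astype(np.float32)
    hnsw.extend(hnsw.ExtendParams(), hnsw_index, extra)

    # Search on the CPU through the hnswlib wrapper.
    queries = np.random.random_sample((100, 64)).astype(np.float32)
    distances, neighbors = hnsw.search(
        hnsw.SearchParams(ef=60, num_threads=4), hnsw_index, queries, 10
    )

    # Per the from_cagra docstring, a CPU-hierarchy index serializes to a
    # format readable by the original hnswlib.
    hnsw.save("my_index.bin", hnsw_index)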