From 0099a3a260892ec9e4f9f4fcd42e25d1b475758e Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 8 Jul 2024 13:35:16 +0000 Subject: [PATCH 01/10] forwards mean, var, stddev & sum implementation to reduce, also added KahanBabushkaNeumaierSum to strided summation --- cpp/include/raft/core/math.hpp | 6 +- .../raft/linalg/detail/strided_reduction.cuh | 70 ++++++---- cpp/include/raft/stats/detail/mean.cuh | 65 ++-------- cpp/include/raft/stats/detail/stddev.cuh | 121 +++++------------- cpp/include/raft/stats/detail/sum.cuh | 99 +------------- cpp/test/stats/cov.cu | 3 +- cpp/test/stats/mean.cu | 100 ++++++++++----- cpp/test/stats/stddev.cu | 11 +- cpp/test/stats/sum.cu | 63 ++++----- 9 files changed, 199 insertions(+), 339 deletions(-) diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp index 9ce768bf40..e082aaf41a 100644 --- a/cpp/include/raft/core/math.hpp +++ b/cpp/include/raft/core/math.hpp @@ -56,7 +56,11 @@ constexpr RAFT_INLINE_FUNCTION auto abs(T x) !std::is_same_v, T> { - return x < T{0} ? -x : x; + if constexpr (std::is_unsigned_v) { + return x; + } else { + return x < T{0} ? -x : x; + } } #if defined(_RAFT_HAS_CUDA) template diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 617ac6d874..a9094d3959 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -33,33 +33,43 @@ namespace detail { // for column major layout template RAFT_KERNEL stridedSummationKernel( - Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op) + Type* out, const Type* data, int D, int N, Type init, MainLambda main_op) { // Thread reduction - Type thread_data = Type(init); - int colStart = blockIdx.x * blockDim.x + threadIdx.x; + Type thread_sum = Type(init); + Type thread_c = Type(0); + int colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { int rowStart = blockIdx.y * blockDim.y + threadIdx.y; int stride = blockDim.y * gridDim.y; for (int j = rowStart; j < N; j += stride) { int idx = colStart + j * D; - thread_data += main_op(data[idx], j); + + // KahanBabushkaNeumaierSum + const Type cur_value = main_op(data[idx], j); + const Type t = thread_sum + cur_value; + if (abs(thread_sum) >= abs(cur_value)) { + thread_c += (thread_sum - t) + cur_value; + } else { + thread_c += (cur_value - t) + thread_sum; + } + thread_sum = t; } } + // add correction term + thread_sum += thread_c; + // Block reduction - extern __shared__ char tmp[]; // One element per thread in block - Type* temp = (Type*)tmp; // Cast to desired type - int myidx = threadIdx.x + blockDim.x * threadIdx.y; - temp[myidx] = thread_data; + extern __shared__ char tmp[]; + auto* block_sum = (Type*)tmp; + if (threadIdx.x < blockDim.x) block_sum[threadIdx.x] = Type(0); + __syncthreads(); + raft::myAtomicAdd(block_sum + threadIdx.x, thread_sum); __syncthreads(); - for (int j = blockDim.y / 2; j > 0; j /= 2) { - if (threadIdx.y < j) temp[myidx] += temp[myidx + j * blockDim.x]; - __syncthreads(); - } // Grid reduction - if ((colStart < D) && (threadIdx.y == 0)) raft::myAtomicAdd(dots + colStart, temp[myidx]); + if (colStart < D && (threadIdx.y == 0)) raft::myAtomicAdd(out + colStart, block_sum[threadIdx.x]); } // Kernel to perform reductions along the strided dimension @@ -127,23 +137,35 @@ void stridedReduction(OutType* dots, /// for atomics in stridedKernel (redesign for this is already underway) if (!inplace) raft::linalg::unaryOp(dots, dots, D, 
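                          /* The summation path in this patch uses Kahan-Babushka-Neumaier
                             (KBN) compensated summation: every addition also tracks the
                             low-order bits it loses in a running correction term. A minimal
                             scalar sketch of the per-thread update rule used by
                             stridedSummationKernel above (illustrative only, not part of
                             the patch):

                               Type sum = Type(0), c = Type(0);
                               for (const Type v : values) {
                                 const Type t = sum + v;
                                 // recover the rounding error of (sum + v) from whichever
                                 // operand had the larger magnitude
                                 c += (abs(sum) >= abs(v)) ? ((sum - t) + v) : ((v - t) + sum);
                                 sum = t;
                               }
                               return sum + c;  // apply the correction once at the end
                          */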
raft::const_op(init), stream); - // Arbitrary numbers for now, probably need to tune - const dim3 thrds(32, 16); - IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); - elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; - const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), - raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); - const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; - ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) if constexpr (std::is_same::value && - std::is_same::value) + std::is_same::value) { + constexpr int TPB = 256; + constexpr int ColsPerBlk = 8; + constexpr dim3 Block(ColsPerBlk, TPB / ColsPerBlk); + constexpr int MinRowsPerThread = 16; + constexpr int MinRowsPerBlk = Block.y * MinRowsPerThread; + constexpr int MaxBlocksDimY = 8192; + + const dim3 grid(raft::ceildiv(D, (IdxType)ColsPerBlk), + raft::min((IdxType)MaxBlocksDimY, raft::ceildiv(N, (IdxType)MinRowsPerBlk))); + const size_t shmemSize = sizeof(OutType) * Block.x; + stridedSummationKernel - <<>>(dots, data, D, N, init, main_op); - else + <<>>(dots, data, D, N, init, main_op); + } else { + // Arbitrary numbers for now, probably need to tune + const dim3 thrds(32, 16); + IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); + elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; + const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), + raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); + const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; + stridedReductionKernel <<>>(dots, data, D, N, init, main_op, reduce_op); + } ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index 6c330acb26..76483cbfda 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,61 +26,23 @@ namespace raft { namespace stats { namespace detail { -///@todo: ColsPerBlk has been tested only for 32! -template -RAFT_KERNEL meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) - thread_data += (colId < D) ? 
data[i * D + colId] : Type(0); - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - thread_data += data[idx]; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; } -} - template void mean( Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); - meanKernelRowMajor<<>>(mu, data, D, N); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); - raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); - } else { - meanKernelColMajor<<>>(mu, data, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + Type ratio = Type(1) / ((sample && rowMajor) ? Type(N - 1) : Type(N)); + raft::linalg::reduce(mu, + data, + D, + N, + Type(0), + rowMajor, + false, + stream, + false, + raft::identity_op(), + raft::add_op(), + raft::mul_const_op(ratio)); } } // namespace detail diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index bc2644a233..4af4060016 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -16,7 +16,9 @@ #pragma once +#include #include +#include #include #include @@ -25,63 +27,6 @@ namespace raft { namespace stats { namespace detail { -///@todo: ColPerBlk has been tested only for 32! -template -RAFT_KERNEL stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - Type val = (colId < D) ? 
data[i * D + colId] : Type(0); - thread_data += val * val; - } - __shared__ Type sstd[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) sstd[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(sstd + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(std + colId, sstd[thisColId]); -} - -template -RAFT_KERNEL stddevKernelColMajor(Type* std, const Type* data, const Type* mu, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - Type diff = data[idx] - m; - thread_data += diff * diff; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { std[blockIdx.x] = raft::sqrt(acc / N); } -} - -template -RAFT_KERNEL varsKernelColMajor(Type* var, const Type* data, const Type* mu, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - Type diff = data[idx] - m; - thread_data += diff * diff; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; } -} - /** * @brief Compute stddev of the input matrix * @@ -110,26 +55,20 @@ void stddev(Type* std, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemset(std, 0, sizeof(Type) * D)); - stddevKernelRowMajor<<>>(std, data, D, N); - Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - std, - std, - mu, - D, - [ratio] __device__(Type a, Type b) { return raft::sqrt(a * ratio - b * b); }, - stream); - } else { - stddevKernelColMajor<<>>(std, data, mu, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::linalg::reduce( + std, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { + return a * a; + }); + Type ratio = Type(1) / ((sample && rowMajor) ? Type(N - 1) : Type(N)); + raft::linalg::binaryOp( + std, + std, + mu, + D, + raft::compose_op(raft::sqrt_op(), + raft::abs_op(), + [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }), + stream); } /** @@ -160,21 +99,19 @@ void vars(Type* var, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemset(var, 0, sizeof(Type) * D)); - stddevKernelRowMajor<<>>(var, data, D, N); - Type ratio = Type(1) / (sample ? 
Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); - } else { - varsKernelColMajor<<>>(var, data, mu, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::linalg::reduce( + var, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { + return a * a; + }); + Type ratio = Type(1) / ((sample && rowMajor) ? Type(N - 1) : Type(N)); + raft::linalg::binaryOp( + var, + var, + mu, + D, + raft::compose_op(raft::abs_op(), + [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }), + stream); } } // namespace detail diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh index 4f85536e6c..39bd2c3b6c 100644 --- a/cpp/include/raft/stats/detail/sum.cuh +++ b/cpp/include/raft/stats/detail/sum.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,106 +26,10 @@ namespace raft { namespace stats { namespace detail { -///@todo: ColsPerBlk has been tested only for 32! -template -RAFT_KERNEL sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_sum = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - thread_sum += (colId < D) ? data[i * D + colId] : Type(0); - } - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_sum); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL sumKahanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - constexpr int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_sum = Type(0); - Type thread_c = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - // KahanBabushkaNeumaierSum - const Type cur_value = (colId < D) ? 
data[i * D + colId] : Type(0); - const Type t = thread_sum + cur_value; - if (abs(thread_sum) >= abs(cur_value)) { - thread_c += (thread_sum - t) + cur_value; - } else { - thread_c += (cur_value - t) + thread_sum; - } - thread_sum = t; - } - thread_sum += thread_c; - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_sum); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL sumKahanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_sum = Type(0); - Type thread_c = Type(0); - IdxType colStart = N * blockIdx.x; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - // KahanBabushkaNeumaierSum - IdxType idx = colStart + i; - const Type cur_value = data[idx]; - const Type t = thread_sum + cur_value; - if (abs(thread_sum) >= abs(cur_value)) { - thread_c += (thread_sum - t) + cur_value; - } else { - thread_c += (cur_value - t) + thread_sum; - } - thread_sum = t; - } - thread_sum += thread_c; - Type acc = BlockReduce(temp_storage).Sum(thread_sum); - if (threadIdx.x == 0) { mu[blockIdx.x] = acc; } -} - template void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int ColsPerBlk = 8; - static const int MinRowsPerThread = 16; - static const int MinRowsPerBlk = (TPB / ColsPerBlk) * MinRowsPerThread; - static const int MaxBlocksDimX = 8192; - - const IdxType grid_y = raft::ceildiv(D, (IdxType)ColsPerBlk); - const IdxType grid_x = - raft::min((IdxType)MaxBlocksDimX, raft::ceildiv(N, (IdxType)MinRowsPerBlk)); - - dim3 grid(grid_x, grid_y); - RAFT_CUDA_TRY(cudaMemset(output, 0, sizeof(Type) * D)); - sumKahanKernelRowMajor - <<>>(output, input, D, N); - } else { - sumKahanKernelColMajor<<>>(output, input, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::linalg::reduce(output, input, D, N, Type(0), rowMajor, false, stream); } } // namespace detail diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu index 41812979b6..df5177056d 100644 --- a/cpp/test/stats/cov.cu +++ b/cpp/test/stats/cov.cu @@ -40,7 +40,8 @@ struct CovInputs { template ::std::ostream& operator<<(::std::ostream& os, const CovInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.sample << ", " << dims.rowMajor << "}" << std::endl; } template diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 61b57ce739..dbd8549049 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -35,12 +35,14 @@ struct MeanInputs { int rows, cols; bool sample, rowMajor; unsigned long long int seed; + T stddev = (T)1.0; }; template ::std::ostream& operator<<(::std::ostream& os, const MeanInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.sample << ", " << dims.rowMajor << ", " << dims.stddev << "}" << std::endl; } template @@ -61,7 +63,7 @@ class MeanTest : public ::testing::TestWithParam> { { raft::random::RngState r(params.seed); int len = rows * cols; - normal(handle, r, data.data(), len, params.mean, (T)1.0); + normal(handle, r, data.data(), len, params.mean, params.stddev); meanSGtest(data.data(), stream); } @@ -96,38 +98,72 @@ class MeanTest : 
public ::testing::TestWithParam> { // measured mean (of a normal distribution) will fall outside of an epsilon of // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times) const std::vector> inputsf = { - {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, - {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, {0.15f, -1.f, 1024, 256, false, true, 1234ULL}, - {0.15f, -1.f, 1030, 1, false, false, 1234ULL}, {0.15f, -1.f, 1030, 60, true, false, 1234ULL}, - {2.0f, -1.f, 31, 120, false, false, 1234ULL}, {2.0f, -1.f, 1, 130, true, false, 1234ULL}, - {0.15f, -1.f, 1030, 1, false, true, 1234ULL}, {0.15f, -1.f, 1030, 60, true, true, 1234ULL}, - {2.0f, -1.f, 31, 120, false, true, 1234ULL}, {2.0f, -1.f, 1, 130, false, true, 1234ULL}, - {2.0f, -1.f, 1, 1, false, false, 1234ULL}, {2.0f, -1.f, 1, 1, false, true, 1234ULL}, - {2.0f, -1.f, 7, 23, false, false, 1234ULL}, {2.0f, -1.f, 7, 23, false, true, 1234ULL}, - {2.0f, -1.f, 17, 5, false, false, 1234ULL}, {2.0f, -1.f, 17, 5, false, true, 1234ULL}}; + {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, + {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, true, 1234ULL}, + {0.15f, -1.f, 1030, 1, false, false, 1234ULL}, + {0.15f, -1.f, 1030, 60, true, false, 1234ULL}, + {2.0f, -1.f, 31, 120, false, false, 1234ULL}, + {2.0f, -1.f, 1, 130, true, false, 1234ULL}, + {0.15f, -1.f, 1030, 1, false, true, 1234ULL}, + {0.15f, -1.f, 1030, 60, true, true, 1234ULL}, + {2.0f, -1.f, 31, 120, false, true, 1234ULL}, + {2.0f, -1.f, 1, 130, false, true, 1234ULL}, + {2.0f, -1.f, 1, 1, false, false, 1234ULL}, + {2.0f, -1.f, 1, 1, false, true, 1234ULL}, + {2.0f, -1.f, 7, 23, false, false, 1234ULL}, + {2.0f, -1.f, 7, 23, false, true, 1234ULL}, + {2.0f, -1.f, 17, 5, false, false, 1234ULL}, + {2.0f, -1.f, 17, 5, false, true, 1234ULL}, + {0.0001f, 0.1f, 1 << 27, 2, false, false, 1234ULL, 0.0001f}, + {0.0001f, 0.1f, 1 << 27, 2, false, true, 1234ULL, 0.0001f}}; const std::vector> inputsd = { - {0.15, 1.0, 1024, 32, true, false, 1234ULL}, {0.15, 1.0, 1024, 64, true, false, 1234ULL}, - {0.15, 1.0, 1024, 128, true, false, 1234ULL}, {0.15, 1.0, 1024, 256, true, false, 1234ULL}, - {0.15, -1.0, 1024, 32, false, false, 1234ULL}, {0.15, -1.0, 1024, 64, false, false, 1234ULL}, - {0.15, -1.0, 1024, 128, false, false, 1234ULL}, {0.15, 
-1.0, 1024, 256, false, false, 1234ULL}, - {0.15, 1.0, 1024, 32, true, true, 1234ULL}, {0.15, 1.0, 1024, 64, true, true, 1234ULL}, - {0.15, 1.0, 1024, 128, true, true, 1234ULL}, {0.15, 1.0, 1024, 256, true, true, 1234ULL}, - {0.15, -1.0, 1024, 32, false, true, 1234ULL}, {0.15, -1.0, 1024, 64, false, true, 1234ULL}, - {0.15, -1.0, 1024, 128, false, true, 1234ULL}, {0.15, -1.0, 1024, 256, false, true, 1234ULL}, - {0.15, -1.0, 1030, 1, false, false, 1234ULL}, {0.15, -1.0, 1030, 60, true, false, 1234ULL}, - {2.0, -1.0, 31, 120, false, false, 1234ULL}, {2.0, -1.0, 1, 130, true, false, 1234ULL}, - {0.15, -1.0, 1030, 1, false, true, 1234ULL}, {0.15, -1.0, 1030, 60, true, true, 1234ULL}, - {2.0, -1.0, 31, 120, false, true, 1234ULL}, {2.0, -1.0, 1, 130, false, true, 1234ULL}, - {2.0, -1.0, 1, 1, false, false, 1234ULL}, {2.0, -1.0, 1, 1, false, true, 1234ULL}, - {2.0, -1.0, 7, 23, false, false, 1234ULL}, {2.0, -1.0, 7, 23, false, true, 1234ULL}, - {2.0, -1.0, 17, 5, false, false, 1234ULL}, {2.0, -1.0, 17, 5, false, true, 1234ULL}}; + {0.15, 1.0, 1024, 32, true, false, 1234ULL}, + {0.15, 1.0, 1024, 64, true, false, 1234ULL}, + {0.15, 1.0, 1024, 128, true, false, 1234ULL}, + {0.15, 1.0, 1024, 256, true, false, 1234ULL}, + {0.15, -1.0, 1024, 32, false, false, 1234ULL}, + {0.15, -1.0, 1024, 64, false, false, 1234ULL}, + {0.15, -1.0, 1024, 128, false, false, 1234ULL}, + {0.15, -1.0, 1024, 256, false, false, 1234ULL}, + {0.15, 1.0, 1024, 32, true, true, 1234ULL}, + {0.15, 1.0, 1024, 64, true, true, 1234ULL}, + {0.15, 1.0, 1024, 128, true, true, 1234ULL}, + {0.15, 1.0, 1024, 256, true, true, 1234ULL}, + {0.15, -1.0, 1024, 32, false, true, 1234ULL}, + {0.15, -1.0, 1024, 64, false, true, 1234ULL}, + {0.15, -1.0, 1024, 128, false, true, 1234ULL}, + {0.15, -1.0, 1024, 256, false, true, 1234ULL}, + {0.15, -1.0, 1030, 1, false, false, 1234ULL}, + {0.15, -1.0, 1030, 60, true, false, 1234ULL}, + {2.0, -1.0, 31, 120, false, false, 1234ULL}, + {2.0, -1.0, 1, 130, true, false, 1234ULL}, + {0.15, -1.0, 1030, 1, false, true, 1234ULL}, + {0.15, -1.0, 1030, 60, true, true, 1234ULL}, + {2.0, -1.0, 31, 120, false, true, 1234ULL}, + {2.0, -1.0, 1, 130, false, true, 1234ULL}, + {2.0, -1.0, 1, 1, false, false, 1234ULL}, + {2.0, -1.0, 1, 1, false, true, 1234ULL}, + {2.0, -1.0, 7, 23, false, false, 1234ULL}, + {2.0, -1.0, 7, 23, false, true, 1234ULL}, + {2.0, -1.0, 17, 5, false, false, 1234ULL}, + {2.0, -1.0, 17, 5, false, true, 1234ULL}, + {1e-8, 1e-1, 1 << 27, 2, false, false, 1234ULL, 0.0001}, + {1e-8, 1e-1, 1 << 27, 2, false, true, 1234ULL, 0.0001}}; typedef MeanTest MeanTestF; TEST_P(MeanTestF, Result) diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 641621c1c6..1459c8fe14 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -39,7 +39,8 @@ struct StdDevInputs { template ::std::ostream& operator<<(::std::ostream& os, const StdDevInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.sample << ", " << dims.rowMajor << "}" << std::endl; } template @@ -153,7 +154,9 @@ const std::vector> inputsf = { {1.f, -1.f, 2.f, 17, 5, false, false, 1234ULL}, {1.f, -1.f, 2.f, 1, 1, false, true, 1234ULL}, {1.f, -1.f, 2.f, 7, 23, false, true, 1234ULL}, - {1.f, -1.f, 2.f, 17, 5, false, true, 1234ULL}}; + {1.f, -1.f, 2.f, 17, 5, false, true, 1234ULL}, + {0.00001f, 0.001f, 0.f, 1 << 27, 2, false, true, 1234ULL}, + {0.00001f, 0.001f, 0.f, 1 << 27, 2, false, false, 1234ULL}}; const std::vector> inputsd = { {0.1, 1.0, 2.0, 1024, 32, 
true, false, 1234ULL}, @@ -183,7 +186,9 @@ const std::vector> inputsd = { {1.0, -1.0, 2.0, 17, 5, false, false, 1234ULL}, {1.0, -1.0, 2.0, 1, 1, false, true, 1234ULL}, {1.0, -1.0, 2.0, 7, 23, false, true, 1234ULL}, - {1.0, -1.0, 2.0, 17, 5, false, true, 1234ULL}}; + {1.0, -1.0, 2.0, 17, 5, false, true, 1234ULL}, + {1e-7, 0.001, 0.0, 1 << 27, 2, false, true, 1234ULL}, + {1e-7, 0.001, 0.0, 1 << 27, 2, false, false, 1234ULL}}; typedef StdDevTest StdDevTestF; TEST_P(StdDevTestF, Result) diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index bf2aa44a2c..14f30dfa5b 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -40,7 +40,8 @@ struct SumInputs { template ::std::ostream& operator<<(::std::ostream& os, const SumInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.rowMajor << "}" << std::endl; } template @@ -96,43 +97,29 @@ class SumTest : public ::testing::TestWithParam> { rmm::device_uvector data, sum_act; }; -const std::vector> inputsf = {{0.0001f, 4, 5, true, 1}, - {0.0001f, 1024, 32, true, 1}, - {0.0001f, 1024, 256, true, 1}, - {0.0001f, 100000000, 1, true, 0.001}, - {0.0001f, 1, 30, true, 0.001}, - {0.0001f, 1, 1, true, 0.001}, - {0.0001f, 17, 5, true, 0.001}, - {0.0001f, 7, 23, true, 0.001}, - {0.0001f, 3, 97, true, 0.001}, - {0.0001f, 4, 5, false, 1}, - {0.0001f, 1024, 32, false, 1}, - {0.0001f, 1024, 256, false, 1}, - {0.0001f, 100000000, 1, false, 0.001}, - {0.0001f, 1, 30, false, 0.001}, - {0.0001f, 1, 1, false, 0.001}, - {0.0001f, 17, 5, false, 0.001}, - {0.0001f, 7, 23, false, 0.001}, - {0.0001f, 3, 97, false, 0.001}}; - -const std::vector> inputsd = {{0.000001, 1024, 32, true, 1}, - {0.000001, 1024, 256, true, 1}, - {0.000001, 1024, 256, true, 1}, - {0.000001, 100000000, 1, true, 0.001}, - {0.000001, 1, 30, true, 0.0001}, - {0.000001, 1, 1, true, 0.0001}, - {0.000001, 17, 5, true, 0.0001}, - {0.000001, 7, 23, true, 0.0001}, - {0.000001, 3, 97, true, 0.0001}, - {0.000001, 1024, 32, false, 1}, - {0.000001, 1024, 256, false, 1}, - {0.000001, 1024, 256, false, 1}, - {0.000001, 100000000, 1, false, 0.001}, - {0.000001, 1, 30, false, 0.0001}, - {0.000001, 1, 1, false, 0.0001}, - {0.000001, 17, 5, false, 0.0001}, - {0.000001, 7, 23, false, 0.0001}, - {0.000001, 3, 97, false, 0.0001}}; +const std::vector> inputsf = { + {0.0001f, 4, 5, true, 1}, {0.0001f, 1024, 32, true, 1}, + {0.0001f, 1024, 256, true, 1}, {0.0001f, 100000000, 1, true, 0.001}, + {0.0001f, 1 << 27, 2, true, 0.1}, {0.0001f, 1, 30, true, 0.001}, + {0.0001f, 1, 1, true, 0.001}, {0.0001f, 17, 5, true, 0.001}, + {0.0001f, 7, 23, true, 0.001}, {0.0001f, 3, 97, true, 0.001}, + {0.0001f, 4, 5, false, 1}, {0.0001f, 1024, 32, false, 1}, + {0.0001f, 1024, 256, false, 1}, {0.0001f, 100000000, 1, false, 0.001}, + {0.0001f, 1 << 27, 2, false, 0.1}, {0.0001f, 1, 30, false, 0.001}, + {0.0001f, 1, 1, false, 0.001}, {0.0001f, 17, 5, false, 0.001}, + {0.0001f, 7, 23, false, 0.001}, {0.0001f, 3, 97, false, 0.001}}; + +const std::vector> inputsd = { + {0.000001, 1024, 32, true, 1}, {0.000001, 1024, 256, true, 1}, + {0.000001, 1024, 256, true, 1}, {0.000001, 100000000, 1, true, 0.001}, + {1e-10, 1 << 27, 2, true, 0.1}, {0.000001, 1, 30, true, 0.0001}, + {0.000001, 1, 1, true, 0.0001}, {0.000001, 17, 5, true, 0.0001}, + {0.000001, 7, 23, true, 0.0001}, {0.000001, 3, 97, true, 0.0001}, + {0.000001, 1024, 32, false, 1}, {0.000001, 1024, 256, false, 1}, + {0.000001, 1024, 256, false, 1}, {0.000001, 100000000, 1, false, 0.001}, + {1e-10, 1 << 27, 2, 
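+                                              /* these 1 << 27 x 2 cases target the
+                                                 compensated path: naively accumulating
+                                                 ~1.3e8 values of 0.1 in sequence loses
+                                                 roughly n * eps / 2 ~ 1e-8 relative
+                                                 accuracy in double, far beyond the
+                                                 1e-10 tolerance used here */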
false, 0.1}, {0.000001, 1, 30, false, 0.0001}, + {0.000001, 1, 1, false, 0.0001}, {0.000001, 17, 5, false, 0.0001}, + {0.000001, 7, 23, false, 0.0001}, {0.000001, 3, 97, false, 0.0001}}; typedef SumTest SumTestF; typedef SumTest SumTestD; From 11afafc4130b04fe281ce292f4ce263d4d1b62e4 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Tue, 9 Jul 2024 22:14:45 +0000 Subject: [PATCH 02/10] add Kahan sum for coalesced kernels --- .../linalg/detail/coalesced_reduction-inl.cuh | 197 +++++++++++++++++- cpp/test/stats/sum.cu | 35 +++- 2 files changed, 219 insertions(+), 13 deletions(-) diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index 9f3be7ce0e..6485dff8a7 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -42,6 +42,18 @@ struct ReductionThinPolicy { static constexpr bool NoSequentialReduce = noLoop; }; +template +DI void KahanBabushkaNeumaierSum(Type& sum, Type& compensation, const Type& cur_value) +{ + const Type t = sum + cur_value; + if (abs(sum) >= abs(cur_value)) { + compensation += (sum - t) + cur_value; + } else { + compensation += (cur_value - t) + sum; + } + sum = t; +} + template +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedSumThinKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + FinalLambda final_op, + bool inplace = false) +{ + /* The strategy to achieve near-SOL memory bandwidth differs based on D: + * - For small D, we need to process multiple rows per logical warp in order to have + * multiple loads per thread and increase bytes in flight and amortize latencies. + * - For large D, we start with a sequential reduction. The compiler partially unrolls + * that loop (e.g. first a loop of stride 16, then 8, 4, and 1). + */ + IdxType i0 = threadIdx.y + (Policy::RowsPerBlock * static_cast(blockIdx.x)); + if (i0 >= N) return; + + OutType acc[Policy::RowsPerLogicalWarp]; + OutType thread_c[Policy::RowsPerLogicalWarp]; + +#pragma unroll + for (int k = 0; k < Policy::RowsPerLogicalWarp; k++) { + acc[k] = init; + thread_c[k] = 0; + } + + if constexpr (Policy::NoSequentialReduce) { + IdxType j = threadIdx.x; + if (j < D) { +#pragma unroll + for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) { + // Only the first row is known to be within bounds. Clamp to avoid out-of-mem read. + const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1); + // acc[k] = reduce_op(acc[k], main_op(data[j + (D * i)], j)); + KahanBabushkaNeumaierSum(acc[k], thread_c[k], main_op(data[j + (D * i)], j)); + } + } + } else { + for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { +#pragma unroll + for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) { + const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1); + // acc[k] = reduce_op(acc[k], main_op(data[j + (D * i)], j)); + KahanBabushkaNeumaierSum(acc[k], thread_c[k], main_op(data[j + (D * i)], j)); + } + } + } + + /* This vector reduction has two benefits compared to naive separate reductions: + * - It avoids the LSU bottleneck when the number of columns is around 32 (e.g. for 32, 5 shuffles + * are required and there is no initial sequential reduction to amortize that cost). + * - It distributes the outputs to multiple threads, enabling a coalesced store when the number of + * rows per logical warp and logical warp size are equal. 
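+   *
+   * The sums in acc and the KBN compensation terms in thread_c are reduced as
+   * two separate vectors with a plain add; as noted in the reduce.cuh docs,
+   * the compensation is exact per thread only, not across the reduction tree.
+   * Each output then folds its accumulated correction back in exactly once
+   * via acc[k] + thread_c[k].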
+ */ + raft::logicalWarpReduceVector( + acc, threadIdx.x, raft::add_op()); + + raft::logicalWarpReduceVector( + thread_c, threadIdx.x, raft::add_op()); + + constexpr int reducOutVecWidth = + std::max(1, Policy::RowsPerLogicalWarp / Policy::LogicalWarpSize); + constexpr int reducOutGroupSize = + std::max(1, Policy::LogicalWarpSize / Policy::RowsPerLogicalWarp); + constexpr int reducNumGroups = Policy::LogicalWarpSize / reducOutGroupSize; + + if (threadIdx.x % reducOutGroupSize == 0) { + const int groupId = threadIdx.x / reducOutGroupSize; + if (inplace) { +#pragma unroll + for (int k = 0; k < reducOutVecWidth; k++) { + const int reductionId = k * reducNumGroups + groupId; + const IdxType i = i0 + reductionId * Policy::NumLogicalWarps; + if (i < N) { dots[i] = final_op(dots[i] + acc[k] + thread_c[k]); } + } + } else { +#pragma unroll + for (int k = 0; k < reducOutVecWidth; k++) { + const int reductionId = k * reducNumGroups + groupId; + const IdxType i = i0 + reductionId * Policy::NumLogicalWarps; + if (i < N) { dots[i] = final_op(acc[k] + thread_c[k]); } + } + } + } +} + template (Policy::NoSequentialReduce)); dim3 threads(Policy::LogicalWarpSize, Policy::NumLogicalWarps, 1); dim3 blocks(ceildiv(N, Policy::RowsPerBlock), 1, 1); - coalescedReductionThinKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + if constexpr (std::is_same_v) { + coalescedSumThinKernel + <<>>(dots, data, D, N, init, main_op, final_op, inplace); + } else { + coalescedReductionThinKernel<<>>( + dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -240,6 +350,44 @@ RAFT_KERNEL __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, } } +template +RAFT_KERNEL __launch_bounds__(TPB) coalescedSumMediumKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + FinalLambda final_op, + bool inplace = false) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + __shared__ typename BlockReduce::TempStorage temp_storage2; + OutType thread_data = init; + OutType thread_c = (OutType)0; + + IdxType rowStart = blockIdx.x * D; + for (IdxType i = threadIdx.x; i < D; i += TPB) { + IdxType idx = rowStart + i; + KahanBabushkaNeumaierSum(thread_data, thread_c, main_op(data[idx], i)); + } + OutType block_acc = BlockReduce(temp_storage1).Sum(thread_data); + OutType block_c = BlockReduce(temp_storage2).Sum(thread_c); + + if (threadIdx.x == 0) { + if (inplace) { + dots[blockIdx.x] = final_op(dots[blockIdx.x] + block_acc + block_c); + } else { + dots[blockIdx.x] = final_op(block_acc + block_c); + } + } +} + template fun_scope("coalescedReductionMedium<%d>", TPB); - coalescedReductionMediumKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + if constexpr (std::is_same_v) { + coalescedSumMediumKernel + <<>>(dots, data, D, N, init, main_op, final_op, inplace); + } else { + coalescedReductionMediumKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -322,6 +475,32 @@ RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) if (threadIdx.x == 0) { buffer[Policy::BlocksPerRow * blockIdx.x + blockIdx.y] = acc; } } +template +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedSumThickKernel( + OutType* buffer, const InType* data, IdxType D, IdxType N, OutType init, MainLambda main_op) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ 
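+  /* two separate TempStorage instances let the back-to-back BlockReduce calls
+     on thread_data and thread_c run without the __syncthreads() that CUB
+     requires before temporary storage may be reused */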
typename BlockReduce::TempStorage temp_storage1; + __shared__ typename BlockReduce::TempStorage temp_storage2; + + OutType thread_data = init; + OutType thread_c = (OutType)0; + + IdxType rowStart = blockIdx.x * D; + for (IdxType i = blockIdx.y * Policy::ThreadsPerBlock + threadIdx.x; i < D; + i += Policy::BlockStride) { + IdxType idx = rowStart + i; + KahanBabushkaNeumaierSum(thread_data, thread_c, main_op(data[idx], i)); + } + + OutType block_acc = BlockReduce(temp_storage1).Sum(thread_data); + OutType block_c = BlockReduce(temp_storage2).Sum(thread_c); + + if (threadIdx.x == 0) { + buffer[Policy::BlocksPerRow * blockIdx.x + blockIdx.y] = block_acc + block_c; + } +} + template - <<>>(buffer.data(), data, D, N, init, main_op, reduce_op); + if constexpr (std::is_same_v) { + coalescedSumThickKernel + <<>>(buffer.data(), data, D, N, init, main_op); + } else { + coalescedReductionThickKernel + <<>>(buffer.data(), data, D, N, init, main_op, reduce_op); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); coalescedReductionThin(dots, diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index 14f30dfa5b..fbb398cc5b 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -41,7 +41,7 @@ template ::std::ostream& operator<<(::std::ostream& os, const SumInputs& dims) { return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " - << dims.rowMajor << "}" << std::endl; + << dims.rowMajor << ", " << dims.value << "}" << std::endl; } template @@ -58,13 +58,31 @@ class SumTest : public ::testing::TestWithParam> { } protected: - void runTest() + void runTest(bool checkErrorCompensation = false) { int len = rows * cols; + double large_factor = 1e7; + + if constexpr (std::is_same_v) large_factor = 1e12; + std::vector data_h(len); for (int i = 0; i < len; i++) { - data_h[i] = T(params.value); + data_h[i] = double(params.value); + int row = params.rowMajor ? i / cols : i % rows; + + // every 3 elements (in a column) contain 2 large dummy elements + // (one of them negative) and one element with 3x compensating + // for the 2 missing elements + if (checkErrorCompensation && row % 3 == 2) { + data_h[i] = double(params.value) * large_factor; + // compensate with opposite error 3 rows up + int idx2 = params.rowMajor ? (i - cols) : (i - 1); + data_h[idx2] = -1 * double(params.value) * large_factor; + // compensate 2 missing values + int idx3 = params.rowMajor ? (i - 2 * cols) : (i - 2); + data_h[idx3] = 3.0 * double(params.value); + } } raft::update_device(data.data(), data_h.data(), len, stream); @@ -84,8 +102,10 @@ class SumTest : public ::testing::TestWithParam> { double expected = double(params.rows) * params.value; + double tolerance = checkErrorCompensation ? 
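+    /* the +/- large_factor pairs injected above cancel exactly in real
+       arithmetic but leave a rounding residue proportional to large_factor in
+       floating point, hence the 100x wider tolerance for the Accuracy runs */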
100 * params.tolerance : params.tolerance;
+
     ASSERT_TRUE(raft::devArrMatch(
-      T(expected), sum_act.data(), params.cols, raft::CompareApprox<T>(params.tolerance)));
+      T(expected), sum_act.data(), params.cols, raft::CompareApprox<T>(tolerance)));
   }
 
  protected:
@@ -112,12 +132,12 @@ const std::vector<SumInputs<float>> inputsf = {
 const std::vector<SumInputs<double>> inputsd = {
   {0.000001, 1024, 32, true, 1},      {0.000001, 1024, 256, true, 1},
   {0.000001, 1024, 256, true, 1},     {0.000001, 100000000, 1, true, 0.001},
-  {1e-10, 1 << 27, 2, true, 0.1},     {0.000001, 1, 30, true, 0.0001},
+  {1e-9, 1 << 27, 2, true, 0.1},      {0.000001, 1, 30, true, 0.0001},
   {0.000001, 1, 1, true, 0.0001},     {0.000001, 17, 5, true, 0.0001},
   {0.000001, 7, 23, true, 0.0001},    {0.000001, 3, 97, true, 0.0001},
   {0.000001, 1024, 32, false, 1},     {0.000001, 1024, 256, false, 1},
   {0.000001, 1024, 256, false, 1},    {0.000001, 100000000, 1, false, 0.001},
-  {1e-10, 1 << 27, 2, false, 0.1},    {0.000001, 1, 30, false, 0.0001},
+  {1e-9, 1 << 27, 2, false, 0.1},     {0.000001, 1, 30, false, 0.0001},
   {0.000001, 1, 1, false, 0.0001},    {0.000001, 17, 5, false, 0.0001},
   {0.000001, 7, 23, false, 0.0001},   {0.000001, 3, 97, false, 0.0001}};
 
@@ -127,6 +147,9 @@ typedef SumTest<double> SumTestD;
 TEST_P(SumTestF, Result) { runTest(); }
 TEST_P(SumTestD, Result) { runTest(); }
 
+TEST_P(SumTestF, Accuracy) { runTest(true); }
+TEST_P(SumTestD, Accuracy) { runTest(true); }
+
 INSTANTIATE_TEST_CASE_P(SumTests, SumTestF, ::testing::ValuesIn(inputsf));
 INSTANTIATE_TEST_CASE_P(SumTests, SumTestD, ::testing::ValuesIn(inputsd));

From c2f5d8f42f463c43f99e9b81ca7ba7ae003239bf Mon Sep 17 00:00:00 2001
From: Malte Foerster
Date: Tue, 9 Jul 2024 22:20:54 +0000
Subject: [PATCH 03/10] improve Kahan sum for strided kernels, also compensate
 on block

---
 .../raft/linalg/detail/strided_reduction.cuh | 25 +++++++++++++------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh
index a9094d3959..1a0bc4b32f 100644
--- a/cpp/include/raft/linalg/detail/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh
@@ -57,19 +57,30 @@ RAFT_KERNEL stridedSummationKernel(
     }
   }
 
-  // add correction term
-  thread_sum += thread_c;
-
   // Block reduction
   extern __shared__ char tmp[];
   auto* block_sum = (Type*)tmp;
-  if (threadIdx.x < blockDim.x) block_sum[threadIdx.x] = Type(0);
+  auto* block_c   = block_sum + blockDim.x;
+
+  if (threadIdx.y == 0) {
+    block_sum[threadIdx.x] = Type(0);
+    block_c[threadIdx.x]   = Type(0);
+  }
   __syncthreads();
-  raft::myAtomicAdd(block_sum + threadIdx.x, thread_sum);
+  // also compute compensation for block-sum
+  const Type old_sum = atomicAdd(block_sum + threadIdx.x, thread_sum);
+  const Type t       = old_sum + thread_sum;
+  if (abs(old_sum) >= abs(thread_sum)) {
+    thread_c += (old_sum - t) + thread_sum;
+  } else {
+    thread_c += (thread_sum - t) + old_sum;
+  }
+  raft::myAtomicAdd(block_c + threadIdx.x, thread_c);
   __syncthreads();
 
   // Grid reduction
-  if (colStart < D && (threadIdx.y == 0)) raft::myAtomicAdd(out + colStart, block_sum[threadIdx.x]);
+  if (colStart < D && (threadIdx.y == 0))
+    raft::myAtomicAdd(out + colStart, block_sum[threadIdx.x] + block_c[threadIdx.x]);
@@ -150,7 +161,7 @@ void stridedReduction(OutType* dots,
 
   const dim3 grid(raft::ceildiv(D, (IdxType)ColsPerBlk),
                   raft::min((IdxType)MaxBlocksDimY, raft::ceildiv(N, (IdxType)MinRowsPerBlk)));
-  const size_t shmemSize = sizeof(OutType) * Block.x;
+  const size_t shmemSize =
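+    /* doubled: the dynamic shared buffer now holds block_sum[0..Block.x)
+       followed by the per-column compensation terms block_c[0..Block.x) */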
sizeof(OutType) * Block.x * 2; stridedSummationKernel <<>>(dots, data, D, N, init, main_op); From e2ff5ac4633753c854dcdaccc8643dbffc079aff Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 15 Jul 2024 21:33:37 +0200 Subject: [PATCH 04/10] adjust heuristic for coalesced reduction kernel choice --- .../linalg/detail/coalesced_reduction-inl.cuh | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index 6485dff8a7..feef71818e 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -574,13 +574,8 @@ void coalescedReductionThickDispatcher(OutType* dots, { // Note: multiple elements per thread to take advantage of the sequential reduction and loop // unrolling - if (D < IdxType(32768)) { - coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else { - coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } + coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } // Primitive to perform reductions along the coalesced dimension of the matrix, i.e. reduce along @@ -605,14 +600,17 @@ void coalescedReduction(OutType* dots, { /* The primitive selects one of three implementations based on heuristics: * - Thin: very efficient when D is small and/or N is large + * At most one warp is processing each row * - Thick: used when N is very small and D very large - * - Medium: used when N is too small to fill the GPU with the thin kernel + * Multiple blocks (32/64) processing each row + * - Medium: everything in between + * One block is processing each row */ const IdxType numSMs = raft::getMultiProcessorCount(); - if (D <= IdxType(256) || N >= IdxType(4) * numSMs) { + if (D <= IdxType(512) || (N >= IdxType(16) * numSMs && D < IdxType(2048))) { coalescedReductionThinDispatcher( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (N < numSMs && D >= IdxType(16384)) { + } else if (N < numSMs && D >= IdxType(1 << 17)) { coalescedReductionThickDispatcher( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else { From 6ad9a780b847daeb6d5b6e4e3bad8ec4f0b42a9a Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 18 Jul 2024 09:34:16 +0000 Subject: [PATCH 05/10] add comments --- cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh | 3 +++ cpp/include/raft/linalg/detail/strided_reduction.cuh | 6 +++++- cpp/include/raft/linalg/reduce.cuh | 4 +++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index feef71818e..9680cbc636 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -581,6 +581,9 @@ void coalescedReductionThickDispatcher(OutType* dots, // Primitive to perform reductions along the coalesced dimension of the matrix, i.e. reduce along // rows for row major or reduce along columns for column major layout. Can do an inplace reduction // adding to original values of dots if requested. 
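+// A representative call (illustrative sketch, not part of this patch): row-wise
+// L2 norms of a row-major N x D matrix. Since reduce_op is raft::add_op, this
+// takes the compensated summation kernels added in this series:
+//
+//   coalescedReduction(norms, data, D, N, OutType(0), stream,
+//                      false /* inplace */, raft::sq_op(), raft::add_op(),
+//                      raft::sqrt_op());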
+// In case of an add-reduction, a compensated summation will be performed in order to reduce +// numerical error. Note that the compensation will only be performed 'per-thread' for performance +// reasons and therefore not be equivalent to a sequential compensation. template RAFT_KERNEL stridedSummationKernel( Type* out, const Type* data, int D, int N, Type init, MainLambda main_op) diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index a4523c6926..bbf0a52e04 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -92,7 +92,9 @@ void reduce(OutType* dots, * is either row-major or column-major, while allowing the choose the * dimension for reduction. Depending upon the dimension chosen for * reduction, the memory accesses may be coalesced or strided. - * + * In case of an add-reduction, a compensated summation will be performed + * in order to reduce numerical error. Note that the compensation will not + * be equivalent to a sequential compensation to preserve parallel efficiency. * @tparam InElementType the input data-type of underlying raft::matrix_view * @tparam LayoutPolicy The layout of Input/Output (row or col major) * @tparam OutElementType the output data-type of underlying raft::matrix_view and reduction From 6f0b012d95c916e30d2823fb6b0aeb3478555b96 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 18 Jul 2024 09:39:22 +0000 Subject: [PATCH 06/10] add comment for second API --- cpp/include/raft/linalg/reduce.cuh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index bbf0a52e04..8fd6e45d37 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -31,6 +31,9 @@ namespace linalg { /** * @brief Compute reduction of the input matrix along the requested dimension + * In case of an add-reduction, a compensated summation will be performed + * in order to reduce numerical error. Note that the compensation will not + * be equivalent to a sequential compensation to preserve parallel efficiency. * * @tparam InType the data type of the input * @tparam OutType the data type of the output (as well as the data type for From 5b9837b9e8841a79cf5851d1fccac577fb116f4f Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 18 Jul 2024 11:23:13 +0000 Subject: [PATCH 07/10] enable sample parameter for column based --- cpp/include/raft/stats/detail/mean.cuh | 2 +- cpp/include/raft/stats/detail/stddev.cuh | 4 ++-- cpp/test/stats/mean.cu | 4 ++-- cpp/test/stats/stddev.cu | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index 76483cbfda..ee39c87a68 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -30,7 +30,7 @@ template void mean( Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) { - Type ratio = Type(1) / ((sample && rowMajor) ? Type(N - 1) : Type(N)); + Type ratio = Type(1) / ((sample) ? 
Type(N - 1) : Type(N)); raft::linalg::reduce(mu, data, D, diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index 4af4060016..df70ace617 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -59,7 +59,7 @@ void stddev(Type* std, std, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { return a * a; }); - Type ratio = Type(1) / ((sample && rowMajor) ? Type(N - 1) : Type(N)); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( std, std, @@ -103,7 +103,7 @@ void vars(Type* var, var, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { return a * a; }); - Type ratio = Type(1) / ((sample && rowMajor) ? Type(N - 1) : Type(N)); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( var, var, diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index dbd8549049..c5fe83d95b 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -117,7 +117,7 @@ const std::vector> inputsf = { {0.15f, -1.f, 1030, 1, false, false, 1234ULL}, {0.15f, -1.f, 1030, 60, true, false, 1234ULL}, {2.0f, -1.f, 31, 120, false, false, 1234ULL}, - {2.0f, -1.f, 1, 130, true, false, 1234ULL}, + {2.0f, -1.f, 1, 130, false, false, 1234ULL}, {0.15f, -1.f, 1030, 1, false, true, 1234ULL}, {0.15f, -1.f, 1030, 60, true, true, 1234ULL}, {2.0f, -1.f, 31, 120, false, true, 1234ULL}, @@ -151,7 +151,7 @@ const std::vector> inputsd = { {0.15, -1.0, 1030, 1, false, false, 1234ULL}, {0.15, -1.0, 1030, 60, true, false, 1234ULL}, {2.0, -1.0, 31, 120, false, false, 1234ULL}, - {2.0, -1.0, 1, 130, true, false, 1234ULL}, + {2.0, -1.0, 1, 130, false, false, 1234ULL}, {0.15, -1.0, 1030, 1, false, true, 1234ULL}, {0.15, -1.0, 1030, 60, true, true, 1234ULL}, {2.0, -1.0, 31, 120, false, true, 1234ULL}, diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 1459c8fe14..f0fae195e1 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -148,7 +148,7 @@ const std::vector> inputsf = { {0.5f, -1.f, 2.f, 31, 1, true, true, 1234ULL}, {1.f, -1.f, 2.f, 1, 257, false, true, 1234ULL}, {0.5f, -1.f, 2.f, 31, 1, false, false, 1234ULL}, - {1.f, -1.f, 2.f, 1, 257, true, false, 1234ULL}, + {1.f, -1.f, 2.f, 1, 257, false, false, 1234ULL}, {1.f, -1.f, 2.f, 1, 1, false, false, 1234ULL}, {1.f, -1.f, 2.f, 7, 23, false, false, 1234ULL}, {1.f, -1.f, 2.f, 17, 5, false, false, 1234ULL}, @@ -180,7 +180,7 @@ const std::vector> inputsd = { {0.5, -1.0, 2.0, 31, 1, true, true, 1234ULL}, {1.0, -1.0, 2.0, 1, 257, false, true, 1234ULL}, {0.5, -1.0, 2.0, 31, 1, false, false, 1234ULL}, - {1.0, -1.0, 2.0, 1, 257, true, false, 1234ULL}, + {1.0, -1.0, 2.0, 1, 257, false, false, 1234ULL}, {1.0, -1.0, 2.0, 1, 1, false, false, 1234ULL}, {1.0, -1.0, 2.0, 7, 23, false, false, 1234ULL}, {1.0, -1.0, 2.0, 17, 5, false, false, 1234ULL}, From 0461fd3de1efc65c81b5beeb52e58feeddce0caa Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 18 Jul 2024 15:42:21 +0000 Subject: [PATCH 08/10] fix covariance should always use mean with sample=false --- cpp/test/stats/cov.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu index df5177056d..602f356b9f 100644 --- a/cpp/test/stats/cov.cu +++ b/cpp/test/stats/cov.cu @@ -72,8 +72,7 @@ class CovTest : public ::testing::TestWithParam> { cov_act.resize(cols * cols, stream); normal(handle, r, data.data(), len, 
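    /* cov() applies its own normalization (1/(N-1) when sample, else 1/N), so
       the mean handed to it must always be the plain arithmetic mean -- hence
       the hard-coded `false` in the calls below */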
params.mean, var); - raft::stats::mean( - mean_act.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream); + raft::stats::mean(mean_act.data(), data.data(), cols, rows, false, params.rowMajor, stream); if (params.rowMajor) { using layout = raft::row_major; cov(handle, @@ -103,7 +102,7 @@ class CovTest : public ::testing::TestWithParam> { raft::update_device(data_cm.data(), data_h, 6, stream); raft::update_device(cov_cm_ref.data(), cov_cm_ref_h, 4, stream); - raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, true, false, stream); + raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, false, false, stream); cov(handle, cov_cm.data(), data_cm.data(), mean_cm.data(), 2, 3, true, false, true, stream); } From f2a76be246f15e75ece0ae74e1e805c7d41d4a77 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 24 Jul 2024 11:19:25 +0000 Subject: [PATCH 09/10] fix stddev and var for sample=true --- cpp/include/raft/stats/detail/stddev.cuh | 42 +++++++++++++----------- cpp/test/stats/stddev.cu | 4 +-- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index df70ace617..4c861b49fb 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -59,16 +59,18 @@ void stddev(Type* std, std, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { return a * a; }); - Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - std, - std, - mu, - D, - raft::compose_op(raft::sqrt_op(), - raft::abs_op(), - [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }), - stream); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); + Type ratio_mean = sample ? ratio * Type(N) : Type(1); + raft::linalg::binaryOp(std, + std, + mu, + D, + raft::compose_op(raft::sqrt_op(), + raft::abs_op(), + [ratio, ratio_mean] __device__(Type a, Type b) { + return a * ratio - b * b * ratio_mean; + }), + stream); } /** @@ -103,15 +105,17 @@ void vars(Type* var, var, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { return a * a; }); - Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - var, - var, - mu, - D, - raft::compose_op(raft::abs_op(), - [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }), - stream); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); + Type ratio_mean = sample ? 
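+  /* derivation (with mu = sum(x) / N): the unbiased sample estimator is
+       var = (sum(x^2) - N * mu^2) / (N - 1)
+           = sum(x^2) * ratio - mu^2 * (ratio * N)
+     so ratio = 1 / (N - 1) and ratio_mean = N / (N - 1); with sample == false
+     both factors reduce to 1 / N and 1, matching the binaryOp lambda below */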
ratio * Type(N) : Type(1); + raft::linalg::binaryOp(var, + var, + mu, + D, + raft::compose_op(raft::abs_op(), + [ratio, ratio_mean] __device__(Type a, Type b) { + return a * ratio - b * b * ratio_mean; + }), + stream); } } // namespace detail diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index f0fae195e1..f4c5f92f49 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -82,7 +82,7 @@ class StdDevTest : public ::testing::TestWithParam> { mean(handle, raft::make_device_matrix_view(data, rows, cols), raft::make_device_vector_view(mean_act.data(), cols), - params.sample); + false); stddev(handle, raft::make_device_matrix_view(data, rows, cols), @@ -100,7 +100,7 @@ class StdDevTest : public ::testing::TestWithParam> { mean(handle, raft::make_device_matrix_view(data, rows, cols), raft::make_device_vector_view(mean_act.data(), cols), - params.sample); + false); stddev(handle, raft::make_device_matrix_view(data, rows, cols), From 876b9ead11eb450a95e1e16ab3a475a6bf152c34 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 24 Jul 2024 12:05:38 +0000 Subject: [PATCH 10/10] remove sample paramemter from mean API --- cpp/include/raft/stats/detail/mean.cuh | 5 +- cpp/include/raft/stats/detail/scores.cuh | 2 +- cpp/include/raft/stats/mean.cuh | 16 +-- cpp/test/neighbors/ann_ivf_flat.cuh | 2 +- cpp/test/random/rng.cu | 3 +- cpp/test/stats/cov.cu | 4 +- cpp/test/stats/mean.cu | 121 ++++++++---------- cpp/test/stats/mean_center.cu | 149 ++++++++--------------- cpp/test/stats/stddev.cu | 6 +- 9 files changed, 112 insertions(+), 196 deletions(-) diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index ee39c87a68..d1de474e8d 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -27,10 +27,9 @@ namespace stats { namespace detail { template -void mean( - Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) +void mean(Type* mu, const Type* data, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) { - Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); + Type ratio = Type(1) / Type(N); raft::linalg::reduce(mu, data, D, diff --git a/cpp/include/raft/stats/detail/scores.cuh b/cpp/include/raft/stats/detail/scores.cuh index 947df6848a..66951f52ab 100644 --- a/cpp/include/raft/stats/detail/scores.cuh +++ b/cpp/include/raft/stats/detail/scores.cuh @@ -59,7 +59,7 @@ math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream) { rmm::device_scalar y_bar(stream); - raft::stats::mean(y_bar.data(), y, 1, n, false, false, stream); + raft::stats::mean(y_bar.data(), y, 1, n, false, stream); RAFT_CUDA_TRY(cudaPeekAtLastError()); rmm::device_uvector sse_arr(n, stream); diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh index 43d39cfd6c..3147216211 100644 --- a/cpp/include/raft/stats/mean.cuh +++ b/cpp/include/raft/stats/mean.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,17 +38,13 @@ namespace stats { * @param data: the input matrix * @param D: number of columns of data * @param N: number of rows of data - * @param sample: whether to evaluate sample mean or not. 
From 876b9ead11eb450a95e1e16ab3a475a6bf152c34 Mon Sep 17 00:00:00 2001
From: Malte Foerster
Date: Wed, 24 Jul 2024 12:05:38 +0000
Subject: [PATCH 10/10] remove sample parameter from mean API

---
 cpp/include/raft/stats/detail/mean.cuh   |   5 +-
 cpp/include/raft/stats/detail/scores.cuh |   2 +-
 cpp/include/raft/stats/mean.cuh          |  16 +--
 cpp/test/neighbors/ann_ivf_flat.cuh      |   2 +-
 cpp/test/random/rng.cu                   |   3 +-
 cpp/test/stats/cov.cu                    |   4 +-
 cpp/test/stats/mean.cu                   | 121 ++++++++----------
 cpp/test/stats/mean_center.cu            | 149 ++++++++---------------
 cpp/test/stats/stddev.cu                 |   6 +-
 9 files changed, 112 insertions(+), 196 deletions(-)

diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh
index ee39c87a68..d1de474e8d 100644
--- a/cpp/include/raft/stats/detail/mean.cuh
+++ b/cpp/include/raft/stats/detail/mean.cuh
@@ -27,10 +27,9 @@ namespace stats {
 namespace detail {

 template <typename Type, typename IdxType = int>
-void mean(
-  Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
+void mean(Type* mu, const Type* data, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
 {
-  Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N));
+  Type ratio = Type(1) / Type(N);
   raft::linalg::reduce(mu,
                        data,
                        D,
diff --git a/cpp/include/raft/stats/detail/scores.cuh b/cpp/include/raft/stats/detail/scores.cuh
index 947df6848a..66951f52ab 100644
--- a/cpp/include/raft/stats/detail/scores.cuh
+++ b/cpp/include/raft/stats/detail/scores.cuh
@@ -59,7 +59,7 @@ math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
 {
   rmm::device_scalar<math_t> y_bar(stream);

-  raft::stats::mean(y_bar.data(), y, 1, n, false, false, stream);
+  raft::stats::mean(y_bar.data(), y, 1, n, false, stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());

   rmm::device_uvector<math_t> sse_arr(n, stream);
diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh
index 43d39cfd6c..3147216211 100644
--- a/cpp/include/raft/stats/mean.cuh
+++ b/cpp/include/raft/stats/mean.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,17 +38,13 @@ namespace stats {
  * @param data: the input matrix
  * @param D: number of columns of data
  * @param N: number of rows of data
- * @param sample: whether to evaluate sample mean or not. In other words,
- * whether
- * to normalize the output using N-1 or N, for true or false, respectively
  * @param rowMajor: whether the input data is row or col major
  * @param stream: cuda stream
  */
 template <typename Type, typename IdxType = int>
-void mean(
-  Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
+void mean(Type* mu, const Type* data, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
 {
-  detail::mean(mu, data, D, N, sample, rowMajor, stream);
+  detail::mean(mu, data, D, N, rowMajor, stream);
 }

 /**
@@ -67,14 +63,11 @@ void mean(
  * @param[in] handle the raft handle
  * @param[in] data: the input matrix
  * @param[out] mu: the output mean vector
- * @param[in] sample: whether to evaluate sample mean or not. In other words, whether
- * to normalize the output using N-1 or N, for true or false, respectively
  */
 template <typename value_t, typename idx_t, typename layout_t>
 void mean(raft::resources const& handle,
           raft::device_matrix_view<const value_t, idx_t, layout_t> data,
-          raft::device_vector_view<value_t, idx_t> mu,
-          bool sample)
+          raft::device_vector_view<value_t, idx_t> mu)
 {
   static_assert(
     std::is_same_v<layout_t, raft::row_major> || std::is_same_v<layout_t, raft::col_major>,
@@ -86,7 +79,6 @@ void mean(raft::resources const& handle,
     data.data_handle(),
     data.extent(1),
     data.extent(0),
-    sample,
     std::is_same_v<layout_t, raft::row_major>,
     resource::get_cuda_stream(handle));
 }
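With the flag gone, callers can no longer request an N-1-normalized "sample mean": both overloads always divide by N, and the N/(N-1) correction for sample statistics now lives entirely inside stddev/vars (previous patch). A minimal call-site sketch against the updated mdspan overload; the helper name and the float/int instantiation are illustrative only, not part of the patch:

    #include <raft/core/device_mdspan.hpp>
    #include <raft/core/resources.hpp>
    #include <raft/stats/mean.cuh>

    // Column-wise means of a row-major device matrix under the new API:
    // no `sample` argument, the result is always sum(x_i) / N per column.
    void column_means(raft::resources const& handle,
                      raft::device_matrix_view<const float, int, raft::row_major> data,
                      raft::device_vector_view<float, int> mu)
    {
      raft::stats::mean(handle, data, mu);

      // Equivalent pointer-based overload (D = columns, N = rows, rowMajor):
      // raft::stats::mean(mu.data_handle(), data.data_handle(),
      //                   data.extent(1), data.extent(0), true,
      //                   raft::resource::get_cuda_stream(handle));
    }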
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index de6af589fa..314a0efae1 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -288,7 +288,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> {
                                  (IdxT)ps.dim,
                                  stream_);
         raft::stats::mean(
-          centroid.data(), cluster_data.data(), ps.dim, list_sizes[l], false, true, stream_);
+          centroid.data(), cluster_data.data(), ps.dim, list_sizes[l], true, stream_);
         ASSERT_TRUE(raft::devArrMatch(index_2.centers().data_handle() + ps.dim * l,
                                       centroid.data(),
                                       ps.dim,
diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu
index a37f150d4c..172f94ae50 100644
--- a/cpp/test/random/rng.cu
+++ b/cpp/test/random/rng.cu
@@ -407,8 +407,7 @@ TEST(Rng, MeanError)
     RngState r(seed, rtype);
     normal(handle, r, data.data(), len, 3.3f, 0.23f);
     // uniform(r, data, len, -1.0, 2.0);
-    raft::stats::mean(
-      mean_result.data(), data.data(), num_samples, num_experiments, false, false, stream);
+    raft::stats::mean(mean_result.data(), data.data(), num_samples, num_experiments, false, stream);
     raft::stats::stddev(std_result.data(),
                         data.data(),
                         mean_result.data(),
diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu
index 602f356b9f..3f2a3dcebf 100644
--- a/cpp/test/stats/cov.cu
+++ b/cpp/test/stats/cov.cu
@@ -72,7 +72,7 @@ class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
     cov_act.resize(cols * cols, stream);
     normal(handle, r, data.data(), len,
            params.mean, var);
-    raft::stats::mean(mean_act.data(), data.data(), cols, rows, false, params.rowMajor, stream);
+    raft::stats::mean(mean_act.data(), data.data(), cols, rows, params.rowMajor, stream);
     if (params.rowMajor) {
       using layout = raft::row_major;
       cov(handle,
@@ -102,7 +102,7 @@ class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
     raft::update_device(data_cm.data(), data_h, 6, stream);
     raft::update_device(cov_cm_ref.data(), cov_cm_ref_h, 4, stream);

-    raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, false, false, stream);
+    raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, false, stream);
     cov(handle, cov_cm.data(), data_cm.data(), mean_cm.data(), 2, 3, true, false, true, stream);
   }
diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu
index c5fe83d95b..e72d4eaf74 100644
--- a/cpp/test/stats/mean.cu
+++ b/cpp/test/stats/mean.cu
@@ -33,7 +33,7 @@ template <typename T>
 struct MeanInputs {
   T tolerance, mean;
   int rows, cols;
-  bool sample, rowMajor;
+  bool rowMajor;
   unsigned long long int seed;
   T stddev = (T)1.0;
 };
@@ -42,7 +42,7 @@ template <typename T>
 ::std::ostream& operator<<(::std::ostream& os, const MeanInputs<T>& dims)
 {
   return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", "
-            << dims.sample << ", " << dims.rowMajor << ", " << dims.stddev << "}" << std::endl;
+            << dims.rowMajor << ", " << dims.stddev << "}" << std::endl;
 }
@@ -74,14 +74,12 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
       using layout = raft::row_major;
       mean(handle,
            raft::make_device_matrix_view(data, rows, cols),
-           raft::make_device_vector_view(mean_act.data(), cols),
-           params.sample);
+           raft::make_device_vector_view(mean_act.data(), cols));
     } else {
       using layout = raft::col_major;
       mean(handle,
            raft::make_device_matrix_view(data, rows, cols),
-           raft::make_device_vector_view(mean_act.data(), cols),
-           params.sample);
+           raft::make_device_vector_view(mean_act.data(), cols));
     }
   }
@@ -98,72 +96,51 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
 // measured mean (of a normal distribution) will fall outside of an epsilon of
 // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times)
 const std::vector<MeanInputs<float>> inputsf = {
-  {0.15f, 1.f, 1024, 32, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 128, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
-  {0.15f, -1.f, 1024, 32, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 128, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
-  {0.15f, 1.f, 1024, 32, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 128, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
-  {0.15f, -1.f, 1024, 32, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 128, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 256, false, true, 1234ULL},
-  {0.15f, -1.f, 1030, 1, false, false, 1234ULL},
-  {0.15f, -1.f, 1030, 60, true, false, 1234ULL},
-  {2.0f, -1.f, 31, 120, false, false, 1234ULL},
-  {2.0f, -1.f, 1, 130, false, false, 1234ULL},
-  {0.15f, -1.f, 1030, 1, false, true, 1234ULL},
-  {0.15f, -1.f, 1030, 60, true, true, 1234ULL},
-  {2.0f, -1.f, 31, 120, false, true, 1234ULL},
-  {2.0f, -1.f, 1, 130, false, true, 1234ULL},
-  {2.0f, -1.f, 1, 1, false, false, 1234ULL},
-  {2.0f, -1.f, 1, 1, false, true, 1234ULL},
-  {2.0f, -1.f, 7, 23, false, false, 1234ULL},
-  {2.0f, -1.f, 7, 23, false, true, 1234ULL},
-  {2.0f, -1.f, 17, 5, false, false, 1234ULL},
-  {2.0f, -1.f, 17, 5, false, true, 1234ULL},
-  {0.0001f, 0.1f, 1 << 27, 2, false, false, 1234ULL, 0.0001f},
-  {0.0001f, 0.1f, 1 << 27, 2, false, true, 1234ULL, 0.0001f}};
-
-const std::vector<MeanInputs<double>> inputsd = {
-  {0.15, 1.0, 1024, 32, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 64, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 128, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 256, true, false, 1234ULL},
-  {0.15, -1.0, 1024, 32, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 64, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 128, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 256, false, false, 1234ULL},
-  {0.15, 1.0, 1024, 32, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 64, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 128, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 256, true, true, 1234ULL},
-  {0.15, -1.0, 1024, 32, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 64, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 128, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 256, false, true, 1234ULL},
-  {0.15, -1.0, 1030, 1, false, false, 1234ULL},
-  {0.15, -1.0, 1030, 60, true, false, 1234ULL},
-  {2.0, -1.0, 31, 120, false, false, 1234ULL},
-  {2.0, -1.0, 1, 130, false, false, 1234ULL},
-  {0.15, -1.0, 1030, 1, false, true, 1234ULL},
-  {0.15, -1.0, 1030, 60, true, true, 1234ULL},
-  {2.0, -1.0, 31, 120, false, true, 1234ULL},
-  {2.0, -1.0, 1, 130, false, true, 1234ULL},
-  {2.0, -1.0, 1, 1, false, false, 1234ULL},
-  {2.0, -1.0, 1, 1, false, true, 1234ULL},
-  {2.0, -1.0, 7, 23, false, false, 1234ULL},
-  {2.0, -1.0, 7, 23, false, true, 1234ULL},
-  {2.0, -1.0, 17, 5, false, false, 1234ULL},
-  {2.0, -1.0, 17, 5, false, true, 1234ULL},
-  {1e-8, 1e-1, 1 << 27, 2, false, false, 1234ULL, 0.0001},
-  {1e-8, 1e-1, 1 << 27, 2, false, true, 1234ULL, 0.0001}};
+  {0.15f, -1.f, 1024, 32, false, 1234ULL},
+  {0.15f, -1.f, 1024, 64, false, 1234ULL},
+  {0.15f, -1.f, 1024, 128, false, 1234ULL},
+  {0.15f, -1.f, 1024, 256, false, 1234ULL},
+  {0.15f, -1.f, 1024, 32, true, 1234ULL},
+  {0.15f, -1.f, 1024, 64, true, 1234ULL},
+  {0.15f, -1.f, 1024, 128, true, 1234ULL},
+  {0.15f, -1.f, 1024, 256, true, 1234ULL},
+  {0.15f, -1.f, 1030, 1, false, 1234ULL},
+  {2.0f, -1.f, 31, 120, false, 1234ULL},
+  {2.0f, -1.f, 1, 130, false, 1234ULL},
+  {0.15f, -1.f, 1030, 1, true, 1234ULL},
+  {2.0f, -1.f, 31, 120, true, 1234ULL},
+  {2.0f, -1.f, 1, 130, true, 1234ULL},
+  {2.0f, -1.f, 1, 1, false, 1234ULL},
+  {2.0f, -1.f, 1, 1, true, 1234ULL},
+  {2.0f, -1.f, 7, 23, false, 1234ULL},
+  {2.0f, -1.f, 7, 23, true, 1234ULL},
+  {2.0f, -1.f, 17, 5, false, 1234ULL},
+  {2.0f, -1.f, 17, 5, true, 1234ULL},
+  {0.0001f, 0.1f, 1 << 27, 2, false, 1234ULL, 0.0001f},
+  {0.0001f, 0.1f, 1 << 27, 2, true, 1234ULL, 0.0001f}};
+
+const std::vector<MeanInputs<double>> inputsd = {{0.15, -1.0, 1024, 32, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 64, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 128, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 256, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 32, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 64, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 128, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 256, true, 1234ULL},
+                                                 {0.15, -1.0, 1030, 1, false, 1234ULL},
+                                                 {2.0, -1.0, 31, 120, false, 1234ULL},
+                                                 {2.0, -1.0, 1, 130, false, 1234ULL},
+                                                 {0.15, -1.0, 1030, 1, true, 1234ULL},
+                                                 {2.0, -1.0, 31, 120, true, 1234ULL},
+                                                 {2.0, -1.0, 1, 130, true, 1234ULL},
+                                                 {2.0, -1.0, 1, 1, false, 1234ULL},
+                                                 {2.0, -1.0, 1, 1, true, 1234ULL},
+                                                 {2.0, -1.0, 7, 23, false, 1234ULL},
+                                                 {2.0, -1.0, 7, 23, true, 1234ULL},
+                                                 {2.0, -1.0, 17, 5, false, 1234ULL},
+                                                 {2.0, -1.0, 17, 5, true, 1234ULL},
+                                                 {1e-8, 1e-1, 1 << 27, 2, false, 1234ULL, 0.0001},
+                                                 {1e-8, 1e-1, 1 << 27, 2, true, 1234ULL, 0.0001}};

 typedef MeanTest<float> MeanTestF;
 TEST_P(MeanTestF, Result)
diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu
index b44d87d1bd..48bf50056c 100644
--- a/cpp/test/stats/mean_center.cu
+++ b/cpp/test/stats/mean_center.cu
@@ -32,7 +32,7 @@ template <typename T, typename IdxType>
 struct MeanCenterInputs {
   T tolerance, mean;
   IdxType rows, cols;
-  bool sample, rowMajor, bcastAlongRows;
+  bool rowMajor, bcastAlongRows;
   unsigned long long int seed;
 };
@@ -64,8 +64,7 @@ class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxType>> {
     raft::stats::mean(
-      meanVec.data(), data.data(), params.cols, params.rows, params.sample, params.rowMajor, stream);
+      meanVec.data(), data.data(), params.cols, params.rows, params.rowMajor, stream);
@@ -104,30 +103,18 @@ class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxType>> {
 const std::vector<MeanCenterInputs<float, int>> inputsf_i32 = {
-  {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
+  {0.05f, -1.f, 1024, 32, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, false, 1234ULL}};

 typedef MeanCenterTest<float, int> MeanCenterTestF_i32;
 TEST_P(MeanCenterTestF_i32, Result)
 {
@@ -136,30 +123,18 @@ TEST_P(MeanCenterTestF_i32, Result)
 INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32));

 const std::vector<MeanCenterInputs<float, int64_t>> inputsf_i64 = {
-  {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
+  {0.05f, -1.f, 1024, 32, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, false, 1234ULL}};

 typedef MeanCenterTest<float, int64_t> MeanCenterTestF_i64;
 TEST_P(MeanCenterTestF_i64, Result)
 {
@@ -169,30 +144,18 @@ TEST_P(MeanCenterTestF_i64, Result)
 INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64));

 const std::vector<MeanCenterInputs<double, int>> inputsd_i32 = {
-  {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
+  {0.05, -1.0, 1024, 32, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, false, 1234ULL}};

 typedef MeanCenterTest<double, int> MeanCenterTestD_i32;
 TEST_P(MeanCenterTestD_i32, Result)
 {
@@ -202,30 +165,18 @@ TEST_P(MeanCenterTestD_i32, Result)
 INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32));

 const std::vector<MeanCenterInputs<double, int64_t>> inputsd_i64 = {
-  {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
+  {0.05, -1.0, 1024, 32, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, false, 1234ULL}};

 typedef MeanCenterTest<double, int64_t> MeanCenterTestD_i64;
 TEST_P(MeanCenterTestD_i64, Result)
 {
diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu
index f4c5f92f49..a9a70b1e60 100644
--- a/cpp/test/stats/stddev.cu
+++ b/cpp/test/stats/stddev.cu
@@ -81,8 +81,7 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
       using layout_t = raft::row_major;
       mean(handle,
            raft::make_device_matrix_view(data, rows, cols),
-           raft::make_device_vector_view(mean_act.data(), cols),
-           false);
+           raft::make_device_vector_view(mean_act.data(), cols));

       stddev(handle,
              raft::make_device_matrix_view(data, rows, cols),
@@ -99,8 +98,7 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
       using layout_t = raft::col_major;
       mean(handle,
            raft::make_device_matrix_view(data, rows, cols),
-           raft::make_device_vector_view(mean_act.data(), cols),
-           false);
+           raft::make_device_vector_view(mean_act.data(), cols));

       stddev(handle,
              raft::make_device_matrix_view(data, rows, cols),
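As a closing cross-check of the arithmetic introduced in PATCH 09/10 and relied on by PATCH 10/10, here is a self-contained host-side C++ snippet (illustrative only, independent of RAFT; all names are made up) comparing the one-pass expression ratio * sum(x^2) - ratio_mean * mu^2 against the textbook two-pass sample variance:

    #include <cassert>
    #include <cmath>
    #include <vector>

    int main()
    {
      const std::vector<double> x = {1.0, 2.0, 4.0, 8.0};
      const double N = static_cast<double>(x.size());

      double sum = 0.0, sum_sq = 0.0;
      for (double v : x) {
        sum += v;
        sum_sq += v * v;
      }
      const double mu         = sum / N;          // mean always normalized by N
      const double ratio      = 1.0 / (N - 1.0);  // sample == true
      const double ratio_mean = ratio * N;        // N / (N - 1)
      const double one_pass   = sum_sq * ratio - mu * mu * ratio_mean;

      // Two-pass reference: sum of squared deviations over N - 1.
      double two_pass = 0.0;
      for (double v : x) {
        two_pass += (v - mu) * (v - mu);
      }
      two_pass /= (N - 1.0);

      assert(std::abs(one_pass - two_pass) < 1e-12);
      return 0;
    }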