From 0099a3a260892ec9e4f9f4fcd42e25d1b475758e Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 8 Jul 2024 13:35:16 +0000 Subject: [PATCH 01/10] forwards mean, var, stddev & sum implementation to reduce, also added KahanBabushkaNeumaierSum to strided summation --- cpp/include/raft/core/math.hpp | 6 +- .../raft/linalg/detail/strided_reduction.cuh | 70 ++++++---- cpp/include/raft/stats/detail/mean.cuh | 65 ++-------- cpp/include/raft/stats/detail/stddev.cuh | 121 +++++------------- cpp/include/raft/stats/detail/sum.cuh | 99 +------------- cpp/test/stats/cov.cu | 3 +- cpp/test/stats/mean.cu | 100 ++++++++++----- cpp/test/stats/stddev.cu | 11 +- cpp/test/stats/sum.cu | 63 ++++----- 9 files changed, 199 insertions(+), 339 deletions(-) diff --git a/cpp/include/raft/core/math.hpp b/cpp/include/raft/core/math.hpp index 9ce768bf40..e082aaf41a 100644 --- a/cpp/include/raft/core/math.hpp +++ b/cpp/include/raft/core/math.hpp @@ -56,7 +56,11 @@ constexpr RAFT_INLINE_FUNCTION auto abs(T x) !std::is_same_v, T> { - return x < T{0} ? -x : x; + if constexpr (std::is_unsigned_v) { + return x; + } else { + return x < T{0} ? -x : x; + } } #if defined(_RAFT_HAS_CUDA) template diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 617ac6d874..a9094d3959 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -33,33 +33,43 @@ namespace detail { // for column major layout template RAFT_KERNEL stridedSummationKernel( - Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op) + Type* out, const Type* data, int D, int N, Type init, MainLambda main_op) { // Thread reduction - Type thread_data = Type(init); - int colStart = blockIdx.x * blockDim.x + threadIdx.x; + Type thread_sum = Type(init); + Type thread_c = Type(0); + int colStart = blockIdx.x * blockDim.x + threadIdx.x; if (colStart < D) { int rowStart = blockIdx.y * blockDim.y + threadIdx.y; int stride = blockDim.y * gridDim.y; for (int j = rowStart; j < N; j += stride) { int idx = colStart + j * D; - thread_data += main_op(data[idx], j); + + // KahanBabushkaNeumaierSum + const Type cur_value = main_op(data[idx], j); + const Type t = thread_sum + cur_value; + if (abs(thread_sum) >= abs(cur_value)) { + thread_c += (thread_sum - t) + cur_value; + } else { + thread_c += (cur_value - t) + thread_sum; + } + thread_sum = t; } } + // add correction term + thread_sum += thread_c; + // Block reduction - extern __shared__ char tmp[]; // One element per thread in block - Type* temp = (Type*)tmp; // Cast to desired type - int myidx = threadIdx.x + blockDim.x * threadIdx.y; - temp[myidx] = thread_data; + extern __shared__ char tmp[]; + auto* block_sum = (Type*)tmp; + if (threadIdx.x < blockDim.x) block_sum[threadIdx.x] = Type(0); + __syncthreads(); + raft::myAtomicAdd(block_sum + threadIdx.x, thread_sum); __syncthreads(); - for (int j = blockDim.y / 2; j > 0; j /= 2) { - if (threadIdx.y < j) temp[myidx] += temp[myidx + j * blockDim.x]; - __syncthreads(); - } // Grid reduction - if ((colStart < D) && (threadIdx.y == 0)) raft::myAtomicAdd(dots + colStart, temp[myidx]); + if (colStart < D && (threadIdx.y == 0)) raft::myAtomicAdd(out + colStart, block_sum[threadIdx.x]); } // Kernel to perform reductions along the strided dimension @@ -127,23 +137,35 @@ void stridedReduction(OutType* dots, /// for atomics in stridedKernel (redesign for this is already underway) if (!inplace) raft::linalg::unaryOp(dots, dots, D, 
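                          /* The summation path in this patch uses Kahan-Babushka-Neumaier
                             (KBN) compensated summation: every addition also tracks the
                             low-order bits it loses in a running correction term. A minimal
                             scalar sketch of the per-thread update rule used by
                             stridedSummationKernel above (illustrative only, not part of
                             the patch):

                               Type sum = Type(0), c = Type(0);
                               for (const Type v : values) {
                                 const Type t = sum + v;
                                 // recover the rounding error of (sum + v) from whichever
                                 // operand had the larger magnitude
                                 c += (abs(sum) >= abs(v)) ? ((sum - t) + v) : ((v - t) + sum);
                                 sum = t;
                               }
                               return sum + c;  // apply the correction once at the end
                          */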
raft::const_op(init), stream); - // Arbitrary numbers for now, probably need to tune - const dim3 thrds(32, 16); - IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); - elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; - const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), - raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); - const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; - ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) if constexpr (std::is_same::value && - std::is_same::value) + std::is_same::value) { + constexpr int TPB = 256; + constexpr int ColsPerBlk = 8; + constexpr dim3 Block(ColsPerBlk, TPB / ColsPerBlk); + constexpr int MinRowsPerThread = 16; + constexpr int MinRowsPerBlk = Block.y * MinRowsPerThread; + constexpr int MaxBlocksDimY = 8192; + + const dim3 grid(raft::ceildiv(D, (IdxType)ColsPerBlk), + raft::min((IdxType)MaxBlocksDimY, raft::ceildiv(N, (IdxType)MinRowsPerBlk))); + const size_t shmemSize = sizeof(OutType) * Block.x; + stridedSummationKernel - <<>>(dots, data, D, N, init, main_op); - else + <<>>(dots, data, D, N, init, main_op); + } else { + // Arbitrary numbers for now, probably need to tune + const dim3 thrds(32, 16); + IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y); + elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread; + const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x), + raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread)); + const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y; + stridedReductionKernel <<>>(dots, data, D, N, init, main_op, reduce_op); + } ///@todo: this complication should go away once we have eliminated the need /// for atomics in stridedKernel (redesign for this is already underway) diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index 6c330acb26..76483cbfda 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,61 +26,23 @@ namespace raft { namespace stats { namespace detail { -///@todo: ColsPerBlk has been tested only for 32! -template -RAFT_KERNEL meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) - thread_data += (colId < D) ? 
data[i * D + colId] : Type(0); - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - thread_data += data[idx]; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; } -} - template void mean( Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream)); - meanKernelRowMajor<<>>(mu, data, D, N); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); - raft::linalg::scalarMultiply(mu, mu, ratio, D, stream); - } else { - meanKernelColMajor<<>>(mu, data, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + Type ratio = Type(1) / ((sample && rowMajor) ? Type(N - 1) : Type(N)); + raft::linalg::reduce(mu, + data, + D, + N, + Type(0), + rowMajor, + false, + stream, + false, + raft::identity_op(), + raft::add_op(), + raft::mul_const_op(ratio)); } } // namespace detail diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index bc2644a233..4af4060016 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -16,7 +16,9 @@ #pragma once +#include #include +#include #include #include @@ -25,63 +27,6 @@ namespace raft { namespace stats { namespace detail { -///@todo: ColPerBlk has been tested only for 32! -template -RAFT_KERNEL stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_data = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - Type val = (colId < D) ? 
data[i * D + colId] : Type(0); - thread_data += val * val; - } - __shared__ Type sstd[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) sstd[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(sstd + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(std + colId, sstd[thisColId]); -} - -template -RAFT_KERNEL stddevKernelColMajor(Type* std, const Type* data, const Type* mu, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - Type diff = data[idx] - m; - thread_data += diff * diff; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { std[blockIdx.x] = raft::sqrt(acc / N); } -} - -template -RAFT_KERNEL varsKernelColMajor(Type* var, const Type* data, const Type* mu, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_data = Type(0); - IdxType colStart = N * blockIdx.x; - Type m = mu[blockIdx.x]; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - IdxType idx = colStart + i; - Type diff = data[idx] - m; - thread_data += diff * diff; - } - Type acc = BlockReduce(temp_storage).Sum(thread_data); - if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; } -} - /** * @brief Compute stddev of the input matrix * @@ -110,26 +55,20 @@ void stddev(Type* std, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemset(std, 0, sizeof(Type) * D)); - stddevKernelRowMajor<<>>(std, data, D, N); - Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - std, - std, - mu, - D, - [ratio] __device__(Type a, Type b) { return raft::sqrt(a * ratio - b * b); }, - stream); - } else { - stddevKernelColMajor<<>>(std, data, mu, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::linalg::reduce( + std, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { + return a * a; + }); + Type ratio = Type(1) / ((sample && rowMajor) ? Type(N - 1) : Type(N)); + raft::linalg::binaryOp( + std, + std, + mu, + D, + raft::compose_op(raft::sqrt_op(), + raft::abs_op(), + [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }), + stream); } /** @@ -160,21 +99,19 @@ void vars(Type* var, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk)); - RAFT_CUDA_TRY(cudaMemset(var, 0, sizeof(Type) * D)); - stddevKernelRowMajor<<>>(var, data, D, N); - Type ratio = Type(1) / (sample ? 
Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream); - } else { - varsKernelColMajor<<>>(var, data, mu, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::linalg::reduce( + var, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { + return a * a; + }); + Type ratio = Type(1) / ((sample && rowMajor) ? Type(N - 1) : Type(N)); + raft::linalg::binaryOp( + var, + var, + mu, + D, + raft::compose_op(raft::abs_op(), + [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }), + stream); } } // namespace detail diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh index 4f85536e6c..39bd2c3b6c 100644 --- a/cpp/include/raft/stats/detail/sum.cuh +++ b/cpp/include/raft/stats/detail/sum.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,106 +26,10 @@ namespace raft { namespace stats { namespace detail { -///@todo: ColsPerBlk has been tested only for 32! -template -RAFT_KERNEL sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - const int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_sum = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - thread_sum += (colId < D) ? data[i * D + colId] : Type(0); - } - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_sum); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL sumKahanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - constexpr int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxType thisColId = threadIdx.x % ColsPerBlk; - IdxType thisRowId = threadIdx.x / ColsPerBlk; - IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk); - IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter); - Type thread_sum = Type(0); - Type thread_c = Type(0); - const IdxType stride = RowsPerBlkPerIter * gridDim.x; - for (IdxType i = rowId; i < N; i += stride) { - // KahanBabushkaNeumaierSum - const Type cur_value = (colId < D) ? 
data[i * D + colId] : Type(0); - const Type t = thread_sum + cur_value; - if (abs(thread_sum) >= abs(cur_value)) { - thread_c += (thread_sum - t) + cur_value; - } else { - thread_c += (cur_value - t) + thread_sum; - } - thread_sum = t; - } - thread_sum += thread_c; - __shared__ Type smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = Type(0); - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_sum); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -RAFT_KERNEL sumKahanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N) -{ - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - Type thread_sum = Type(0); - Type thread_c = Type(0); - IdxType colStart = N * blockIdx.x; - for (IdxType i = threadIdx.x; i < N; i += TPB) { - // KahanBabushkaNeumaierSum - IdxType idx = colStart + i; - const Type cur_value = data[idx]; - const Type t = thread_sum + cur_value; - if (abs(thread_sum) >= abs(cur_value)) { - thread_c += (thread_sum - t) + cur_value; - } else { - thread_c += (cur_value - t) + thread_sum; - } - thread_sum = t; - } - thread_sum += thread_c; - Type acc = BlockReduce(temp_storage).Sum(thread_sum); - if (threadIdx.x == 0) { mu[blockIdx.x] = acc; } -} - template void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) { - static const int TPB = 256; - if (rowMajor) { - static const int ColsPerBlk = 8; - static const int MinRowsPerThread = 16; - static const int MinRowsPerBlk = (TPB / ColsPerBlk) * MinRowsPerThread; - static const int MaxBlocksDimX = 8192; - - const IdxType grid_y = raft::ceildiv(D, (IdxType)ColsPerBlk); - const IdxType grid_x = - raft::min((IdxType)MaxBlocksDimX, raft::ceildiv(N, (IdxType)MinRowsPerBlk)); - - dim3 grid(grid_x, grid_y); - RAFT_CUDA_TRY(cudaMemset(output, 0, sizeof(Type) * D)); - sumKahanKernelRowMajor - <<>>(output, input, D, N); - } else { - sumKahanKernelColMajor<<>>(output, input, D, N); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); + raft::linalg::reduce(output, input, D, N, Type(0), rowMajor, false, stream); } } // namespace detail diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu index 41812979b6..df5177056d 100644 --- a/cpp/test/stats/cov.cu +++ b/cpp/test/stats/cov.cu @@ -40,7 +40,8 @@ struct CovInputs { template ::std::ostream& operator<<(::std::ostream& os, const CovInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.sample << ", " << dims.rowMajor << "}" << std::endl; } template diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index 61b57ce739..dbd8549049 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -35,12 +35,14 @@ struct MeanInputs { int rows, cols; bool sample, rowMajor; unsigned long long int seed; + T stddev = (T)1.0; }; template ::std::ostream& operator<<(::std::ostream& os, const MeanInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.sample << ", " << dims.rowMajor << ", " << dims.stddev << "}" << std::endl; } template @@ -61,7 +63,7 @@ class MeanTest : public ::testing::TestWithParam> { { raft::random::RngState r(params.seed); int len = rows * cols; - normal(handle, r, data.data(), len, params.mean, (T)1.0); + normal(handle, r, data.data(), len, params.mean, params.stddev); meanSGtest(data.data(), stream); } @@ -96,38 +98,72 @@ class MeanTest : 
public ::testing::TestWithParam> { // measured mean (of a normal distribution) will fall outside of an epsilon of // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times) const std::vector> inputsf = { - {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, - {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, - {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, - {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, - {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, {0.15f, -1.f, 1024, 256, false, true, 1234ULL}, - {0.15f, -1.f, 1030, 1, false, false, 1234ULL}, {0.15f, -1.f, 1030, 60, true, false, 1234ULL}, - {2.0f, -1.f, 31, 120, false, false, 1234ULL}, {2.0f, -1.f, 1, 130, true, false, 1234ULL}, - {0.15f, -1.f, 1030, 1, false, true, 1234ULL}, {0.15f, -1.f, 1030, 60, true, true, 1234ULL}, - {2.0f, -1.f, 31, 120, false, true, 1234ULL}, {2.0f, -1.f, 1, 130, false, true, 1234ULL}, - {2.0f, -1.f, 1, 1, false, false, 1234ULL}, {2.0f, -1.f, 1, 1, false, true, 1234ULL}, - {2.0f, -1.f, 7, 23, false, false, 1234ULL}, {2.0f, -1.f, 7, 23, false, true, 1234ULL}, - {2.0f, -1.f, 17, 5, false, false, 1234ULL}, {2.0f, -1.f, 17, 5, false, true, 1234ULL}}; + {0.15f, 1.f, 1024, 32, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, false, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, false, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, false, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, false, 1234ULL}, + {0.15f, 1.f, 1024, 32, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 64, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 128, true, true, 1234ULL}, + {0.15f, 1.f, 1024, 256, true, true, 1234ULL}, + {0.15f, -1.f, 1024, 32, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 64, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 128, false, true, 1234ULL}, + {0.15f, -1.f, 1024, 256, false, true, 1234ULL}, + {0.15f, -1.f, 1030, 1, false, false, 1234ULL}, + {0.15f, -1.f, 1030, 60, true, false, 1234ULL}, + {2.0f, -1.f, 31, 120, false, false, 1234ULL}, + {2.0f, -1.f, 1, 130, true, false, 1234ULL}, + {0.15f, -1.f, 1030, 1, false, true, 1234ULL}, + {0.15f, -1.f, 1030, 60, true, true, 1234ULL}, + {2.0f, -1.f, 31, 120, false, true, 1234ULL}, + {2.0f, -1.f, 1, 130, false, true, 1234ULL}, + {2.0f, -1.f, 1, 1, false, false, 1234ULL}, + {2.0f, -1.f, 1, 1, false, true, 1234ULL}, + {2.0f, -1.f, 7, 23, false, false, 1234ULL}, + {2.0f, -1.f, 7, 23, false, true, 1234ULL}, + {2.0f, -1.f, 17, 5, false, false, 1234ULL}, + {2.0f, -1.f, 17, 5, false, true, 1234ULL}, + {0.0001f, 0.1f, 1 << 27, 2, false, false, 1234ULL, 0.0001f}, + {0.0001f, 0.1f, 1 << 27, 2, false, true, 1234ULL, 0.0001f}}; const std::vector> inputsd = { - {0.15, 1.0, 1024, 32, true, false, 1234ULL}, {0.15, 1.0, 1024, 64, true, false, 1234ULL}, - {0.15, 1.0, 1024, 128, true, false, 1234ULL}, {0.15, 1.0, 1024, 256, true, false, 1234ULL}, - {0.15, -1.0, 1024, 32, false, false, 1234ULL}, {0.15, -1.0, 1024, 64, false, false, 1234ULL}, - {0.15, -1.0, 1024, 128, false, false, 1234ULL}, {0.15, 
-1.0, 1024, 256, false, false, 1234ULL}, - {0.15, 1.0, 1024, 32, true, true, 1234ULL}, {0.15, 1.0, 1024, 64, true, true, 1234ULL}, - {0.15, 1.0, 1024, 128, true, true, 1234ULL}, {0.15, 1.0, 1024, 256, true, true, 1234ULL}, - {0.15, -1.0, 1024, 32, false, true, 1234ULL}, {0.15, -1.0, 1024, 64, false, true, 1234ULL}, - {0.15, -1.0, 1024, 128, false, true, 1234ULL}, {0.15, -1.0, 1024, 256, false, true, 1234ULL}, - {0.15, -1.0, 1030, 1, false, false, 1234ULL}, {0.15, -1.0, 1030, 60, true, false, 1234ULL}, - {2.0, -1.0, 31, 120, false, false, 1234ULL}, {2.0, -1.0, 1, 130, true, false, 1234ULL}, - {0.15, -1.0, 1030, 1, false, true, 1234ULL}, {0.15, -1.0, 1030, 60, true, true, 1234ULL}, - {2.0, -1.0, 31, 120, false, true, 1234ULL}, {2.0, -1.0, 1, 130, false, true, 1234ULL}, - {2.0, -1.0, 1, 1, false, false, 1234ULL}, {2.0, -1.0, 1, 1, false, true, 1234ULL}, - {2.0, -1.0, 7, 23, false, false, 1234ULL}, {2.0, -1.0, 7, 23, false, true, 1234ULL}, - {2.0, -1.0, 17, 5, false, false, 1234ULL}, {2.0, -1.0, 17, 5, false, true, 1234ULL}}; + {0.15, 1.0, 1024, 32, true, false, 1234ULL}, + {0.15, 1.0, 1024, 64, true, false, 1234ULL}, + {0.15, 1.0, 1024, 128, true, false, 1234ULL}, + {0.15, 1.0, 1024, 256, true, false, 1234ULL}, + {0.15, -1.0, 1024, 32, false, false, 1234ULL}, + {0.15, -1.0, 1024, 64, false, false, 1234ULL}, + {0.15, -1.0, 1024, 128, false, false, 1234ULL}, + {0.15, -1.0, 1024, 256, false, false, 1234ULL}, + {0.15, 1.0, 1024, 32, true, true, 1234ULL}, + {0.15, 1.0, 1024, 64, true, true, 1234ULL}, + {0.15, 1.0, 1024, 128, true, true, 1234ULL}, + {0.15, 1.0, 1024, 256, true, true, 1234ULL}, + {0.15, -1.0, 1024, 32, false, true, 1234ULL}, + {0.15, -1.0, 1024, 64, false, true, 1234ULL}, + {0.15, -1.0, 1024, 128, false, true, 1234ULL}, + {0.15, -1.0, 1024, 256, false, true, 1234ULL}, + {0.15, -1.0, 1030, 1, false, false, 1234ULL}, + {0.15, -1.0, 1030, 60, true, false, 1234ULL}, + {2.0, -1.0, 31, 120, false, false, 1234ULL}, + {2.0, -1.0, 1, 130, true, false, 1234ULL}, + {0.15, -1.0, 1030, 1, false, true, 1234ULL}, + {0.15, -1.0, 1030, 60, true, true, 1234ULL}, + {2.0, -1.0, 31, 120, false, true, 1234ULL}, + {2.0, -1.0, 1, 130, false, true, 1234ULL}, + {2.0, -1.0, 1, 1, false, false, 1234ULL}, + {2.0, -1.0, 1, 1, false, true, 1234ULL}, + {2.0, -1.0, 7, 23, false, false, 1234ULL}, + {2.0, -1.0, 7, 23, false, true, 1234ULL}, + {2.0, -1.0, 17, 5, false, false, 1234ULL}, + {2.0, -1.0, 17, 5, false, true, 1234ULL}, + {1e-8, 1e-1, 1 << 27, 2, false, false, 1234ULL, 0.0001}, + {1e-8, 1e-1, 1 << 27, 2, false, true, 1234ULL, 0.0001}}; typedef MeanTest MeanTestF; TEST_P(MeanTestF, Result) diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 641621c1c6..1459c8fe14 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -39,7 +39,8 @@ struct StdDevInputs { template ::std::ostream& operator<<(::std::ostream& os, const StdDevInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.sample << ", " << dims.rowMajor << "}" << std::endl; } template @@ -153,7 +154,9 @@ const std::vector> inputsf = { {1.f, -1.f, 2.f, 17, 5, false, false, 1234ULL}, {1.f, -1.f, 2.f, 1, 1, false, true, 1234ULL}, {1.f, -1.f, 2.f, 7, 23, false, true, 1234ULL}, - {1.f, -1.f, 2.f, 17, 5, false, true, 1234ULL}}; + {1.f, -1.f, 2.f, 17, 5, false, true, 1234ULL}, + {0.00001f, 0.001f, 0.f, 1 << 27, 2, false, true, 1234ULL}, + {0.00001f, 0.001f, 0.f, 1 << 27, 2, false, false, 1234ULL}}; const std::vector> inputsd = { {0.1, 1.0, 2.0, 1024, 32, 
true, false, 1234ULL}, @@ -183,7 +186,9 @@ const std::vector> inputsd = { {1.0, -1.0, 2.0, 17, 5, false, false, 1234ULL}, {1.0, -1.0, 2.0, 1, 1, false, true, 1234ULL}, {1.0, -1.0, 2.0, 7, 23, false, true, 1234ULL}, - {1.0, -1.0, 2.0, 17, 5, false, true, 1234ULL}}; + {1.0, -1.0, 2.0, 17, 5, false, true, 1234ULL}, + {1e-7, 0.001, 0.0, 1 << 27, 2, false, true, 1234ULL}, + {1e-7, 0.001, 0.0, 1 << 27, 2, false, false, 1234ULL}}; typedef StdDevTest StdDevTestF; TEST_P(StdDevTestF, Result) diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index bf2aa44a2c..14f30dfa5b 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -40,7 +40,8 @@ struct SumInputs { template ::std::ostream& operator<<(::std::ostream& os, const SumInputs& dims) { - return os; + return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " + << dims.rowMajor << "}" << std::endl; } template @@ -96,43 +97,29 @@ class SumTest : public ::testing::TestWithParam> { rmm::device_uvector data, sum_act; }; -const std::vector> inputsf = {{0.0001f, 4, 5, true, 1}, - {0.0001f, 1024, 32, true, 1}, - {0.0001f, 1024, 256, true, 1}, - {0.0001f, 100000000, 1, true, 0.001}, - {0.0001f, 1, 30, true, 0.001}, - {0.0001f, 1, 1, true, 0.001}, - {0.0001f, 17, 5, true, 0.001}, - {0.0001f, 7, 23, true, 0.001}, - {0.0001f, 3, 97, true, 0.001}, - {0.0001f, 4, 5, false, 1}, - {0.0001f, 1024, 32, false, 1}, - {0.0001f, 1024, 256, false, 1}, - {0.0001f, 100000000, 1, false, 0.001}, - {0.0001f, 1, 30, false, 0.001}, - {0.0001f, 1, 1, false, 0.001}, - {0.0001f, 17, 5, false, 0.001}, - {0.0001f, 7, 23, false, 0.001}, - {0.0001f, 3, 97, false, 0.001}}; - -const std::vector> inputsd = {{0.000001, 1024, 32, true, 1}, - {0.000001, 1024, 256, true, 1}, - {0.000001, 1024, 256, true, 1}, - {0.000001, 100000000, 1, true, 0.001}, - {0.000001, 1, 30, true, 0.0001}, - {0.000001, 1, 1, true, 0.0001}, - {0.000001, 17, 5, true, 0.0001}, - {0.000001, 7, 23, true, 0.0001}, - {0.000001, 3, 97, true, 0.0001}, - {0.000001, 1024, 32, false, 1}, - {0.000001, 1024, 256, false, 1}, - {0.000001, 1024, 256, false, 1}, - {0.000001, 100000000, 1, false, 0.001}, - {0.000001, 1, 30, false, 0.0001}, - {0.000001, 1, 1, false, 0.0001}, - {0.000001, 17, 5, false, 0.0001}, - {0.000001, 7, 23, false, 0.0001}, - {0.000001, 3, 97, false, 0.0001}}; +const std::vector> inputsf = { + {0.0001f, 4, 5, true, 1}, {0.0001f, 1024, 32, true, 1}, + {0.0001f, 1024, 256, true, 1}, {0.0001f, 100000000, 1, true, 0.001}, + {0.0001f, 1 << 27, 2, true, 0.1}, {0.0001f, 1, 30, true, 0.001}, + {0.0001f, 1, 1, true, 0.001}, {0.0001f, 17, 5, true, 0.001}, + {0.0001f, 7, 23, true, 0.001}, {0.0001f, 3, 97, true, 0.001}, + {0.0001f, 4, 5, false, 1}, {0.0001f, 1024, 32, false, 1}, + {0.0001f, 1024, 256, false, 1}, {0.0001f, 100000000, 1, false, 0.001}, + {0.0001f, 1 << 27, 2, false, 0.1}, {0.0001f, 1, 30, false, 0.001}, + {0.0001f, 1, 1, false, 0.001}, {0.0001f, 17, 5, false, 0.001}, + {0.0001f, 7, 23, false, 0.001}, {0.0001f, 3, 97, false, 0.001}}; + +const std::vector> inputsd = { + {0.000001, 1024, 32, true, 1}, {0.000001, 1024, 256, true, 1}, + {0.000001, 1024, 256, true, 1}, {0.000001, 100000000, 1, true, 0.001}, + {1e-10, 1 << 27, 2, true, 0.1}, {0.000001, 1, 30, true, 0.0001}, + {0.000001, 1, 1, true, 0.0001}, {0.000001, 17, 5, true, 0.0001}, + {0.000001, 7, 23, true, 0.0001}, {0.000001, 3, 97, true, 0.0001}, + {0.000001, 1024, 32, false, 1}, {0.000001, 1024, 256, false, 1}, + {0.000001, 1024, 256, false, 1}, {0.000001, 100000000, 1, false, 0.001}, + {1e-10, 1 << 27, 2, 
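+                                              /* these 1 << 27 x 2 cases target the
+                                                 compensated path: naively accumulating
+                                                 ~1.3e8 values of 0.1 in sequence loses
+                                                 roughly n * eps / 2 ~ 1e-8 relative
+                                                 accuracy in double, far beyond the
+                                                 1e-10 tolerance used here */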
false, 0.1}, {0.000001, 1, 30, false, 0.0001}, + {0.000001, 1, 1, false, 0.0001}, {0.000001, 17, 5, false, 0.0001}, + {0.000001, 7, 23, false, 0.0001}, {0.000001, 3, 97, false, 0.0001}}; typedef SumTest SumTestF; typedef SumTest SumTestD; From 11afafc4130b04fe281ce292f4ce263d4d1b62e4 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Tue, 9 Jul 2024 22:14:45 +0000 Subject: [PATCH 02/10] add Kahan sum for coalesced kernels --- .../linalg/detail/coalesced_reduction-inl.cuh | 197 +++++++++++++++++- cpp/test/stats/sum.cu | 35 +++- 2 files changed, 219 insertions(+), 13 deletions(-) diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index 9f3be7ce0e..6485dff8a7 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -42,6 +42,18 @@ struct ReductionThinPolicy { static constexpr bool NoSequentialReduce = noLoop; }; +template +DI void KahanBabushkaNeumaierSum(Type& sum, Type& compensation, const Type& cur_value) +{ + const Type t = sum + cur_value; + if (abs(sum) >= abs(cur_value)) { + compensation += (sum - t) + cur_value; + } else { + compensation += (cur_value - t) + sum; + } + sum = t; +} + template +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedSumThinKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + FinalLambda final_op, + bool inplace = false) +{ + /* The strategy to achieve near-SOL memory bandwidth differs based on D: + * - For small D, we need to process multiple rows per logical warp in order to have + * multiple loads per thread and increase bytes in flight and amortize latencies. + * - For large D, we start with a sequential reduction. The compiler partially unrolls + * that loop (e.g. first a loop of stride 16, then 8, 4, and 1). + */ + IdxType i0 = threadIdx.y + (Policy::RowsPerBlock * static_cast(blockIdx.x)); + if (i0 >= N) return; + + OutType acc[Policy::RowsPerLogicalWarp]; + OutType thread_c[Policy::RowsPerLogicalWarp]; + +#pragma unroll + for (int k = 0; k < Policy::RowsPerLogicalWarp; k++) { + acc[k] = init; + thread_c[k] = 0; + } + + if constexpr (Policy::NoSequentialReduce) { + IdxType j = threadIdx.x; + if (j < D) { +#pragma unroll + for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) { + // Only the first row is known to be within bounds. Clamp to avoid out-of-mem read. + const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1); + // acc[k] = reduce_op(acc[k], main_op(data[j + (D * i)], j)); + KahanBabushkaNeumaierSum(acc[k], thread_c[k], main_op(data[j + (D * i)], j)); + } + } + } else { + for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) { +#pragma unroll + for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) { + const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1); + // acc[k] = reduce_op(acc[k], main_op(data[j + (D * i)], j)); + KahanBabushkaNeumaierSum(acc[k], thread_c[k], main_op(data[j + (D * i)], j)); + } + } + } + + /* This vector reduction has two benefits compared to naive separate reductions: + * - It avoids the LSU bottleneck when the number of columns is around 32 (e.g. for 32, 5 shuffles + * are required and there is no initial sequential reduction to amortize that cost). + * - It distributes the outputs to multiple threads, enabling a coalesced store when the number of + * rows per logical warp and logical warp size are equal. 
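+   *
+   * The sums in acc and the KBN compensation terms in thread_c are reduced as
+   * two separate vectors with a plain add; as noted in the reduce.cuh docs,
+   * the compensation is exact per thread only, not across the reduction tree.
+   * Each output then folds its accumulated correction back in exactly once
+   * via acc[k] + thread_c[k].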
+ */ + raft::logicalWarpReduceVector( + acc, threadIdx.x, raft::add_op()); + + raft::logicalWarpReduceVector( + thread_c, threadIdx.x, raft::add_op()); + + constexpr int reducOutVecWidth = + std::max(1, Policy::RowsPerLogicalWarp / Policy::LogicalWarpSize); + constexpr int reducOutGroupSize = + std::max(1, Policy::LogicalWarpSize / Policy::RowsPerLogicalWarp); + constexpr int reducNumGroups = Policy::LogicalWarpSize / reducOutGroupSize; + + if (threadIdx.x % reducOutGroupSize == 0) { + const int groupId = threadIdx.x / reducOutGroupSize; + if (inplace) { +#pragma unroll + for (int k = 0; k < reducOutVecWidth; k++) { + const int reductionId = k * reducNumGroups + groupId; + const IdxType i = i0 + reductionId * Policy::NumLogicalWarps; + if (i < N) { dots[i] = final_op(dots[i] + acc[k] + thread_c[k]); } + } + } else { +#pragma unroll + for (int k = 0; k < reducOutVecWidth; k++) { + const int reductionId = k * reducNumGroups + groupId; + const IdxType i = i0 + reductionId * Policy::NumLogicalWarps; + if (i < N) { dots[i] = final_op(acc[k] + thread_c[k]); } + } + } + } +} + template (Policy::NoSequentialReduce)); dim3 threads(Policy::LogicalWarpSize, Policy::NumLogicalWarps, 1); dim3 blocks(ceildiv(N, Policy::RowsPerBlock), 1, 1); - coalescedReductionThinKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + if constexpr (std::is_same_v) { + coalescedSumThinKernel + <<>>(dots, data, D, N, init, main_op, final_op, inplace); + } else { + coalescedReductionThinKernel<<>>( + dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -240,6 +350,44 @@ RAFT_KERNEL __launch_bounds__(TPB) coalescedReductionMediumKernel(OutType* dots, } } +template +RAFT_KERNEL __launch_bounds__(TPB) coalescedSumMediumKernel(OutType* dots, + const InType* data, + IdxType D, + IdxType N, + OutType init, + MainLambda main_op, + FinalLambda final_op, + bool inplace = false) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage1; + __shared__ typename BlockReduce::TempStorage temp_storage2; + OutType thread_data = init; + OutType thread_c = (OutType)0; + + IdxType rowStart = blockIdx.x * D; + for (IdxType i = threadIdx.x; i < D; i += TPB) { + IdxType idx = rowStart + i; + KahanBabushkaNeumaierSum(thread_data, thread_c, main_op(data[idx], i)); + } + OutType block_acc = BlockReduce(temp_storage1).Sum(thread_data); + OutType block_c = BlockReduce(temp_storage2).Sum(thread_c); + + if (threadIdx.x == 0) { + if (inplace) { + dots[blockIdx.x] = final_op(dots[blockIdx.x] + block_acc + block_c); + } else { + dots[blockIdx.x] = final_op(block_acc + block_c); + } + } +} + template fun_scope("coalescedReductionMedium<%d>", TPB); - coalescedReductionMediumKernel - <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + if constexpr (std::is_same_v) { + coalescedSumMediumKernel + <<>>(dots, data, D, N, init, main_op, final_op, inplace); + } else { + coalescedReductionMediumKernel + <<>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -322,6 +475,32 @@ RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) if (threadIdx.x == 0) { buffer[Policy::BlocksPerRow * blockIdx.x + blockIdx.y] = acc; } } +template +RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock) coalescedSumThickKernel( + OutType* buffer, const InType* data, IdxType D, IdxType N, OutType init, MainLambda main_op) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ 
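+  /* two separate TempStorage instances let the back-to-back BlockReduce calls
+     on thread_data and thread_c run without the __syncthreads() that CUB
+     requires before temporary storage may be reused */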
typename BlockReduce::TempStorage temp_storage1; + __shared__ typename BlockReduce::TempStorage temp_storage2; + + OutType thread_data = init; + OutType thread_c = (OutType)0; + + IdxType rowStart = blockIdx.x * D; + for (IdxType i = blockIdx.y * Policy::ThreadsPerBlock + threadIdx.x; i < D; + i += Policy::BlockStride) { + IdxType idx = rowStart + i; + KahanBabushkaNeumaierSum(thread_data, thread_c, main_op(data[idx], i)); + } + + OutType block_acc = BlockReduce(temp_storage1).Sum(thread_data); + OutType block_c = BlockReduce(temp_storage2).Sum(thread_c); + + if (threadIdx.x == 0) { + buffer[Policy::BlocksPerRow * blockIdx.x + blockIdx.y] = block_acc + block_c; + } +} + template - <<>>(buffer.data(), data, D, N, init, main_op, reduce_op); + if constexpr (std::is_same_v) { + coalescedSumThickKernel + <<>>(buffer.data(), data, D, N, init, main_op); + } else { + coalescedReductionThickKernel + <<>>(buffer.data(), data, D, N, init, main_op, reduce_op); + } RAFT_CUDA_TRY(cudaPeekAtLastError()); coalescedReductionThin(dots, diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu index 14f30dfa5b..fbb398cc5b 100644 --- a/cpp/test/stats/sum.cu +++ b/cpp/test/stats/sum.cu @@ -41,7 +41,7 @@ template ::std::ostream& operator<<(::std::ostream& os, const SumInputs& dims) { return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", " - << dims.rowMajor << "}" << std::endl; + << dims.rowMajor << ", " << dims.value << "}" << std::endl; } template @@ -58,13 +58,31 @@ class SumTest : public ::testing::TestWithParam> { } protected: - void runTest() + void runTest(bool checkErrorCompensation = false) { int len = rows * cols; + double large_factor = 1e7; + + if constexpr (std::is_same_v) large_factor = 1e12; + std::vector data_h(len); for (int i = 0; i < len; i++) { - data_h[i] = T(params.value); + data_h[i] = double(params.value); + int row = params.rowMajor ? i / cols : i % rows; + + // every 3 elements (in a column) contain 2 large dummy elements + // (one of them negative) and one element with 3x compensating + // for the 2 missing elements + if (checkErrorCompensation && row % 3 == 2) { + data_h[i] = double(params.value) * large_factor; + // compensate with opposite error 3 rows up + int idx2 = params.rowMajor ? (i - cols) : (i - 1); + data_h[idx2] = -1 * double(params.value) * large_factor; + // compensate 2 missing values + int idx3 = params.rowMajor ? (i - 2 * cols) : (i - 2); + data_h[idx3] = 3.0 * double(params.value); + } } raft::update_device(data.data(), data_h.data(), len, stream); @@ -84,8 +102,10 @@ class SumTest : public ::testing::TestWithParam> { double expected = double(params.rows) * params.value; + double tolerance = checkErrorCompensation ? 
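+    /* the +/- large_factor pairs injected above cancel exactly in real
+       arithmetic but leave a rounding residue proportional to large_factor in
+       floating point, hence the 100x wider tolerance for the Accuracy runs */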
100 * params.tolerance : params.tolerance;
+
     ASSERT_TRUE(raft::devArrMatch(
-      T(expected), sum_act.data(), params.cols, raft::CompareApprox<T>(params.tolerance)));
+      T(expected), sum_act.data(), params.cols, raft::CompareApprox<T>(tolerance)));
   }
 
  protected:
@@ -112,12 +132,12 @@ const std::vector<SumInputs<float>> inputsf = {
 const std::vector<SumInputs<double>> inputsd = {
   {0.000001, 1024, 32, true, 1},      {0.000001, 1024, 256, true, 1},
   {0.000001, 1024, 256, true, 1},     {0.000001, 100000000, 1, true, 0.001},
-  {1e-10, 1 << 27, 2, true, 0.1},     {0.000001, 1, 30, true, 0.0001},
+  {1e-9, 1 << 27, 2, true, 0.1},      {0.000001, 1, 30, true, 0.0001},
   {0.000001, 1, 1, true, 0.0001},     {0.000001, 17, 5, true, 0.0001},
   {0.000001, 7, 23, true, 0.0001},    {0.000001, 3, 97, true, 0.0001},
   {0.000001, 1024, 32, false, 1},     {0.000001, 1024, 256, false, 1},
   {0.000001, 1024, 256, false, 1},    {0.000001, 100000000, 1, false, 0.001},
-  {1e-10, 1 << 27, 2, false, 0.1},    {0.000001, 1, 30, false, 0.0001},
+  {1e-9, 1 << 27, 2, false, 0.1},     {0.000001, 1, 30, false, 0.0001},
   {0.000001, 1, 1, false, 0.0001},    {0.000001, 17, 5, false, 0.0001},
   {0.000001, 7, 23, false, 0.0001},   {0.000001, 3, 97, false, 0.0001}};
 
@@ -127,6 +147,9 @@ typedef SumTest<double> SumTestD;
 TEST_P(SumTestF, Result) { runTest(); }
 TEST_P(SumTestD, Result) { runTest(); }
 
+TEST_P(SumTestF, Accuracy) { runTest(true); }
+TEST_P(SumTestD, Accuracy) { runTest(true); }
+
 INSTANTIATE_TEST_CASE_P(SumTests, SumTestF, ::testing::ValuesIn(inputsf));
 INSTANTIATE_TEST_CASE_P(SumTests, SumTestD, ::testing::ValuesIn(inputsd));

From c2f5d8f42f463c43f99e9b81ca7ba7ae003239bf Mon Sep 17 00:00:00 2001
From: Malte Foerster
Date: Tue, 9 Jul 2024 22:20:54 +0000
Subject: [PATCH 03/10] improve Kahan sum for strided kernels, also compensate
 on block

---
 .../raft/linalg/detail/strided_reduction.cuh | 25 +++++++++++++------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh
index a9094d3959..1a0bc4b32f 100644
--- a/cpp/include/raft/linalg/detail/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh
@@ -57,19 +57,30 @@ RAFT_KERNEL stridedSummationKernel(
     }
   }
 
-  // add correction term
-  thread_sum += thread_c;
-
   // Block reduction
   extern __shared__ char tmp[];
   auto* block_sum = (Type*)tmp;
-  if (threadIdx.x < blockDim.x) block_sum[threadIdx.x] = Type(0);
+  auto* block_c   = block_sum + blockDim.x;
+
+  if (threadIdx.y == 0) {
+    block_sum[threadIdx.x] = Type(0);
+    block_c[threadIdx.x]   = Type(0);
+  }
   __syncthreads();
-  raft::myAtomicAdd(block_sum + threadIdx.x, thread_sum);
+  // also compute compensation for block-sum
+  const Type old_sum = atomicAdd(block_sum + threadIdx.x, thread_sum);
+  const Type t       = old_sum + thread_sum;
+  if (abs(old_sum) >= abs(thread_sum)) {
+    thread_c += (old_sum - t) + thread_sum;
+  } else {
+    thread_c += (thread_sum - t) + old_sum;
+  }
+  raft::myAtomicAdd(block_c + threadIdx.x, thread_c);
   __syncthreads();
 
   // Grid reduction
-  if (colStart < D && (threadIdx.y == 0)) raft::myAtomicAdd(out + colStart, block_sum[threadIdx.x]);
+  if (colStart < D && (threadIdx.y == 0))
+    raft::myAtomicAdd(out + colStart, block_sum[threadIdx.x] + block_c[threadIdx.x]);
@@ -150,7 +161,7 @@ void stridedReduction(OutType* dots,
 
   const dim3 grid(raft::ceildiv(D, (IdxType)ColsPerBlk),
                   raft::min((IdxType)MaxBlocksDimY, raft::ceildiv(N, (IdxType)MinRowsPerBlk)));
-  const size_t shmemSize = sizeof(OutType) * Block.x;
+  const size_t shmemSize =
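+    /* doubled: the dynamic shared buffer now holds block_sum[0..Block.x)
+       followed by the per-column compensation terms block_c[0..Block.x) */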
sizeof(OutType) * Block.x * 2; stridedSummationKernel <<>>(dots, data, D, N, init, main_op); From e2ff5ac4633753c854dcdaccc8643dbffc079aff Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Mon, 15 Jul 2024 21:33:37 +0200 Subject: [PATCH 04/10] adjust heuristic for coalesced reduction kernel choice --- .../linalg/detail/coalesced_reduction-inl.cuh | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index 6485dff8a7..feef71818e 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -574,13 +574,8 @@ void coalescedReductionThickDispatcher(OutType* dots, { // Note: multiple elements per thread to take advantage of the sequential reduction and loop // unrolling - if (D < IdxType(32768)) { - coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else { - coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( - dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } + coalescedReductionThick, ReductionThinPolicy<32, 128, 1>>( + dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } // Primitive to perform reductions along the coalesced dimension of the matrix, i.e. reduce along @@ -605,14 +600,17 @@ void coalescedReduction(OutType* dots, { /* The primitive selects one of three implementations based on heuristics: * - Thin: very efficient when D is small and/or N is large + * At most one warp is processing each row * - Thick: used when N is very small and D very large - * - Medium: used when N is too small to fill the GPU with the thin kernel + * Multiple blocks (32/64) processing each row + * - Medium: everything in between + * One block is processing each row */ const IdxType numSMs = raft::getMultiProcessorCount(); - if (D <= IdxType(256) || N >= IdxType(4) * numSMs) { + if (D <= IdxType(512) || (N >= IdxType(16) * numSMs && D < IdxType(2048))) { coalescedReductionThinDispatcher( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); - } else if (N < numSMs && D >= IdxType(16384)) { + } else if (N < numSMs && D >= IdxType(1 << 17)) { coalescedReductionThickDispatcher( dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op); } else { From 6ad9a780b847daeb6d5b6e4e3bad8ec4f0b42a9a Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 18 Jul 2024 09:34:16 +0000 Subject: [PATCH 05/10] add comments --- cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh | 3 +++ cpp/include/raft/linalg/detail/strided_reduction.cuh | 6 +++++- cpp/include/raft/linalg/reduce.cuh | 4 +++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh index feef71818e..9680cbc636 100644 --- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh +++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh @@ -581,6 +581,9 @@ void coalescedReductionThickDispatcher(OutType* dots, // Primitive to perform reductions along the coalesced dimension of the matrix, i.e. reduce along // rows for row major or reduce along columns for column major layout. Can do an inplace reduction // adding to original values of dots if requested. 
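+// A representative call (illustrative sketch, not part of this patch): row-wise
+// L2 norms of a row-major N x D matrix. Since reduce_op is raft::add_op, this
+// takes the compensated summation kernels added in this series:
+//
+//   coalescedReduction(norms, data, D, N, OutType(0), stream,
+//                      false /* inplace */, raft::sq_op(), raft::add_op(),
+//                      raft::sqrt_op());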
+// In case of an add-reduction, a compensated summation will be performed in order to reduce +// numerical error. Note that the compensation will only be performed 'per-thread' for performance +// reasons and therefore not be equivalent to a sequential compensation. template RAFT_KERNEL stridedSummationKernel( Type* out, const Type* data, int D, int N, Type init, MainLambda main_op) diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index a4523c6926..bbf0a52e04 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -92,7 +92,9 @@ void reduce(OutType* dots, * is either row-major or column-major, while allowing the choose the * dimension for reduction. Depending upon the dimension chosen for * reduction, the memory accesses may be coalesced or strided. - * + * In case of an add-reduction, a compensated summation will be performed + * in order to reduce numerical error. Note that the compensation will not + * be equivalent to a sequential compensation to preserve parallel efficiency. * @tparam InElementType the input data-type of underlying raft::matrix_view * @tparam LayoutPolicy The layout of Input/Output (row or col major) * @tparam OutElementType the output data-type of underlying raft::matrix_view and reduction From 6f0b012d95c916e30d2823fb6b0aeb3478555b96 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 18 Jul 2024 09:39:22 +0000 Subject: [PATCH 06/10] add comment for second API --- cpp/include/raft/linalg/reduce.cuh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh index bbf0a52e04..8fd6e45d37 100644 --- a/cpp/include/raft/linalg/reduce.cuh +++ b/cpp/include/raft/linalg/reduce.cuh @@ -31,6 +31,9 @@ namespace linalg { /** * @brief Compute reduction of the input matrix along the requested dimension + * In case of an add-reduction, a compensated summation will be performed + * in order to reduce numerical error. Note that the compensation will not + * be equivalent to a sequential compensation to preserve parallel efficiency. * * @tparam InType the data type of the input * @tparam OutType the data type of the output (as well as the data type for From 5b9837b9e8841a79cf5851d1fccac577fb116f4f Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 18 Jul 2024 11:23:13 +0000 Subject: [PATCH 07/10] enable sample parameter for column based --- cpp/include/raft/stats/detail/mean.cuh | 2 +- cpp/include/raft/stats/detail/stddev.cuh | 4 ++-- cpp/test/stats/mean.cu | 4 ++-- cpp/test/stats/stddev.cu | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index 76483cbfda..ee39c87a68 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -30,7 +30,7 @@ template void mean( Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) { - Type ratio = Type(1) / ((sample && rowMajor) ? Type(N - 1) : Type(N)); + Type ratio = Type(1) / ((sample) ? 
Type(N - 1) : Type(N)); raft::linalg::reduce(mu, data, D, diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index 4af4060016..df70ace617 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -59,7 +59,7 @@ void stddev(Type* std, std, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { return a * a; }); - Type ratio = Type(1) / ((sample && rowMajor) ? Type(N - 1) : Type(N)); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( std, std, @@ -103,7 +103,7 @@ void vars(Type* var, var, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { return a * a; }); - Type ratio = Type(1) / ((sample && rowMajor) ? Type(N - 1) : Type(N)); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); raft::linalg::binaryOp( var, var, diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu index dbd8549049..c5fe83d95b 100644 --- a/cpp/test/stats/mean.cu +++ b/cpp/test/stats/mean.cu @@ -117,7 +117,7 @@ const std::vector> inputsf = { {0.15f, -1.f, 1030, 1, false, false, 1234ULL}, {0.15f, -1.f, 1030, 60, true, false, 1234ULL}, {2.0f, -1.f, 31, 120, false, false, 1234ULL}, - {2.0f, -1.f, 1, 130, true, false, 1234ULL}, + {2.0f, -1.f, 1, 130, false, false, 1234ULL}, {0.15f, -1.f, 1030, 1, false, true, 1234ULL}, {0.15f, -1.f, 1030, 60, true, true, 1234ULL}, {2.0f, -1.f, 31, 120, false, true, 1234ULL}, @@ -151,7 +151,7 @@ const std::vector> inputsd = { {0.15, -1.0, 1030, 1, false, false, 1234ULL}, {0.15, -1.0, 1030, 60, true, false, 1234ULL}, {2.0, -1.0, 31, 120, false, false, 1234ULL}, - {2.0, -1.0, 1, 130, true, false, 1234ULL}, + {2.0, -1.0, 1, 130, false, false, 1234ULL}, {0.15, -1.0, 1030, 1, false, true, 1234ULL}, {0.15, -1.0, 1030, 60, true, true, 1234ULL}, {2.0, -1.0, 31, 120, false, true, 1234ULL}, diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index 1459c8fe14..f0fae195e1 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -148,7 +148,7 @@ const std::vector> inputsf = { {0.5f, -1.f, 2.f, 31, 1, true, true, 1234ULL}, {1.f, -1.f, 2.f, 1, 257, false, true, 1234ULL}, {0.5f, -1.f, 2.f, 31, 1, false, false, 1234ULL}, - {1.f, -1.f, 2.f, 1, 257, true, false, 1234ULL}, + {1.f, -1.f, 2.f, 1, 257, false, false, 1234ULL}, {1.f, -1.f, 2.f, 1, 1, false, false, 1234ULL}, {1.f, -1.f, 2.f, 7, 23, false, false, 1234ULL}, {1.f, -1.f, 2.f, 17, 5, false, false, 1234ULL}, @@ -180,7 +180,7 @@ const std::vector> inputsd = { {0.5, -1.0, 2.0, 31, 1, true, true, 1234ULL}, {1.0, -1.0, 2.0, 1, 257, false, true, 1234ULL}, {0.5, -1.0, 2.0, 31, 1, false, false, 1234ULL}, - {1.0, -1.0, 2.0, 1, 257, true, false, 1234ULL}, + {1.0, -1.0, 2.0, 1, 257, false, false, 1234ULL}, {1.0, -1.0, 2.0, 1, 1, false, false, 1234ULL}, {1.0, -1.0, 2.0, 7, 23, false, false, 1234ULL}, {1.0, -1.0, 2.0, 17, 5, false, false, 1234ULL}, From 0461fd3de1efc65c81b5beeb52e58feeddce0caa Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Thu, 18 Jul 2024 15:42:21 +0000 Subject: [PATCH 08/10] fix covariance should always use mean with sample=false --- cpp/test/stats/cov.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu index df5177056d..602f356b9f 100644 --- a/cpp/test/stats/cov.cu +++ b/cpp/test/stats/cov.cu @@ -72,8 +72,7 @@ class CovTest : public ::testing::TestWithParam> { cov_act.resize(cols * cols, stream); normal(handle, r, data.data(), len, 
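    /* cov() applies its own normalization (1/(N-1) when sample, else 1/N), so
       the mean handed to it must always be the plain arithmetic mean -- hence
       the hard-coded `false` in the calls below */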
params.mean, var); - raft::stats::mean( - mean_act.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream); + raft::stats::mean(mean_act.data(), data.data(), cols, rows, false, params.rowMajor, stream); if (params.rowMajor) { using layout = raft::row_major; cov(handle, @@ -103,7 +102,7 @@ class CovTest : public ::testing::TestWithParam> { raft::update_device(data_cm.data(), data_h, 6, stream); raft::update_device(cov_cm_ref.data(), cov_cm_ref_h, 4, stream); - raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, true, false, stream); + raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, false, false, stream); cov(handle, cov_cm.data(), data_cm.data(), mean_cm.data(), 2, 3, true, false, true, stream); } From f2a76be246f15e75ece0ae74e1e805c7d41d4a77 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 24 Jul 2024 11:19:25 +0000 Subject: [PATCH 09/10] fix stddev and var for sample=true --- cpp/include/raft/stats/detail/stddev.cuh | 42 +++++++++++++----------- cpp/test/stats/stddev.cu | 4 +-- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh index df70ace617..4c861b49fb 100644 --- a/cpp/include/raft/stats/detail/stddev.cuh +++ b/cpp/include/raft/stats/detail/stddev.cuh @@ -59,16 +59,18 @@ void stddev(Type* std, std, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { return a * a; }); - Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - std, - std, - mu, - D, - raft::compose_op(raft::sqrt_op(), - raft::abs_op(), - [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }), - stream); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); + Type ratio_mean = sample ? ratio * Type(N) : Type(1); + raft::linalg::binaryOp(std, + std, + mu, + D, + raft::compose_op(raft::sqrt_op(), + raft::abs_op(), + [ratio, ratio_mean] __device__(Type a, Type b) { + return a * ratio - b * b * ratio_mean; + }), + stream); } /** @@ -103,15 +105,17 @@ void vars(Type* var, var, data, D, N, Type(0), rowMajor, false, stream, false, [mu] __device__(Type a, IdxType i) { return a * a; }); - Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); - raft::linalg::binaryOp( - var, - var, - mu, - D, - raft::compose_op(raft::abs_op(), - [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }), - stream); + Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); + Type ratio_mean = sample ? 
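+  /* derivation (with mu = sum(x) / N): the unbiased sample estimator is
+       var = (sum(x^2) - N * mu^2) / (N - 1)
+           = sum(x^2) * ratio - mu^2 * (ratio * N)
+     so ratio = 1 / (N - 1) and ratio_mean = N / (N - 1); with sample == false
+     both factors reduce to 1 / N and 1, matching the binaryOp lambda below */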
ratio * Type(N) : Type(1); + raft::linalg::binaryOp(var, + var, + mu, + D, + raft::compose_op(raft::abs_op(), + [ratio, ratio_mean] __device__(Type a, Type b) { + return a * ratio - b * b * ratio_mean; + }), + stream); } } // namespace detail diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu index f0fae195e1..f4c5f92f49 100644 --- a/cpp/test/stats/stddev.cu +++ b/cpp/test/stats/stddev.cu @@ -82,7 +82,7 @@ class StdDevTest : public ::testing::TestWithParam> { mean(handle, raft::make_device_matrix_view(data, rows, cols), raft::make_device_vector_view(mean_act.data(), cols), - params.sample); + false); stddev(handle, raft::make_device_matrix_view(data, rows, cols), @@ -100,7 +100,7 @@ class StdDevTest : public ::testing::TestWithParam> { mean(handle, raft::make_device_matrix_view(data, rows, cols), raft::make_device_vector_view(mean_act.data(), cols), - params.sample); + false); stddev(handle, raft::make_device_matrix_view(data, rows, cols), From 876b9ead11eb450a95e1e16ab3a475a6bf152c34 Mon Sep 17 00:00:00 2001 From: Malte Foerster Date: Wed, 24 Jul 2024 12:05:38 +0000 Subject: [PATCH 10/10] remove sample paramemter from mean API --- cpp/include/raft/stats/detail/mean.cuh | 5 +- cpp/include/raft/stats/detail/scores.cuh | 2 +- cpp/include/raft/stats/mean.cuh | 16 +-- cpp/test/neighbors/ann_ivf_flat.cuh | 2 +- cpp/test/random/rng.cu | 3 +- cpp/test/stats/cov.cu | 4 +- cpp/test/stats/mean.cu | 121 ++++++++---------- cpp/test/stats/mean_center.cu | 149 ++++++++--------------- cpp/test/stats/stddev.cu | 6 +- 9 files changed, 112 insertions(+), 196 deletions(-) diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh index ee39c87a68..d1de474e8d 100644 --- a/cpp/include/raft/stats/detail/mean.cuh +++ b/cpp/include/raft/stats/detail/mean.cuh @@ -27,10 +27,9 @@ namespace stats { namespace detail { template -void mean( - Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream) +void mean(Type* mu, const Type* data, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream) { - Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N)); + Type ratio = Type(1) / Type(N); raft::linalg::reduce(mu, data, D, diff --git a/cpp/include/raft/stats/detail/scores.cuh b/cpp/include/raft/stats/detail/scores.cuh index 947df6848a..66951f52ab 100644 --- a/cpp/include/raft/stats/detail/scores.cuh +++ b/cpp/include/raft/stats/detail/scores.cuh @@ -59,7 +59,7 @@ math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream) { rmm::device_scalar y_bar(stream); - raft::stats::mean(y_bar.data(), y, 1, n, false, false, stream); + raft::stats::mean(y_bar.data(), y, 1, n, false, stream); RAFT_CUDA_TRY(cudaPeekAtLastError()); rmm::device_uvector sse_arr(n, stream); diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh index 43d39cfd6c..3147216211 100644 --- a/cpp/include/raft/stats/mean.cuh +++ b/cpp/include/raft/stats/mean.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,17 +38,13 @@ namespace stats { * @param data: the input matrix * @param D: number of columns of data * @param N: number of rows of data - * @param sample: whether to evaluate sample mean or not. 
From 876b9ead11eb450a95e1e16ab3a475a6bf152c34 Mon Sep 17 00:00:00 2001
From: Malte Foerster
Date: Wed, 24 Jul 2024 12:05:38 +0000
Subject: [PATCH 10/10] remove sample parameter from mean API

---
 cpp/include/raft/stats/detail/mean.cuh   |   5 +-
 cpp/include/raft/stats/detail/scores.cuh |   2 +-
 cpp/include/raft/stats/mean.cuh          |  16 +--
 cpp/test/neighbors/ann_ivf_flat.cuh      |   2 +-
 cpp/test/random/rng.cu                   |   3 +-
 cpp/test/stats/cov.cu                    |   4 +-
 cpp/test/stats/mean.cu                   | 121 ++++++++----------
 cpp/test/stats/mean_center.cu            | 149 ++++++++---------------
 cpp/test/stats/stddev.cu                 |   6 +-
 9 files changed, 112 insertions(+), 196 deletions(-)

diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh
index ee39c87a68..d1de474e8d 100644
--- a/cpp/include/raft/stats/detail/mean.cuh
+++ b/cpp/include/raft/stats/detail/mean.cuh
@@ -27,10 +27,9 @@ namespace stats {
 namespace detail {

 template <typename Type, typename IdxType = int>
-void mean(
-  Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
+void mean(Type* mu, const Type* data, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
 {
-  Type ratio = Type(1) / ((sample) ? Type(N - 1) : Type(N));
+  Type ratio = Type(1) / Type(N);
   raft::linalg::reduce(mu,
                        data,
                        D,
diff --git a/cpp/include/raft/stats/detail/scores.cuh b/cpp/include/raft/stats/detail/scores.cuh
index 947df6848a..66951f52ab 100644
--- a/cpp/include/raft/stats/detail/scores.cuh
+++ b/cpp/include/raft/stats/detail/scores.cuh
@@ -59,7 +59,7 @@ math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
 {
   rmm::device_scalar<math_t> y_bar(stream);

-  raft::stats::mean(y_bar.data(), y, 1, n, false, false, stream);
+  raft::stats::mean(y_bar.data(), y, 1, n, false, stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());

   rmm::device_uvector<math_t> sse_arr(n, stream);
diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh
index 43d39cfd6c..3147216211 100644
--- a/cpp/include/raft/stats/mean.cuh
+++ b/cpp/include/raft/stats/mean.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,17 +38,13 @@ namespace stats {
  * @param data: the input matrix
  * @param D: number of columns of data
  * @param N: number of rows of data
- * @param sample: whether to evaluate sample mean or not. In other words,
- * whether
- * to normalize the output using N-1 or N, for true or false, respectively
  * @param rowMajor: whether the input data is row or col major
  * @param stream: cuda stream
  */
 template <typename Type, typename IdxType = int>
-void mean(
-  Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
+void mean(Type* mu, const Type* data, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
 {
-  detail::mean(mu, data, D, N, sample, rowMajor, stream);
+  detail::mean(mu, data, D, N, rowMajor, stream);
 }

 /**
@@ -67,14 +63,11 @@ void mean(
  * @param[in] handle the raft handle
  * @param[in] data: the input matrix
  * @param[out] mu: the output mean vector
- * @param[in] sample: whether to evaluate sample mean or not. In other words, whether
- * to normalize the output using N-1 or N, for true or false, respectively
  */
 template <typename value_t, typename idx_t, typename layout_t>
 void mean(raft::resources const& handle,
           raft::device_matrix_view<const value_t, idx_t, layout_t> data,
-          raft::device_vector_view<value_t, idx_t> mu,
-          bool sample)
+          raft::device_vector_view<value_t, idx_t> mu)
 {
   static_assert(
     std::is_same_v<layout_t, raft::row_major> || std::is_same_v<layout_t, raft::col_major>,
@@ -86,7 +79,6 @@ void mean(raft::resources const& handle,
     data.data_handle(),
     data.extent(1),
     data.extent(0),
-    sample,
     std::is_same_v<layout_t, raft::row_major>,
     resource::get_cuda_stream(handle));
 }
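With the flag gone, callers can no longer request an N-1-normalized "sample mean": both overloads always divide by N, and the N/(N-1) correction for sample statistics now lives entirely inside stddev/vars (previous patch). A minimal call-site sketch against the updated mdspan overload; the helper name and the float/int instantiation are illustrative only, not part of the patch:

    #include <raft/core/device_mdspan.hpp>
    #include <raft/core/resources.hpp>
    #include <raft/stats/mean.cuh>

    // Column-wise means of a row-major device matrix under the new API:
    // no `sample` argument, the result is always sum(x_i) / N per column.
    void column_means(raft::resources const& handle,
                      raft::device_matrix_view<const float, int, raft::row_major> data,
                      raft::device_vector_view<float, int> mu)
    {
      raft::stats::mean(handle, data, mu);

      // Equivalent pointer-based overload (D = columns, N = rows, rowMajor):
      // raft::stats::mean(mu.data_handle(), data.data_handle(),
      //                   data.extent(1), data.extent(0), true,
      //                   raft::resource::get_cuda_stream(handle));
    }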
diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh
index de6af589fa..314a0efae1 100644
--- a/cpp/test/neighbors/ann_ivf_flat.cuh
+++ b/cpp/test/neighbors/ann_ivf_flat.cuh
@@ -288,7 +288,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> {
                                  (IdxT)ps.dim,
                                  stream_);
         raft::stats::mean(
-          centroid.data(), cluster_data.data(), ps.dim, list_sizes[l], false, true, stream_);
+          centroid.data(), cluster_data.data(), ps.dim, list_sizes[l], true, stream_);
         ASSERT_TRUE(raft::devArrMatch(index_2.centers().data_handle() + ps.dim * l,
                                       centroid.data(),
                                       ps.dim,
diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu
index a37f150d4c..172f94ae50 100644
--- a/cpp/test/random/rng.cu
+++ b/cpp/test/random/rng.cu
@@ -407,8 +407,7 @@ TEST(Rng, MeanError)
     RngState r(seed, rtype);
     normal(handle, r, data.data(), len, 3.3f, 0.23f);
     // uniform(r, data, len, -1.0, 2.0);
-    raft::stats::mean(
-      mean_result.data(), data.data(), num_samples, num_experiments, false, false, stream);
+    raft::stats::mean(mean_result.data(), data.data(), num_samples, num_experiments, false, stream);
     raft::stats::stddev(std_result.data(),
                         data.data(),
                         mean_result.data(),
diff --git a/cpp/test/stats/cov.cu b/cpp/test/stats/cov.cu
index 602f356b9f..3f2a3dcebf 100644
--- a/cpp/test/stats/cov.cu
+++ b/cpp/test/stats/cov.cu
@@ -72,7 +72,7 @@ class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
     cov_act.resize(cols * cols, stream);
     normal(handle, r, data.data(), len,
            params.mean, var);
-    raft::stats::mean(mean_act.data(), data.data(), cols, rows, false, params.rowMajor, stream);
+    raft::stats::mean(mean_act.data(), data.data(), cols, rows, params.rowMajor, stream);
     if (params.rowMajor) {
       using layout = raft::row_major;
       cov(handle,
@@ -102,7 +102,7 @@ class CovTest : public ::testing::TestWithParam<CovInputs<T>> {
     raft::update_device(data_cm.data(), data_h, 6, stream);
     raft::update_device(cov_cm_ref.data(), cov_cm_ref_h, 4, stream);

-    raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, false, false, stream);
+    raft::stats::mean(mean_cm.data(), data_cm.data(), 2, 3, false, stream);
     cov(handle, cov_cm.data(), data_cm.data(), mean_cm.data(), 2, 3, true, false, true, stream);
   }
diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu
index c5fe83d95b..e72d4eaf74 100644
--- a/cpp/test/stats/mean.cu
+++ b/cpp/test/stats/mean.cu
@@ -33,7 +33,7 @@ template <typename T>
 struct MeanInputs {
   T tolerance, mean;
   int rows, cols;
-  bool sample, rowMajor;
+  bool rowMajor;
   unsigned long long int seed;
   T stddev = (T)1.0;
 };
@@ -42,7 +42,7 @@ template <typename T>
 ::std::ostream& operator<<(::std::ostream& os, const MeanInputs<T>& dims)
 {
   return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", "
-            << dims.sample << ", " << dims.rowMajor << ", " << dims.stddev << "}" << std::endl;
+            << dims.rowMajor << ", " << dims.stddev << "}" << std::endl;
 }
@@ -74,14 +74,12 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
       using layout = raft::row_major;
       mean(handle,
            raft::make_device_matrix_view(data, rows, cols),
-           raft::make_device_vector_view(mean_act.data(), cols),
-           params.sample);
+           raft::make_device_vector_view(mean_act.data(), cols));
     } else {
       using layout = raft::col_major;
       mean(handle,
            raft::make_device_matrix_view(data, rows, cols),
-           raft::make_device_vector_view(mean_act.data(), cols),
-           params.sample);
+           raft::make_device_vector_view(mean_act.data(), cols));
     }
   }
@@ -98,72 +96,51 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
 // measured mean (of a normal distribution) will fall outside of an epsilon of
 // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times)
 const std::vector<MeanInputs<float>> inputsf = {
-  {0.15f, 1.f, 1024, 32, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 128, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
-  {0.15f, -1.f, 1024, 32, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 128, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
-  {0.15f, 1.f, 1024, 32, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 128, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
-  {0.15f, -1.f, 1024, 32, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 128, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 256, false, true, 1234ULL},
-  {0.15f, -1.f, 1030, 1, false, false, 1234ULL},
-  {0.15f, -1.f, 1030, 60, true, false, 1234ULL},
-  {2.0f, -1.f, 31, 120, false, false, 1234ULL},
-  {2.0f, -1.f, 1, 130, false, false, 1234ULL},
-  {0.15f, -1.f, 1030, 1, false, true, 1234ULL},
-  {0.15f, -1.f, 1030, 60, true, true, 1234ULL},
-  {2.0f, -1.f, 31, 120, false, true, 1234ULL},
-  {2.0f, -1.f, 1, 130, false, true, 1234ULL},
-  {2.0f, -1.f, 1, 1, false, false, 1234ULL},
-  {2.0f, -1.f, 1, 1, false, true, 1234ULL},
-  {2.0f, -1.f, 7, 23, false, false, 1234ULL},
-  {2.0f, -1.f, 7, 23, false, true, 1234ULL},
-  {2.0f, -1.f, 17, 5, false, false, 1234ULL},
-  {2.0f, -1.f, 17, 5, false, true, 1234ULL},
-  {0.0001f, 0.1f, 1 << 27, 2, false, false, 1234ULL, 0.0001f},
-  {0.0001f, 0.1f, 1 << 27, 2, false, true, 1234ULL, 0.0001f}};
-
-const std::vector<MeanInputs<double>> inputsd = {
-  {0.15, 1.0, 1024, 32, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 64, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 128, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 256, true, false, 1234ULL},
-  {0.15, -1.0, 1024, 32, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 64, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 128, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 256, false, false, 1234ULL},
-  {0.15, 1.0, 1024, 32, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 64, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 128, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 256, true, true, 1234ULL},
-  {0.15, -1.0, 1024, 32, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 64, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 128, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 256, false, true, 1234ULL},
-  {0.15, -1.0, 1030, 1, false, false, 1234ULL},
-  {0.15, -1.0, 1030, 60, true, false, 1234ULL},
-  {2.0, -1.0, 31, 120, false, false, 1234ULL},
-  {2.0, -1.0, 1, 130, false, false, 1234ULL},
-  {0.15, -1.0, 1030, 1, false, true, 1234ULL},
-  {0.15, -1.0, 1030, 60, true, true, 1234ULL},
-  {2.0, -1.0, 31, 120, false, true, 1234ULL},
-  {2.0, -1.0, 1, 130, false, true, 1234ULL},
-  {2.0, -1.0, 1, 1, false, false, 1234ULL},
-  {2.0, -1.0, 1, 1, false, true, 1234ULL},
-  {2.0, -1.0, 7, 23, false, false, 1234ULL},
-  {2.0, -1.0, 7, 23, false, true, 1234ULL},
-  {2.0, -1.0, 17, 5, false, false, 1234ULL},
-  {2.0, -1.0, 17, 5, false, true, 1234ULL},
-  {1e-8, 1e-1, 1 << 27, 2, false, false, 1234ULL, 0.0001},
-  {1e-8, 1e-1, 1 << 27, 2, false, true, 1234ULL, 0.0001}};
+  {0.15f, -1.f, 1024, 32, false, 1234ULL},
+  {0.15f, -1.f, 1024, 64, false, 1234ULL},
+  {0.15f, -1.f, 1024, 128, false, 1234ULL},
+  {0.15f, -1.f, 1024, 256, false, 1234ULL},
+  {0.15f, -1.f, 1024, 32, true, 1234ULL},
+  {0.15f, -1.f, 1024, 64, true, 1234ULL},
+  {0.15f, -1.f, 1024, 128, true, 1234ULL},
+  {0.15f, -1.f, 1024, 256, true, 1234ULL},
+  {0.15f, -1.f, 1030, 1, false, 1234ULL},
+  {2.0f, -1.f, 31, 120, false, 1234ULL},
+  {2.0f, -1.f, 1, 130, false, 1234ULL},
+  {0.15f, -1.f, 1030, 1, true, 1234ULL},
+  {2.0f, -1.f, 31, 120, true, 1234ULL},
+  {2.0f, -1.f, 1, 130, true, 1234ULL},
+  {2.0f, -1.f, 1, 1, false, 1234ULL},
+  {2.0f, -1.f, 1, 1, true, 1234ULL},
+  {2.0f, -1.f, 7, 23, false, 1234ULL},
+  {2.0f, -1.f, 7, 23, true, 1234ULL},
+  {2.0f, -1.f, 17, 5, false, 1234ULL},
+  {2.0f, -1.f, 17, 5, true, 1234ULL},
+  {0.0001f, 0.1f, 1 << 27, 2, false, 1234ULL, 0.0001f},
+  {0.0001f, 0.1f, 1 << 27, 2, true, 1234ULL, 0.0001f}};
+
+const std::vector<MeanInputs<double>> inputsd = {{0.15, -1.0, 1024, 32, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 64, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 128, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 256, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 32, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 64, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 128, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 256, true, 1234ULL},
+                                                 {0.15, -1.0, 1030, 1, false, 1234ULL},
+                                                 {2.0, -1.0, 31, 120, false, 1234ULL},
+                                                 {2.0, -1.0, 1, 130, false, 1234ULL},
+                                                 {0.15, -1.0, 1030, 1, true, 1234ULL},
+                                                 {2.0, -1.0, 31, 120, true, 1234ULL},
+                                                 {2.0, -1.0, 1, 130, true, 1234ULL},
+                                                 {2.0, -1.0, 1, 1, false, 1234ULL},
+                                                 {2.0, -1.0, 1, 1, true, 1234ULL},
+                                                 {2.0, -1.0, 7, 23, false, 1234ULL},
+                                                 {2.0, -1.0, 7, 23, true, 1234ULL},
+                                                 {2.0, -1.0, 17, 5, false, 1234ULL},
+                                                 {2.0, -1.0, 17, 5, true, 1234ULL},
+                                                 {1e-8, 1e-1, 1 << 27, 2, false, 1234ULL, 0.0001},
+                                                 {1e-8, 1e-1, 1 << 27, 2, true, 1234ULL, 0.0001}};

 typedef MeanTest<float> MeanTestF;
 TEST_P(MeanTestF, Result)
diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu
index b44d87d1bd..48bf50056c 100644
--- a/cpp/test/stats/mean_center.cu
+++ b/cpp/test/stats/mean_center.cu
@@ -32,7 +32,7 @@ template <typename T, typename IdxType>
 struct MeanCenterInputs {
   T tolerance, mean;
   IdxType rows, cols;
-  bool sample, rowMajor, bcastAlongRows;
+  bool rowMajor, bcastAlongRows;
   unsigned long long int seed;
 };
@@ -64,8 +64,7 @@ class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxType>> {
     raft::stats::mean(
-      meanVec.data(), data.data(), params.cols, params.rows, params.sample, params.rowMajor, stream);
+      meanVec.data(), data.data(), params.cols, params.rows, params.rowMajor, stream);
@@ -104,30 +103,18 @@ class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxType>> {
 const std::vector<MeanCenterInputs<float, int>> inputsf_i32 = {
-  {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
+  {0.05f, -1.f, 1024, 32, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, false, 1234ULL}};

 typedef MeanCenterTest<float, int> MeanCenterTestF_i32;
 TEST_P(MeanCenterTestF_i32, Result)
 {
@@ -136,30 +123,18 @@ TEST_P(MeanCenterTestF_i32, Result)
 INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32));

 const std::vector<MeanCenterInputs<float, int64_t>> inputsf_i64 = {
-  {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, true, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, true, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, false, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, false, false, 1234ULL},
-  {0.05f, 1.f, 1024, 32, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 64, true, true, false, 1234ULL},
-  {0.05f, 1.f, 1024, 128, true, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 32, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
-  {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
+  {0.05f, -1.f, 1024, 32, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, true, 1234ULL},
+  {0.05f, -1.f, 1024, 32, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, false, false, 1234ULL},
+  {0.05f, -1.f, 1024, 32, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 64, true, false, 1234ULL},
+  {0.05f, -1.f, 1024, 128, true, false, 1234ULL}};

 typedef MeanCenterTest<float, int64_t> MeanCenterTestF_i64;
 TEST_P(MeanCenterTestF_i64, Result)
 {
@@ -169,30 +144,18 @@ TEST_P(MeanCenterTestF_i64, Result)
 INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64));

 const std::vector<MeanCenterInputs<double, int>> inputsd_i32 = {
-  {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
+  {0.05, -1.0, 1024, 32, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, false, 1234ULL}};

 typedef MeanCenterTest<double, int> MeanCenterTestD_i32;
 TEST_P(MeanCenterTestD_i32, Result)
 {
@@ -202,30 +165,18 @@ TEST_P(MeanCenterTestD_i32, Result)
 INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32));

 const std::vector<MeanCenterInputs<double, int64_t>> inputsd_i64 = {
-  {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, true, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, true, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, false, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, false, false, 1234ULL},
-  {0.05, 1.0, 1024, 32, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 64, true, true, false, 1234ULL},
-  {0.05, 1.0, 1024, 128, true, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 32, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
-  {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
+  {0.05, -1.0, 1024, 32, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, true, 1234ULL},
+  {0.05, -1.0, 1024, 32, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, false, false, 1234ULL},
+  {0.05, -1.0, 1024, 32, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 64, true, false, 1234ULL},
+  {0.05, -1.0, 1024, 128, true, false, 1234ULL}};

 typedef MeanCenterTest<double, int64_t> MeanCenterTestD_i64;
 TEST_P(MeanCenterTestD_i64, Result)
 {
diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu
index f4c5f92f49..a9a70b1e60 100644
--- a/cpp/test/stats/stddev.cu
+++ b/cpp/test/stats/stddev.cu
@@ -81,8 +81,7 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
       using layout_t = raft::row_major;
       mean(handle,
            raft::make_device_matrix_view(data, rows, cols),
-           raft::make_device_vector_view(mean_act.data(), cols),
-           false);
+           raft::make_device_vector_view(mean_act.data(), cols));

       stddev(handle,
              raft::make_device_matrix_view(data, rows, cols),
@@ -99,8 +98,7 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
       using layout_t = raft::col_major;
       mean(handle,
            raft::make_device_matrix_view(data, rows, cols),
-           raft::make_device_vector_view(mean_act.data(), cols),
-           false);
+           raft::make_device_vector_view(mean_act.data(), cols));

       stddev(handle,
              raft::make_device_matrix_view(data, rows, cols),
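As a closing cross-check of the arithmetic introduced in PATCH 09/10 and relied on by PATCH 10/10, here is a self-contained host-side C++ snippet (illustrative only, independent of RAFT; all names are made up) comparing the one-pass expression ratio * sum(x^2) - ratio_mean * mu^2 against the textbook two-pass sample variance:

    #include <cassert>
    #include <cmath>
    #include <vector>

    int main()
    {
      const std::vector<double> x = {1.0, 2.0, 4.0, 8.0};
      const double N = static_cast<double>(x.size());

      double sum = 0.0, sum_sq = 0.0;
      for (double v : x) {
        sum += v;
        sum_sq += v * v;
      }
      const double mu         = sum / N;          // mean always normalized by N
      const double ratio      = 1.0 / (N - 1.0);  // sample == true
      const double ratio_mean = ratio * N;        // N / (N - 1)
      const double one_pass   = sum_sq * ratio - mu * mu * ratio_mean;

      // Two-pass reference: sum of squared deviations over N - 1.
      double two_pass = 0.0;
      for (double v : x) {
        two_pass += (v - mu) * (v - mu);
      }
      two_pass /= (N - 1.0);

      assert(std::abs(one_pass - two_pass) < 1e-12);
      return 0;
    }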