Skip to content

Commit

Permalink
fixup! Add polynomial/div_by_x_minus_z.cuh.
Browse files: browse the repository at this point in the history
  • Loading branch information
dot-asm committed Sep 18, 2024
1 parent 37ddaa0 commit 64479c4
Showing 1 changed file with 10 additions and 5 deletions.
15 changes: 10 additions & 5 deletions polynomial/div_by_x_minus_z.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <cooperative_groups.h>
#include <ff/shfl.cuh>

template<class fr_t> __global__
template<class fr_t, int BSZ> __global__ __launch_bounds__(BSZ)
void d_div_by_x_minus_z(fr_t d_inout[], size_t len, fr_t z)
{
struct my {
Expand Down Expand Up @@ -335,11 +335,16 @@ template<class fr_t, class stream_t>
void div_by_x_minus_z(fr_t d_inout[], size_t len, const fr_t& z,
const stream_t& s)
{
cudaFuncAttributes attr;
CUDA_OK(cudaFuncGetAttributes(&attr, d_div_by_x_minus_z<fr_t>));
constexpr int BSZ = sizeof(fr_t) <= 16 ? 1024 : 0;

int gridDim = s.sm_count();
int blockDim = attr.maxThreadsPerBlock;
int blockDim = BSZ;

if (BSZ == 0) {
cudaFuncAttributes attr;
CUDA_OK(cudaFuncGetAttributes(&attr, d_div_by_x_minus_z<fr_t, BSZ>));
blockDim = attr.maxThreadsPerBlock;
}

if (gridDim > blockDim) // there are no such large GPUs, not for now...
gridDim = blockDim;
Expand All @@ -355,7 +360,7 @@ void div_by_x_minus_z(fr_t d_inout[], size_t len, const fr_t& z,
size_t sharedSz = sizeof(fr_t) * max(blockDim/WARP_SZ, gridDim);
sharedSz += sizeof(fr_t) * WARP_SZ;

s.launch_coop(d_div_by_x_minus_z<fr_t>, {gridDim, blockDim, sharedSz},
s.launch_coop(d_div_by_x_minus_z<fr_t, BSZ>, {gridDim, blockDim, sharedSz},
d_inout, len, z);
}
#endif

0 comments on commit 64479c4

Please sign in to comment.