Skip to content

Commit

Permalink
polynomial/div_by_x_minus_z.cuh: add |rotate| template parameter.
Browse files Browse the repository at this point in the history
  • Loading branch information
dot-asm committed Oct 14, 2024
1 parent cc89597 commit 86ad180
Showing 1 changed file with 30 additions and 15 deletions.
45 changes: 30 additions & 15 deletions polynomial/div_by_x_minus_z.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <cooperative_groups.h>
#include <ff/shfl.cuh>

template<class fr_t, int BSZ> __global__ __launch_bounds__(BSZ)
template<class fr_t, int BSZ, bool rotate> __global__ __launch_bounds__(BSZ)
void d_div_by_x_minus_z(fr_t d_inout[], size_t len, fr_t z)
{
struct my {
Expand Down Expand Up @@ -127,8 +127,10 @@ void d_div_by_x_minus_z(fr_t d_inout[], size_t len, fr_t z)
* cf ce * z^14
* cf * z^15
*
* The first element of the output is the remainder and
* the rest is the quotient.
* If |rotate| is false, the first element of the output is
* the remainder and the rest is the quotient. Otherwise
* the remainder is stored at the end and the quotiend is
* "shifted" toward the beginning of the |d_inout| vector.
*/
class rev_ptr_t {
fr_t* p;
Expand All @@ -138,15 +140,22 @@ void d_div_by_x_minus_z(fr_t d_inout[], size_t len, fr_t z)
__device__ const fr_t& operator[](size_t i) const { return *(p - i); }
};
rev_ptr_t inout{d_inout, len};
fr_t coeff, carry_over;
fr_t coeff, carry_over, prefetch;
uint32_t stride = blockDim.x*gridDim.x;
size_t idx;
auto __grid = cooperative_groups::this_grid();

for (size_t chunk = 0; chunk < len; chunk += blockDim.x*gridDim.x) {
size_t idx = chunk + tid;
if (tid < stride)
prefetch = inout[tid];

for (size_t chunk = 0; chunk < len; chunk += stride) {
idx = chunk + tid;

if (sizeof(fr_t) <= 32) {
if (idx < len)
coeff = inout[idx];
coeff = prefetch;

if (idx + stride < len)
prefetch = inout[idx + stride];

my::madd_up(coeff, z_pow = z);

Expand Down Expand Up @@ -218,8 +227,10 @@ void d_div_by_x_minus_z(fr_t d_inout[], size_t len, fr_t z)
} else { // ~14KB loop size with 256-bit field, yet unused...
fr_t acc, z_pow_adjust;

if (idx < len)
acc = inout[idx];
acc = prefetch;

if (idx + stride > len)
prefetch = inout[idx + stride];

z_pow = z;
uint32_t limit = WARP_SZ;
Expand Down Expand Up @@ -326,12 +337,15 @@ void d_div_by_x_minus_z(fr_t d_inout[], size_t len, fr_t z)
__syncthreads();
}

if (idx < len)
inout[idx] = coeff;
if (idx < len - rotate)
inout[idx + rotate] = coeff;
}

if (rotate && idx == len - 1)
inout[0] = coeff;
}

template<class fr_t, class stream_t>
template<class fr_t, bool rotate = false, class stream_t>
void div_by_x_minus_z(fr_t d_inout[], size_t len, const fr_t& z,
const stream_t& s)
{
Expand All @@ -342,7 +356,7 @@ void div_by_x_minus_z(fr_t d_inout[], size_t len, const fr_t& z,

if (BSZ == 0) {
cudaFuncAttributes attr;
CUDA_OK(cudaFuncGetAttributes(&attr, d_div_by_x_minus_z<fr_t, BSZ>));
CUDA_OK(cudaFuncGetAttributes(&attr, d_div_by_x_minus_z<fr_t, BSZ, rotate>));
blockDim = attr.maxThreadsPerBlock;
}

Expand All @@ -360,7 +374,8 @@ void div_by_x_minus_z(fr_t d_inout[], size_t len, const fr_t& z,
size_t sharedSz = sizeof(fr_t) * max(blockDim/WARP_SZ, gridDim);
sharedSz += sizeof(fr_t) * WARP_SZ;

s.launch_coop(d_div_by_x_minus_z<fr_t, BSZ>, {gridDim, blockDim, sharedSz},
s.launch_coop(d_div_by_x_minus_z<fr_t, BSZ, rotate>,
{gridDim, blockDim, sharedSz},
d_inout, len, z);
}
#endif

0 comments on commit 86ad180

Please sign in to comment.