diff --git a/polynomial/div_by_x_minus_z.cuh b/polynomial/div_by_x_minus_z.cuh index 99e155d..90b2f0b 100644 --- a/polynomial/div_by_x_minus_z.cuh +++ b/polynomial/div_by_x_minus_z.cuh @@ -9,7 +9,7 @@ #include #include -template __global__ __launch_bounds__(BSZ) +template __global__ __launch_bounds__(BSZ) void d_div_by_x_minus_z(fr_t d_inout[], size_t len, fr_t z) { struct my { @@ -127,8 +127,10 @@ void d_div_by_x_minus_z(fr_t d_inout[], size_t len, fr_t z) * cf ce * z^14 * cf * z^15 * - * The first element of the output is the remainder and - * the rest is the quotient. + * If |rotate| is false, the first element of the output is + * the remainder and the rest is the quotient. Otherwise + * the remainder is stored at the end and the quotiend is + * "shifted" toward the beginning of the |d_inout| vector. */ class rev_ptr_t { fr_t* p; @@ -138,15 +140,22 @@ void d_div_by_x_minus_z(fr_t d_inout[], size_t len, fr_t z) __device__ const fr_t& operator[](size_t i) const { return *(p - i); } }; rev_ptr_t inout{d_inout, len}; - fr_t coeff, carry_over; + fr_t coeff, carry_over, prefetch; + uint32_t stride = blockDim.x*gridDim.x; + size_t idx; auto __grid = cooperative_groups::this_grid(); - for (size_t chunk = 0; chunk < len; chunk += blockDim.x*gridDim.x) { - size_t idx = chunk + tid; + if (tid < stride) + prefetch = inout[tid]; + + for (size_t chunk = 0; chunk < len; chunk += stride) { + idx = chunk + tid; if (sizeof(fr_t) <= 32) { - if (idx < len) - coeff = inout[idx]; + coeff = prefetch; + + if (idx + stride < len) + prefetch = inout[idx + stride]; my::madd_up(coeff, z_pow = z); @@ -218,8 +227,10 @@ void d_div_by_x_minus_z(fr_t d_inout[], size_t len, fr_t z) } else { // ~14KB loop size with 256-bit field, yet unused... fr_t acc, z_pow_adjust; - if (idx < len) - acc = inout[idx]; + acc = prefetch; + + if (idx + stride > len) + prefetch = inout[idx + stride]; z_pow = z; uint32_t limit = WARP_SZ; @@ -326,12 +337,15 @@ void d_div_by_x_minus_z(fr_t d_inout[], size_t len, fr_t z) __syncthreads(); } - if (idx < len) - inout[idx] = coeff; + if (idx < len - rotate) + inout[idx + rotate] = coeff; } + + if (rotate && idx == len - 1) + inout[0] = coeff; } -template +template void div_by_x_minus_z(fr_t d_inout[], size_t len, const fr_t& z, const stream_t& s) { @@ -342,7 +356,7 @@ void div_by_x_minus_z(fr_t d_inout[], size_t len, const fr_t& z, if (BSZ == 0) { cudaFuncAttributes attr; - CUDA_OK(cudaFuncGetAttributes(&attr, d_div_by_x_minus_z)); + CUDA_OK(cudaFuncGetAttributes(&attr, d_div_by_x_minus_z)); blockDim = attr.maxThreadsPerBlock; } @@ -360,7 +374,8 @@ void div_by_x_minus_z(fr_t d_inout[], size_t len, const fr_t& z, size_t sharedSz = sizeof(fr_t) * max(blockDim/WARP_SZ, gridDim); sharedSz += sizeof(fr_t) * WARP_SZ; - s.launch_coop(d_div_by_x_minus_z, {gridDim, blockDim, sharedSz}, + s.launch_coop(d_div_by_x_minus_z, + {gridDim, blockDim, sharedSz}, d_inout, len, z); } #endif