diff --git a/ntt/ntt.cuh b/ntt/ntt.cuh
index 468f90d..d140314 100644
--- a/ntt/ntt.cuh
+++ b/ntt/ntt.cuh
@@ -47,7 +47,7 @@ protected:
         else
             // Those GPUs that can reserve 96KB of shared memory can
             // schedule 2 blocks to each SM...
-            bit_rev_permutation_z<<<...>>>
+            bit_rev_permutation_z<<<...>>>
                                  (d_out, d_inp, lg_domain_size);
@@ -71,8 +71,7 @@ private:
             LDE_distribute_powers<<<...>>>
                                  (inout, lg_dsz, lg_blowup, bitrev, gen_powers);
         else
-            LDE_distribute_powers<<<...>>>
+            LDE_distribute_powers<<<...>>>
                                  (inout, lg_dsz, lg_blowup, bitrev, gen_powers);

         CUDA_OK(cudaGetLastError());
@@ -174,10 +173,8 @@ protected:
         assert(lg_domain_size + lg_blowup <= MAX_LG_DOMAIN_SIZE);

         size_t domain_size = (size_t)1 << lg_domain_size;
-        const cudaDeviceProp& gpu_prop = gpu_props(stream.id());
-        // Determine the max power of 2 SM count
-        size_t kernel_sms = gpu_prop.multiProcessorCount;
+        size_t kernel_sms = stream.sm_count();
         while (kernel_sms & (kernel_sms - 1))
             kernel_sms -= (kernel_sms & (0 - kernel_sms));
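For context on the last hunk: the retained loop rounds the SM count down to the largest power of two not exceeding it, by repeatedly clearing the lowest set bit until a single bit remains. A minimal standalone sketch of that bit trick, assuming nothing beyond the loop shown above (the helper name and the sample SM counts are illustrative, not taken from the patch):

    #include <cstdio>
    #include <cstddef>

    // Illustrative helper (not part of ntt.cuh): round an SM count down to
    // the largest power of two that does not exceed it, using the same loop
    // as the patch: clear the lowest set bit until only one bit remains.
    static size_t pow2_floor(size_t kernel_sms)
    {
        while (kernel_sms & (kernel_sms - 1))              // more than one bit set?
            kernel_sms -= (kernel_sms & (0 - kernel_sms)); // drop the lowest set bit
        return kernel_sms;
    }

    int main()
    {
        // Hypothetical SM counts: 84 -> 64, 108 -> 64, 128 -> 128
        printf("%zu %zu %zu\n", pow2_floor(84), pow2_floor(108), pow2_floor(128));
        return 0;
    }

Rounding the grid size to a power of two this way keeps the per-block work even when the domain size (itself a power of two) is divided across blocks, regardless of whether the SM count is queried through gpu_props() or the new stream.sm_count() accessor.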