From 8f08c8609ece871968b3f7f41cd0138a370a967a Mon Sep 17 00:00:00 2001 From: wunhuang Date: Fri, 29 May 2020 12:05:02 +0000 Subject: [PATCH] [HIP] Optimized the spread kernel --- src/gromacs/ewald/pme_calculate_splines.hip.h | 20 ++++++++++++------- src/gromacs/ewald/pme_gather.hip.cpp | 2 +- src/gromacs/ewald/pme_spread.hip.cpp | 8 ++++---- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/gromacs/ewald/pme_calculate_splines.hip.h b/src/gromacs/ewald/pme_calculate_splines.hip.h index 9bdd61ceeb..890ad14795 100644 --- a/src/gromacs/ewald/pme_calculate_splines.hip.h +++ b/src/gromacs/ewald/pme_calculate_splines.hip.h @@ -107,7 +107,7 @@ __device__ __forceinline__ void pme_gpu_stage_atom_data(const PmeGpuHipKernelPar * \param[out] sm_gridlineIndices Atom gridline indices in the shared memory. */ -template +template __device__ __forceinline__ void calculate_splines(const PmeGpuHipKernelParams kernelParams, const int atomIndexOffset, const float3 atomX, @@ -137,17 +137,17 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuHipKernelParams ke /* Atom index w.r.t. global memory */ const int atomIndexGlobal = atomIndexOffset + atomIndexLocal; /* Spline contribution index in one dimension */ - const int threadLocalIdXY = (threadIdx.y * blockDim.x) + threadIdx.x; - const int orderIndex = threadLocalIdXY / DIM; + //const int threadLocalIdXY = (threadIdx.y * blockDim.x) + threadIdx.x; + const int orderIndex = threadIdx.y;//threadLocalIdXY / DIM; /* Dimension index */ - const int dimIndex = threadLocalIdXY % DIM; + const int dimIndex = threadIdx.x;//threadLocalIdXY % DIM; /* Multi-purpose index of rvec/ivec atom data */ const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex; float splineData[order]; - const int localCheck = (dimIndex < DIM) && (orderIndex < 1); + const int localCheck = dimIndex < DIM;//(dimIndex < DIM) && (orderIndex < 1); const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms); /* we have 4 threads per atom, but can only use 3 here for the dimensions */ @@ -253,8 +253,11 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuHipKernelParams ke if (writeSmDtheta || writeGlobal) { /* Differentiation and storing the spline derivatives (dtheta) */ + const int ithyMin = useOrderThreads ? 0 : orderIndex; + const int ithyMax = useOrderThreads ? order : orderIndex + 1; #pragma unroll - for (o = 0; o < order; o++) + for (int o = ithyMin; o < ithyMax; o++) + //for (o = 0; o < order; o++) { const int thetaIndex = getSplineParamIndex(thetaIndexBase, dimIndex, o); @@ -286,8 +289,11 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuHipKernelParams ke splineData[0] = div * (1.0f - dr) * splineData[0]; /* Storing the spline values (theta) */ + const int ithyMin = useOrderThreads ? 0 : orderIndex; + const int ithyMax = useOrderThreads ? order : orderIndex + 1; #pragma unroll - for (o = 0; o < order; o++) + for (int o = ithyMin; o < ithyMax; o++) + //for (o = 0; o < order; o++) { const int thetaIndex = getSplineParamIndex(thetaIndexBase, dimIndex, o); diff --git a/src/gromacs/ewald/pme_gather.hip.cpp b/src/gromacs/ewald/pme_gather.hip.cpp index df3797244f..f64d1962e3 100644 --- a/src/gromacs/ewald/pme_gather.hip.cpp +++ b/src/gromacs/ewald/pme_gather.hip.cpp @@ -361,7 +361,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY]; atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ]; } - calculate_splines( + calculate_splines( kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices); // __syncwarp(); __all(1); diff --git a/src/gromacs/ewald/pme_spread.hip.cpp b/src/gromacs/ewald/pme_spread.hip.cpp index 68a055dd2e..ca07781028 100644 --- a/src/gromacs/ewald/pme_spread.hip.cpp +++ b/src/gromacs/ewald/pme_spread.hip.cpp @@ -133,7 +133,7 @@ __device__ __forceinline__ void spread_charges(const PmeGpuHipKernelParams kerne const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy); float thetaY = sm_theta[splineIndexY]; const float Val = thetaZ * thetaY * (*atomCharge); - assert(isfinite(Val)); + //assert(isfinite(Val)); const int offset = iy * pnz + iz; #pragma unroll @@ -148,8 +148,8 @@ __device__ __forceinline__ void spread_charges(const PmeGpuHipKernelParams kerne const int splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx); const float thetaX = sm_theta[splineIndexX]; - assert(isfinite(thetaX)); - assert(isfinite(gm_grid[gridIndexGlobal])); + //assert(isfinite(thetaX)); + //assert(isfinite(gm_grid[gridIndexGlobal])); #if (HIP_VERSION_MAJOR >= 3) && (HIP_VERSION_MINOR > 3) atomicAddNoRet(gm_grid + gridIndexGlobal, thetaX * Val); #else @@ -254,7 +254,7 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY]; atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ]; } - calculate_splines( + calculate_splines( kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, &dtheta, sm_gridlineIndices); // __syncwarp(); __all(1);