From 8f08c8609ece871968b3f7f41cd0138a370a967a Mon Sep 17 00:00:00 2001
From: wunhuang <wunhuang@amd.com>
Date: Fri, 29 May 2020 12:05:02 +0000
Subject: [PATCH] [HIP] Optimized the spread kernel

---
 src/gromacs/ewald/pme_calculate_splines.hip.h | 20 ++++++++++++-------
 src/gromacs/ewald/pme_gather.hip.cpp          |  2 +-
 src/gromacs/ewald/pme_spread.hip.cpp          |  8 ++++----
 3 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/src/gromacs/ewald/pme_calculate_splines.hip.h b/src/gromacs/ewald/pme_calculate_splines.hip.h
index 9bdd61ceeb..890ad14795 100644
--- a/src/gromacs/ewald/pme_calculate_splines.hip.h
+++ b/src/gromacs/ewald/pme_calculate_splines.hip.h
@@ -107,7 +107,7 @@ __device__ __forceinline__ void pme_gpu_stage_atom_data(const PmeGpuHipKernelPar
  * \param[out] sm_gridlineIndices   Atom gridline indices in the shared memory.
  */
 
-template<const int order, const int atomsPerBlock, const int atomsPerWarp, const bool writeSmDtheta, const bool writeGlobal>
+template<const int order, const int atomsPerBlock, const int atomsPerWarp, const bool writeSmDtheta, const bool writeGlobal, const bool useOrderThreads>
 __device__ __forceinline__ void calculate_splines(const PmeGpuHipKernelParams kernelParams,
                                                   const int                    atomIndexOffset,
                                                   const float3                 atomX,
@@ -137,17 +137,17 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuHipKernelParams ke
     /* Atom index w.r.t. global memory */
     const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
     /* Spline contribution index in one dimension */
-    const int threadLocalIdXY = (threadIdx.y * blockDim.x) + threadIdx.x;
-    const int orderIndex      = threadLocalIdXY / DIM;
+    //const int threadLocalIdXY = (threadIdx.y * blockDim.x) + threadIdx.x;
+    const int orderIndex      = threadIdx.y;//threadLocalIdXY / DIM;
     /* Dimension index */
-    const int dimIndex = threadLocalIdXY % DIM;
+    const int dimIndex = threadIdx.x;//threadLocalIdXY % DIM;
 
     /* Multi-purpose index of rvec/ivec atom data */
     const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
 
     float splineData[order];
 
-    const int localCheck = (dimIndex < DIM) && (orderIndex < 1);
+    const int localCheck = dimIndex < DIM;//(dimIndex < DIM) && (orderIndex < 1);
     const int globalCheck = pme_gpu_check_atom_data_index(atomIndexGlobal, kernelParams.atoms.nAtoms);
 
     /* we have 4 threads per atom, but can only use 3 here for the dimensions */
@@ -253,8 +253,11 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuHipKernelParams ke
             if (writeSmDtheta || writeGlobal)
             {
                 /* Differentiation and storing the spline derivatives (dtheta) */
+                const int ithyMin = useOrderThreads ? 0 : orderIndex;
+                const int ithyMax = useOrderThreads ? order : orderIndex + 1;
 #pragma unroll
-                for (o = 0; o < order; o++)
+                for (int o = ithyMin; o < ithyMax; o++)
+                //for (o = 0; o < order; o++)
                 {
                     const int thetaIndex =
                             getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
@@ -286,8 +289,11 @@ __device__ __forceinline__ void calculate_splines(const PmeGpuHipKernelParams ke
             splineData[0] = div * (1.0f - dr) * splineData[0];
 
             /* Storing the spline values (theta) */
+            const int ithyMin = useOrderThreads ? 0 : orderIndex;
+            const int ithyMax = useOrderThreads ? order : orderIndex + 1;
 #pragma unroll
-            for (o = 0; o < order; o++)
+            for (int o = ithyMin; o < ithyMax; o++)
+            //for (o = 0; o < order; o++)
             {
                 const int thetaIndex =
                         getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
diff --git a/src/gromacs/ewald/pme_gather.hip.cpp b/src/gromacs/ewald/pme_gather.hip.cpp
index df3797244f..f64d1962e3 100644
--- a/src/gromacs/ewald/pme_gather.hip.cpp
+++ b/src/gromacs/ewald/pme_gather.hip.cpp
@@ -361,7 +361,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
             atomX.y    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
             atomX.z    = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
         }
-        calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(
+        calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false, useOrderThreads>(
                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
 //        __syncwarp();
         __all(1);
diff --git a/src/gromacs/ewald/pme_spread.hip.cpp b/src/gromacs/ewald/pme_spread.hip.cpp
index 68a055dd2e..ca07781028 100644
--- a/src/gromacs/ewald/pme_spread.hip.cpp
+++ b/src/gromacs/ewald/pme_spread.hip.cpp
@@ -133,7 +133,7 @@ __device__ __forceinline__ void spread_charges(const PmeGpuHipKernelParams kerne
             const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
             float       thetaY = sm_theta[splineIndexY];
             const float Val    = thetaZ * thetaY * (*atomCharge);
-            assert(isfinite(Val));
+            //assert(isfinite(Val));
             const int offset = iy * pnz + iz;
 
 #pragma unroll
@@ -148,8 +148,8 @@ __device__ __forceinline__ void spread_charges(const PmeGpuHipKernelParams kerne
                 const int splineIndexX =
                         getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
                 const float thetaX = sm_theta[splineIndexX];
-                assert(isfinite(thetaX));
-                assert(isfinite(gm_grid[gridIndexGlobal]));
+                //assert(isfinite(thetaX));
+                //assert(isfinite(gm_grid[gridIndexGlobal]));
 #if (HIP_VERSION_MAJOR >= 3) && (HIP_VERSION_MINOR > 3)
                 atomicAddNoRet(gm_grid + gridIndexGlobal, thetaX * Val);
 #else
@@ -254,7 +254,7 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU
             atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
             atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
         }
-        calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal>(
+        calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal, useOrderThreads>(
                 kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, &dtheta, sm_gridlineIndices);
 //        __syncwarp();
         __all(1);