diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H index c981181c7f..becb102c4a 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H @@ -59,10 +59,8 @@ public: virtual amrex::Real BoundaryFactor() override final { return 1.; } private: - /** Spectral fields, contains (real) field in Fourier space */ - amrex::MultiFab m_tmpSpectralField; - /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */ - amrex::MultiFab m_eigenvalue_matrix; + /** FArrayBox eigenvalues, to solve Poisson equation with Dirichlet BC. */ + amrex::FArrayBox m_eigenvalue_matrix; /** Real array for the FFTs */ amrex::Gpu::DeviceVector m_position_array; /** Complex array for the FFTs */ @@ -73,6 +71,10 @@ private: AnyFFT m_y_fft; /** work area for both DST plans */ amrex::Gpu::DeviceVector m_fft_work_area; + /** x prefactor for ToSine */ + amrex::Gpu::DeviceVector m_sine_x_factor; + /** y prefactor for ToSine */ + amrex::Gpu::DeviceVector m_sine_y_factor; }; #endif diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp index 50f1547bca..fb79015b56 100644 --- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp +++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp @@ -26,41 +26,29 @@ FFTPoissonSolverDirichletFast::FFTPoissonSolverDirichletFast ( * in[-1] = 0; in[-2] = -in[0]; in[n_data] = 0; in[n_data+1] = -in[n_data-1] * * \param[in] in input real array - * \param[out] out output complex array + * \param[in] i x index to compute + * \param[in] j y index to compute + * \param[in] n_half highest index of the output array, equal to (n_data+1)/2 * \param[in] n_data number of (contiguous) rows in position matrix - * \param[in] n_batch number of (strided) columns in position matrix */ -void ToComplex (const amrex::Real* const AMREX_RESTRICT in, - amrex::GpuComplex* const AMREX_RESTRICT out, - const int n_data, const int n_batch) -{ - const int n_half = (n_data+1)/2; - if((n_data%2 == 1)) { - amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int stride_in = n_data*j; - const int i_is_zero = (i==0); - const int i_is_n_half = (i==n_half); - const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) - +in[2*i-2*i_is_n_half+stride_in]*(1-2*i_is_n_half); - const amrex::Real imag = in[2*i-1+i_is_zero-i_is_n_half+stride_in] - *!i_is_zero*!i_is_n_half; - out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); - }); +template AMREX_GPU_DEVICE AMREX_FORCE_INLINE +amrex::GpuComplex to_complex (T&& in, int i, int j, int n_half, int n_data) { + amrex::Real real = 0; + amrex::Real imag = 0; + if (i == 0) { + real = 2*in(2*i, j); + } else if (i == n_half) { + if (n_data & 1) { // n_data is odd + real = -2*in(2*i-2, j); + } else { + real = -in(2*i-2, j); + imag = in(2*i-1, j); + } } else { - amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept - { - const int stride_in = n_data*j; - const int i_is_zero = (i==0); - const int i_is_n_half = (i==n_half); - const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero) - +in[2*i-i_is_n_half+stride_in]*!i_is_n_half; - const amrex::Real imag = in[2*i-1+i_is_zero+stride_in]*!i_is_zero; - out[i+(n_half+1)*j] = amrex::GpuComplex(real, imag); - }); + real = in(2*i, j) - in(2*i-2, j); + imag = in(2*i-1, j); } + return {real, imag}; } /** \brief Make Sine-space Real array out of array from fft. @@ -68,89 +56,142 @@ void ToComplex (const amrex::Real* const AMREX_RESTRICT in, * (2*sin((idx+1)*pi/(n_data+1)))) for each column * * \param[in] in input real array - * \param[out] out output real array + * \param[in] i x index to compute + * \param[in] j y index to compute * \param[in] n_data number of (contiguous) rows in position matrix - * \param[in] n_batch number of (strided) columns in position matrix + * \param[in] sine_factor prefactor for ToSine equal to 1/(2*sin((idx+1)*pi/(n_data+1))) */ -void ToSine (const amrex::Real* const AMREX_RESTRICT in, amrex::Real* const AMREX_RESTRICT out, - const int n_data, const int n_batch) +template AMREX_GPU_DEVICE AMREX_FORCE_INLINE +amrex::Real to_sine (T&& in, int i, int j, int n_data, const amrex::Real* sine_factor) { + const amrex::Real in_a = in(i+1, j); + const amrex::Real in_b = in(n_data-i, j); + // possible optimization: + // iterate over the elements is such a way that each thread computes (i,j) and (n_data-i-1,j) + // so in_a and in_b can be reused + return amrex::Real(0.5)*(in_b - in_a + (in_a + in_b) * sine_factor[i]); +} + +void ToComplex (const Array2 in, const Array2> out, + const int n_data, const int n_batch) { - using namespace amrex::literals; + const int n_half = (n_data+1)/2; + amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept + { + out(i, j) = to_complex(in, i, j, n_half, n_data); + }); +} - const amrex::Real n_1_real_inv = 1._rt / (n_data+1._rt); - const int n_1 = n_data+1; - amrex::ParallelFor({{1,0,0}, {(n_data+1)/2,n_batch-1,0}}, +void ToSine (const Array2 in, const Array2 out, + const amrex::Real* sine_factor, const int n_data, const int n_batch) +{ + amrex::ParallelFor({{0,0,0}, {n_data-1,n_batch-1,0}}, [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept { - const int i_rev = n_1-i; - const int stride_in = n_1*j; - const int stride_out = n_data*j; - const amrex::Real in_a = in[i + stride_in]; - const amrex::Real in_b = in[i_rev + stride_in]; - out[i - 1 + stride_out] = - 0.5_rt*(in_b - in_a + (in_a + in_b) / (2._rt * amrex::Math::sinpi(i * n_1_real_inv))); - out[i_rev - 1 + stride_out] = - 0.5_rt*(in_a - in_b + (in_a + in_b) / (2._rt * amrex::Math::sinpi(i_rev * n_1_real_inv))); + out(i, j) = to_sine(in, i, j, n_data, sine_factor); }); } -/** \brief Transpose input matrix - * out[idy][idx] = in[idx][idy] - * - * \param[in] in input real array - * \param[out] out output real array - * \param[in] n_data number of (contiguous) rows in input matrix - * \param[in] n_batch number of (strided) columns in input matrix - */ -void Transpose (const amrex::Real* const AMREX_RESTRICT in, - amrex::Real* const AMREX_RESTRICT out, - const int n_data, const int n_batch) +void ToSine_Transpose_ToComplex (const Array2 in, + const Array2> out, + const amrex::Real* sine_factor, const int n_data, const int n_batch) { + const int n_half = (n_batch+1)/2; + + auto transpose_to_sine = [=] AMREX_GPU_DEVICE (int i, int j) { + return to_sine(in, j, i, n_data, sine_factor); + }; + #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) - constexpr int tile_dim = 32; //must be power of 2 - constexpr int block_rows = 8; - const int num_blocks_x = (n_data + tile_dim - 1)/tile_dim; - const int num_blocks_y = (n_batch + tile_dim - 1)/tile_dim; - amrex::launch(num_blocks_x*num_blocks_y, tile_dim*block_rows, - tile_dim*(tile_dim+1)*sizeof(amrex::Real), amrex::Gpu::gpuStream(), + constexpr int tile_dim_x = 16; + constexpr int tile_dim_x_ex = 34; + constexpr int tile_dim_y = 32; + constexpr int block_rows_x = 8; + constexpr int block_rows_y = 16; + + const int nx = n_half + 1; + const int ny = n_data; + + const int nx_sin = n_data; + const int ny_sin = n_batch; + + const int num_blocks_x = (nx + tile_dim_x - 1)/tile_dim_x; + const int num_blocks_y = (ny + tile_dim_y - 1)/tile_dim_y; + + amrex::launch(num_blocks_x*num_blocks_y, amrex::Gpu::gpuStream(), [=] AMREX_GPU_DEVICE() noexcept { - amrex::Gpu::SharedMemory gsm; - amrex::Real* const tile = gsm.dataPtr(); + __shared__ amrex::Real tile_ptr[tile_dim_x_ex * tile_dim_y]; - const int thread_x = threadIdx.x&(tile_dim-1); - const int thread_y = threadIdx.x/tile_dim; - const int block_y = blockIdx.x/num_blocks_x; + const int block_y = blockIdx.x / num_blocks_x; const int block_x = blockIdx.x - block_y*num_blocks_x; - int mat_x = block_x * tile_dim + thread_x; - int mat_y = block_y * tile_dim + thread_y; - for (int i = 0; i < tile_dim; i += block_rows) { - if(mat_x < n_data && (mat_y+i) < n_batch) { - tile[(thread_y+i)*(tile_dim+1) + thread_x] = in[(mat_y+i)*n_data + mat_x]; + const int tile_begin_x = 2 * block_x * tile_dim_x - 2; + const int tile_begin_y = block_y * tile_dim_y; + + const int tile_end_x = tile_begin_x + tile_dim_x_ex; + const int tile_end_y = tile_begin_y + tile_dim_y; + + Array2 shared{{tile_ptr, {tile_begin_x, tile_begin_y, 0}, + {tile_end_x, tile_end_y, 1}, 1}}; + + { + const int thread_x = threadIdx.x / tile_dim_y; + const int thread_y = threadIdx.x - thread_x*tile_dim_y; + + for (int tx = thread_x; tx < tile_dim_x_ex; tx += block_rows_x) { + const int i = tile_begin_x + tx; + const int j = tile_begin_y + thread_y; + + if (j < nx_sin && i < ny_sin && i >= 0 ) { + shared(i, j) = transpose_to_sine(i, j); + } } } __syncthreads(); - mat_x = block_y * tile_dim + thread_x; - mat_y = block_x * tile_dim + thread_y; + { + const int thread_y = threadIdx.x / tile_dim_x; + const int thread_x = threadIdx.x - thread_y*tile_dim_x; - for (int i = 0; i < tile_dim; i += block_rows) { - if(mat_x < n_batch && (mat_y+i) < n_data) { - out[(mat_y+i)*n_batch + mat_x] = tile[thread_x*(tile_dim+1) + thread_y+i]; + for (int ty = thread_y; ty < tile_dim_y; ty += block_rows_y) { + const int i = block_x * tile_dim_x + thread_x; + const int j = tile_begin_y + ty; + + if (i < nx && j < ny) { + out(i, j) = to_complex(shared, i, j, n_half, n_batch); + } } } }); #else - amrex::ParallelFor({{0,0,0}, {n_batch-1, n_data-1, 0}}, - [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept + amrex::ParallelFor({{0,0,0}, {n_half,n_data-1,0}}, + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept { - out[i + n_batch*j] = in[j + n_data*i]; + out(i, j) = to_complex(transpose_to_sine, i, j, n_half, n_batch); }); #endif } +void ToSine_Mult_ToComplex (const Array2 in, + const Array2> out, + const Array2 eigenvalue, + const amrex::Real* sine_factor, const int n_data, const int n_batch) +{ + const int n_half = (n_data+1)/2; + + auto mult_to_sine = [=] AMREX_GPU_DEVICE (int i, int j) { + return eigenvalue(i, j) * to_sine(in, i, j, n_data, sine_factor); + }; + + amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}}, + [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept + { + out(i, j) = to_complex(mult_to_sine, i, j, n_half, n_data); + }); +} + void FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba, amrex::DistributionMapping const& dm, @@ -167,16 +208,11 @@ FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba, // The stagingArea is also created from 0 to nx, because the real space array may have // an offset for levels > 0 m_stagingArea = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); - m_tmpSpectralField = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); - m_eigenvalue_matrix = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards); m_stagingArea.setVal(0.0, Fields::m_poisson_nguards); // this is not required - m_tmpSpectralField.setVal(0.0, Fields::m_poisson_nguards); // This must be true even for parallel FFT. AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_stagingArea.local_size() == 1, "There should be only one box locally."); - AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_tmpSpectralField.local_size() == 1, - "There should be only one box locally."); const amrex::Box fft_box = m_stagingArea[0].box(); const amrex::IntVect fft_size = fft_box.length(); @@ -193,25 +229,23 @@ FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba, const amrex::Real norm_fac = 0.5 / ( 2 * (( nx + 1 ) * ( ny + 1 ))); // Calculate the array of m_eigenvalue_matrix - for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){ - Array2 eigenvalue_matrix = m_eigenvalue_matrix.array(mfi); - amrex::IntVect lo = fft_box.smallEnd(); - amrex::ParallelFor( - fft_box, [=] AMREX_GPU_DEVICE (int i, int j, int /* k */) noexcept - { - /* fast poisson solver diagonal x coeffs */ - amrex::Real sinex_sq = std::sin(( i - lo[0] + 1 ) * sine_x_factor) * std::sin(( i - lo[0] + 1 ) * sine_x_factor); - /* fast poisson solver diagonal y coeffs */ - amrex::Real siney_sq = std::sin(( j - lo[1] + 1 ) * sine_y_factor) * std::sin(( j - lo[1] + 1 ) * sine_y_factor); - - if ((sinex_sq!=0) && (siney_sq!=0)) { - eigenvalue_matrix(i,j) = norm_fac / ( -4.0 * ( sinex_sq / dxsquared + siney_sq / dysquared )); - } else { - // Avoid division by 0 - eigenvalue_matrix(i,j) = 0._rt; - } - }); - } + m_eigenvalue_matrix.resize({{0,0,0}, {ny-1,nx-1,0}}); + Array2 eigenvalue_matrix = m_eigenvalue_matrix.array(); + amrex::ParallelFor({{0,0,0}, {ny-1,nx-1,0}}, + [=] AMREX_GPU_DEVICE (int j, int i, int /* k */) noexcept + { + /* fast poisson solver diagonal x coeffs */ + amrex::Real sinex_sq = std::sin(( i + 1 ) * sine_x_factor) * std::sin(( i + 1 ) * sine_x_factor); + /* fast poisson solver diagonal y coeffs */ + amrex::Real siney_sq = std::sin(( j + 1 ) * sine_y_factor) * std::sin(( j + 1 ) * sine_y_factor); + + if ((sinex_sq!=0) && (siney_sq!=0)) { + eigenvalue_matrix(j,i) = norm_fac / ( -4.0_rt * ( sinex_sq / dxsquared + siney_sq / dysquared )); + } else { + // Avoid division by 0 + eigenvalue_matrix(j,i) = 0._rt; + } + }); // Allocate 1d Array for 2d data or 2d transpose data const int real_1d_size = std::max((nx+1)*ny, (ny+1)*nx); @@ -230,6 +264,21 @@ FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba, m_fft_work_area.dataPtr()); m_y_fft.SetBuffers(m_fourier_array.dataPtr(), m_position_array.dataPtr(), m_fft_work_area.dataPtr()); + + // set up prefactors for ToSine + m_sine_x_factor.resize(nx); + amrex::Real* const sine_x_ptr = m_sine_x_factor.dataPtr(); + amrex::ParallelFor(nx, + [=] AMREX_GPU_DEVICE (int i) { + sine_x_ptr[i] = 1._rt / (2._rt * amrex::Math::sinpi((i + 1._rt) / (nx + 1._rt))); + }); + + m_sine_y_factor.resize(ny); + amrex::Real* const sine_y_ptr = m_sine_y_factor.dataPtr(); + amrex::ParallelFor(ny, + [=] AMREX_GPU_DEVICE (int i) { + sine_y_ptr[i] = 1._rt / (2._rt * amrex::Math::sinpi((i + 1._rt) / (ny + 1._rt))); + }); } @@ -241,75 +290,39 @@ FFTPoissonSolverDirichletFast::SolvePoissonEquation (amrex::MultiFab& lhs_mf) const int nx = m_stagingArea[0].box().length(0); // initially contiguous const int ny = m_stagingArea[0].box().length(1); // contiguous after transpose - amrex::Real* const pos_arr = m_stagingArea[0].dataPtr(); - amrex::Real* const fourier_arr = m_tmpSpectralField[0].dataPtr(); - amrex::Real* const real_arr = m_position_array.dataPtr(); - amrex::GpuComplex* comp_arr = m_fourier_array.dataPtr(); + Array2 pos_arr {{m_stagingArea[0].dataPtr(), {0,0,0}, {nx,ny,1}, 1}}; + + Array2 real_arr {{m_position_array.dataPtr(), {0,0,0}, {nx+1,ny,1}, 1}}; + Array2 real_arr_t {{m_position_array.dataPtr(), {0,0,0}, {ny+1,nx,1}, 1}}; + + Array2> comp_arr {{ m_fourier_array.dataPtr(), {0,0,0}, {(nx+1)/2+1,ny,1}, 1}}; + Array2> comp_arr_t {{ m_fourier_array.dataPtr(), {0,0,0}, {(ny+1)/2+1,nx,1}, 1}}; // 1D DST in x ToComplex(pos_arr, comp_arr, nx, ny); m_x_fft.Execute(); - ToSine(real_arr, pos_arr, nx, ny); - - Transpose(pos_arr, fourier_arr, nx, ny); - // 1D DST in y - ToComplex(fourier_arr, comp_arr, ny, nx); + ToSine_Transpose_ToComplex(real_arr, comp_arr_t, m_sine_x_factor.dataPtr(), nx, ny); m_y_fft.Execute(); - ToSine(real_arr, pos_arr, ny, nx); - - Transpose(pos_arr, fourier_arr, ny, nx); + // 1D DST in y + ToSine_Mult_ToComplex(real_arr_t, comp_arr_t, m_eigenvalue_matrix.array(), + m_sine_y_factor.dataPtr(), ny, nx); -#ifdef AMREX_USE_OMP -#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) -#endif - for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){ - // Solve Poisson equation in Fourier space: - // Multiply `tmpSpectralField` by eigenvalue_matrix - Array2 tmp_cmplx_arr = m_tmpSpectralField.array(mfi); - Array2 eigenvalue_matrix = m_eigenvalue_matrix.array(mfi); - - amrex::ParallelFor( mfi.growntilebox(), - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept { - tmp_cmplx_arr(i,j) *= eigenvalue_matrix(i,j); - }); - } + m_y_fft.Execute(); // 1D DST in x - ToComplex(fourier_arr, comp_arr, nx, ny); + ToSine_Transpose_ToComplex(real_arr_t, comp_arr, m_sine_y_factor.dataPtr(), ny, nx); m_x_fft.Execute(); - ToSine(real_arr, fourier_arr, nx, ny); + amrex::Box lhs_bx = lhs_mf[0].box(); + // shift box to handle ghost cells properly + lhs_bx -= m_stagingArea[0].box().smallEnd(); + Array2 lhs_arr {{lhs_mf[0].dataPtr(), amrex::begin(lhs_bx), amrex::end(lhs_bx), 1}}; - Transpose(fourier_arr, pos_arr, nx, ny); - - // 1D DST in y - ToComplex(pos_arr, comp_arr, ny, nx); - - m_y_fft.Execute(); - - ToSine(real_arr, fourier_arr, ny, nx); - - Transpose(fourier_arr, pos_arr, ny, nx); - -#ifdef AMREX_USE_OMP -#pragma omp parallel if (amrex::Gpu::notInLaunchRegion()) -#endif - for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){ - // Copy from the staging area to output array (and normalize) - Array2 tmp_real_arr = m_stagingArea.array(mfi); - Array2 lhs_arr = lhs_mf.array(mfi); - AMREX_ALWAYS_ASSERT_WITH_MESSAGE(lhs_mf.size() == 1, - "Slice MFs must be defined on one box only"); - amrex::ParallelFor( lhs_mf[mfi].box() & mfi.growntilebox(), - [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept { - // Copy field - lhs_arr(i,j) = tmp_real_arr(i,j); - }); - } + ToSine(real_arr, lhs_arr, m_sine_x_factor.dataPtr(), nx, ny); } diff --git a/src/utils/GPUUtil.H b/src/utils/GPUUtil.H index c482842091..7d9429d03e 100644 --- a/src/utils/GPUUtil.H +++ b/src/utils/GPUUtil.H @@ -23,6 +23,7 @@ struct Array2 { int ncomp=0; #endif + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Array2 (const amrex::Array4& rhs) noexcept : p(rhs.p), jstride(rhs.jstride), @@ -34,7 +35,7 @@ struct Array2 { #endif { // slice is only one cell thick if allocated - AMREX_ALWAYS_ASSERT(!rhs.p || rhs.begin.z + 1 == rhs.end.z); + AMREX_IF_ON_HOST((AMREX_ALWAYS_ASSERT(!rhs.p || rhs.begin.z + 1 == rhs.end.z);)); } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE