diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H
index c981181c7f..becb102c4a 100644
--- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H
+++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.H
@@ -59,10 +59,8 @@ public:
     virtual amrex::Real BoundaryFactor() override final { return 1.; }
 
 private:
-    /** Spectral fields, contains (real) field in Fourier space */
-    amrex::MultiFab m_tmpSpectralField;
-    /** Multifab eigenvalues, to solve Poisson equation with Dirichlet BC. */
-    amrex::MultiFab m_eigenvalue_matrix;
+    /** FArrayBox eigenvalues, to solve Poisson equation with Dirichlet BC. */
+    amrex::FArrayBox m_eigenvalue_matrix;
     /** Real array for the FFTs */
     amrex::Gpu::DeviceVector<amrex::Real> m_position_array;
     /** Complex array for the FFTs */
@@ -73,6 +71,10 @@ private:
     AnyFFT m_y_fft;
     /** work area for both DST plans */
     amrex::Gpu::DeviceVector<char> m_fft_work_area;
+    /** x prefactor for ToSine */
+    amrex::Gpu::DeviceVector<amrex::Real> m_sine_x_factor;
+    /** y prefactor for ToSine */
+    amrex::Gpu::DeviceVector<amrex::Real> m_sine_y_factor;
 };
 
 #endif
diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp
index 50f1547bca..fb79015b56 100644
--- a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp
+++ b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp
@@ -26,41 +26,29 @@ FFTPoissonSolverDirichletFast::FFTPoissonSolverDirichletFast (
  * in[-1] = 0; in[-2] = -in[0]; in[n_data] = 0; in[n_data+1] = -in[n_data-1]
  *
  * \param[in] in input real array
- * \param[out] out output complex array
+ * \param[in] i x index to compute
+ * \param[in] j y index to compute
+ * \param[in] n_half highest index of the output array, equal to (n_data+1)/2
  * \param[in] n_data number of (contiguous) rows in position matrix
- * \param[in] n_batch number of (strided) columns in position matrix
  */
-void ToComplex (const amrex::Real* const AMREX_RESTRICT in,
-                amrex::GpuComplex<amrex::Real>* const AMREX_RESTRICT out,
-                const int n_data, const int n_batch)
-{
-    const int n_half = (n_data+1)/2;
-    if((n_data%2 == 1)) {
-        amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}},
-            [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
-            {
-                const int stride_in = n_data*j;
-                const int i_is_zero = (i==0);
-                const int i_is_n_half = (i==n_half);
-                const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero)
-                                            +in[2*i-2*i_is_n_half+stride_in]*(1-2*i_is_n_half);
-                const amrex::Real imag = in[2*i-1+i_is_zero-i_is_n_half+stride_in]
-                                            *!i_is_zero*!i_is_n_half;
-                out[i+(n_half+1)*j] = amrex::GpuComplex<amrex::Real>(real, imag);
-            });
+template<class T> AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+amrex::GpuComplex<amrex::Real> to_complex (T&& in, int i, int j, int n_half, int n_data) {
+    amrex::Real real = 0;
+    amrex::Real imag = 0;
+    if (i == 0) {
+        real = 2*in(2*i, j);
+    } else if (i == n_half) {
+        if (n_data & 1) { // n_data is odd
+            real = -2*in(2*i-2, j);
+        } else {
+            real = -in(2*i-2, j);
+            imag = in(2*i-1, j);
+        }
     } else {
-        amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}},
-            [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
-            {
-                const int stride_in = n_data*j;
-                const int i_is_zero = (i==0);
-                const int i_is_n_half = (i==n_half);
-                const amrex::Real real = -in[2*i-2+2*i_is_zero+stride_in]*(1-2*i_is_zero)
-                                            +in[2*i-i_is_n_half+stride_in]*!i_is_n_half;
-                const amrex::Real imag = in[2*i-1+i_is_zero+stride_in]*!i_is_zero;
-                out[i+(n_half+1)*j] = amrex::GpuComplex<amrex::Real>(real, imag);
-            });
+        real = in(2*i, j) - in(2*i-2, j);
+        imag = in(2*i-1, j);
     }
+    return {real, imag};
 }
 
 /** \brief Make Sine-space Real array out of array from fft.
@@ -68,89 +56,142 @@ void ToComplex (const amrex::Real* const AMREX_RESTRICT in,
  * (2*sin((idx+1)*pi/(n_data+1)))) for each column
  *
  * \param[in] in input real array
- * \param[out] out output real array
+ * \param[in] i x index to compute
+ * \param[in] j y index to compute
  * \param[in] n_data number of (contiguous) rows in position matrix
- * \param[in] n_batch number of (strided) columns in position matrix
+ * \param[in] sine_factor prefactor for ToSine equal to 1/(2*sin((idx+1)*pi/(n_data+1)))
  */
-void ToSine (const amrex::Real* const AMREX_RESTRICT in, amrex::Real* const AMREX_RESTRICT out,
-             const int n_data, const int n_batch)
+template<class T> AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+amrex::Real to_sine (T&& in, int i, int j, int n_data, const amrex::Real* sine_factor) {
+    const amrex::Real in_a = in(i+1, j);
+    const amrex::Real in_b = in(n_data-i, j);
+    // possible optimization:
+    // iterate over the elements is such a way that each thread computes (i,j) and (n_data-i-1,j)
+    // so in_a and in_b can be reused
+    return amrex::Real(0.5)*(in_b - in_a + (in_a + in_b) * sine_factor[i]);
+}
+
+void ToComplex (const Array2<amrex::Real> in, const Array2<amrex::GpuComplex<amrex::Real>> out,
+                const int n_data, const int n_batch)
 {
-    using namespace amrex::literals;
+    const int n_half = (n_data+1)/2;
+    amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}},
+        [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
+        {
+            out(i, j) = to_complex(in, i, j, n_half, n_data);
+        });
+}
 
-    const amrex::Real n_1_real_inv = 1._rt / (n_data+1._rt);
-    const int n_1 = n_data+1;
-    amrex::ParallelFor({{1,0,0}, {(n_data+1)/2,n_batch-1,0}},
+void ToSine (const Array2<amrex::Real> in, const Array2<amrex::Real> out,
+             const amrex::Real* sine_factor, const int n_data, const int n_batch)
+{
+    amrex::ParallelFor({{0,0,0}, {n_data-1,n_batch-1,0}},
         [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
         {
-            const int i_rev = n_1-i;
-            const int stride_in = n_1*j;
-            const int stride_out = n_data*j;
-            const amrex::Real in_a = in[i + stride_in];
-            const amrex::Real in_b = in[i_rev + stride_in];
-            out[i - 1 + stride_out] =
-                0.5_rt*(in_b - in_a + (in_a + in_b) / (2._rt * amrex::Math::sinpi(i * n_1_real_inv)));
-            out[i_rev - 1 + stride_out] =
-                0.5_rt*(in_a - in_b + (in_a + in_b) / (2._rt * amrex::Math::sinpi(i_rev * n_1_real_inv)));
+            out(i, j) = to_sine(in, i, j, n_data, sine_factor);
         });
 }
 
-/** \brief Transpose input matrix
- * out[idy][idx] = in[idx][idy]
- *
- * \param[in] in input real array
- * \param[out] out output real array
- * \param[in] n_data number of (contiguous) rows in input matrix
- * \param[in] n_batch number of (strided) columns in input matrix
- */
-void Transpose (const amrex::Real* const AMREX_RESTRICT in,
-                amrex::Real* const AMREX_RESTRICT out,
-                const int n_data, const int n_batch)
+void ToSine_Transpose_ToComplex (const Array2<amrex::Real> in,
+                                 const Array2<amrex::GpuComplex<amrex::Real>> out,
+                                 const amrex::Real* sine_factor, const int n_data, const int n_batch)
 {
+    const int n_half = (n_batch+1)/2;
+
+    auto transpose_to_sine = [=] AMREX_GPU_DEVICE (int i, int j) {
+        return to_sine(in, j, i, n_data, sine_factor);
+    };
+
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
-    constexpr int tile_dim = 32; //must be power of 2
-    constexpr int block_rows = 8;
-    const int num_blocks_x = (n_data + tile_dim - 1)/tile_dim;
-    const int num_blocks_y = (n_batch + tile_dim - 1)/tile_dim;
-    amrex::launch(num_blocks_x*num_blocks_y, tile_dim*block_rows,
-        tile_dim*(tile_dim+1)*sizeof(amrex::Real), amrex::Gpu::gpuStream(),
+    constexpr int tile_dim_x = 16;
+    constexpr int tile_dim_x_ex = 34;
+    constexpr int tile_dim_y = 32;
+    constexpr int block_rows_x = 8;
+    constexpr int block_rows_y = 16;
+
+    const int nx = n_half + 1;
+    const int ny = n_data;
+
+    const int nx_sin = n_data;
+    const int ny_sin = n_batch;
+
+    const int num_blocks_x = (nx + tile_dim_x - 1)/tile_dim_x;
+    const int num_blocks_y = (ny + tile_dim_y - 1)/tile_dim_y;
+
+    amrex::launch<tile_dim_x*block_rows_y>(num_blocks_x*num_blocks_y, amrex::Gpu::gpuStream(),
         [=] AMREX_GPU_DEVICE() noexcept
         {
-            amrex::Gpu::SharedMemory<amrex::Real> gsm;
-            amrex::Real* const tile = gsm.dataPtr();
+            __shared__ amrex::Real tile_ptr[tile_dim_x_ex * tile_dim_y];
 
-            const int thread_x = threadIdx.x&(tile_dim-1);
-            const int thread_y = threadIdx.x/tile_dim;
-            const int block_y = blockIdx.x/num_blocks_x;
+            const int block_y = blockIdx.x / num_blocks_x;
             const int block_x = blockIdx.x - block_y*num_blocks_x;
-            int mat_x = block_x * tile_dim + thread_x;
-            int mat_y = block_y * tile_dim + thread_y;
 
-            for (int i = 0; i < tile_dim; i += block_rows) {
-                if(mat_x < n_data && (mat_y+i) < n_batch) {
-                    tile[(thread_y+i)*(tile_dim+1) + thread_x] = in[(mat_y+i)*n_data + mat_x];
+            const int tile_begin_x = 2 * block_x * tile_dim_x - 2;
+            const int tile_begin_y = block_y * tile_dim_y;
+
+            const int tile_end_x = tile_begin_x + tile_dim_x_ex;
+            const int tile_end_y = tile_begin_y + tile_dim_y;
+
+            Array2<amrex::Real> shared{{tile_ptr, {tile_begin_x, tile_begin_y, 0},
+                                                  {tile_end_x, tile_end_y, 1}, 1}};
+
+            {
+                const int thread_x = threadIdx.x / tile_dim_y;
+                const int thread_y = threadIdx.x - thread_x*tile_dim_y;
+
+                for (int tx = thread_x; tx < tile_dim_x_ex; tx += block_rows_x) {
+                    const int i = tile_begin_x + tx;
+                    const int j = tile_begin_y + thread_y;
+
+                    if (j < nx_sin && i < ny_sin && i >= 0 ) {
+                        shared(i, j) = transpose_to_sine(i, j);
+                    }
                 }
             }
 
             __syncthreads();
 
-            mat_x = block_y * tile_dim + thread_x;
-            mat_y = block_x * tile_dim + thread_y;
+            {
+                const int thread_y = threadIdx.x / tile_dim_x;
+                const int thread_x = threadIdx.x - thread_y*tile_dim_x;
 
-            for (int i = 0; i < tile_dim; i += block_rows) {
-                if(mat_x < n_batch && (mat_y+i) < n_data) {
-                    out[(mat_y+i)*n_batch + mat_x] = tile[thread_x*(tile_dim+1) + thread_y+i];
+                for (int ty = thread_y; ty < tile_dim_y; ty += block_rows_y) {
+                    const int i = block_x * tile_dim_x + thread_x;
+                    const int j = tile_begin_y + ty;
+
+                    if (i < nx && j < ny) {
+                        out(i, j) = to_complex(shared, i, j, n_half, n_batch);
+                    }
                 }
             }
         });
 #else
-    amrex::ParallelFor({{0,0,0}, {n_batch-1, n_data-1, 0}},
-        [=] AMREX_GPU_DEVICE (int i, int j, int) noexcept
+    amrex::ParallelFor({{0,0,0}, {n_half,n_data-1,0}},
+        [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
         {
-            out[i + n_batch*j] = in[j + n_data*i];
+            out(i, j) = to_complex(transpose_to_sine, i, j, n_half, n_batch);
         });
 #endif
 }
 
+void ToSine_Mult_ToComplex (const Array2<amrex::Real> in,
+                            const Array2<amrex::GpuComplex<amrex::Real>> out,
+                            const Array2<amrex::Real> eigenvalue,
+                            const amrex::Real* sine_factor, const int n_data, const int n_batch)
+{
+    const int n_half = (n_data+1)/2;
+
+    auto mult_to_sine = [=] AMREX_GPU_DEVICE (int i, int j) {
+        return eigenvalue(i, j) * to_sine(in, i, j, n_data, sine_factor);
+    };
+
+    amrex::ParallelFor({{0,0,0}, {n_half,n_batch-1,0}},
+        [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept
+        {
+            out(i, j) = to_complex(mult_to_sine, i, j, n_half, n_data);
+        });
+}
+
 void
 FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba,
                                        amrex::DistributionMapping const& dm,
@@ -167,16 +208,11 @@ FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba,
     // The stagingArea is also created from 0 to nx, because the real space array may have
     // an offset for levels > 0
     m_stagingArea = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards);
-    m_tmpSpectralField = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards);
-    m_eigenvalue_matrix = amrex::MultiFab(a_realspace_ba, dm, 1, Fields::m_poisson_nguards);
     m_stagingArea.setVal(0.0, Fields::m_poisson_nguards); // this is not required
-    m_tmpSpectralField.setVal(0.0, Fields::m_poisson_nguards);
 
     // This must be true even for parallel FFT.
     AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_stagingArea.local_size() == 1,
                                      "There should be only one box locally.");
-    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_tmpSpectralField.local_size() == 1,
-                                     "There should be only one box locally.");
 
     const amrex::Box fft_box = m_stagingArea[0].box();
     const amrex::IntVect fft_size = fft_box.length();
@@ -193,25 +229,23 @@ FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba,
     const amrex::Real norm_fac = 0.5 / ( 2 * (( nx + 1 ) * ( ny + 1 )));
 
     // Calculate the array of m_eigenvalue_matrix
-    for (amrex::MFIter mfi(m_eigenvalue_matrix, DfltMfi); mfi.isValid(); ++mfi ){
-        Array2<amrex::Real> eigenvalue_matrix = m_eigenvalue_matrix.array(mfi);
-        amrex::IntVect lo = fft_box.smallEnd();
-        amrex::ParallelFor(
-            fft_box, [=] AMREX_GPU_DEVICE (int i, int j, int /* k */) noexcept
-                {
-                    /* fast poisson solver diagonal x coeffs */
-                    amrex::Real sinex_sq = std::sin(( i - lo[0] + 1 ) * sine_x_factor) * std::sin(( i - lo[0] + 1 ) * sine_x_factor);
-                    /* fast poisson solver diagonal y coeffs */
-                    amrex::Real siney_sq = std::sin(( j - lo[1] + 1 ) * sine_y_factor) * std::sin(( j - lo[1] + 1 ) * sine_y_factor);
-
-                    if ((sinex_sq!=0) && (siney_sq!=0)) {
-                        eigenvalue_matrix(i,j) = norm_fac / ( -4.0 * ( sinex_sq / dxsquared + siney_sq / dysquared ));
-                    } else {
-                        // Avoid division by 0
-                        eigenvalue_matrix(i,j) = 0._rt;
-                    }
-                });
-    }
+    m_eigenvalue_matrix.resize({{0,0,0}, {ny-1,nx-1,0}});
+    Array2<amrex::Real> eigenvalue_matrix = m_eigenvalue_matrix.array();
+    amrex::ParallelFor({{0,0,0}, {ny-1,nx-1,0}},
+        [=] AMREX_GPU_DEVICE (int j, int i, int /* k */) noexcept
+        {
+            /* fast poisson solver diagonal x coeffs */
+            amrex::Real sinex_sq = std::sin(( i + 1 ) * sine_x_factor) * std::sin(( i + 1 ) * sine_x_factor);
+            /* fast poisson solver diagonal y coeffs */
+            amrex::Real siney_sq = std::sin(( j + 1 ) * sine_y_factor) * std::sin(( j + 1 ) * sine_y_factor);
+
+            if ((sinex_sq!=0) && (siney_sq!=0)) {
+                eigenvalue_matrix(j,i) = norm_fac / ( -4.0_rt * ( sinex_sq / dxsquared + siney_sq / dysquared ));
+            } else {
+                // Avoid division by 0
+                eigenvalue_matrix(j,i) = 0._rt;
+            }
+        });
 
     // Allocate 1d Array for 2d data or 2d transpose data
     const int real_1d_size = std::max((nx+1)*ny, (ny+1)*nx);
@@ -230,6 +264,21 @@ FFTPoissonSolverDirichletFast::define (amrex::BoxArray const& a_realspace_ba,
                        m_fft_work_area.dataPtr());
     m_y_fft.SetBuffers(m_fourier_array.dataPtr(), m_position_array.dataPtr(),
                        m_fft_work_area.dataPtr());
+
+    // set up prefactors for ToSine
+    m_sine_x_factor.resize(nx);
+    amrex::Real* const sine_x_ptr = m_sine_x_factor.dataPtr();
+    amrex::ParallelFor(nx,
+        [=] AMREX_GPU_DEVICE (int i) {
+            sine_x_ptr[i] = 1._rt / (2._rt * amrex::Math::sinpi((i + 1._rt) / (nx + 1._rt)));
+        });
+
+    m_sine_y_factor.resize(ny);
+    amrex::Real* const sine_y_ptr = m_sine_y_factor.dataPtr();
+    amrex::ParallelFor(ny,
+        [=] AMREX_GPU_DEVICE (int i) {
+            sine_y_ptr[i] = 1._rt / (2._rt * amrex::Math::sinpi((i + 1._rt) / (ny + 1._rt)));
+        });
 }
 
 
@@ -241,75 +290,39 @@ FFTPoissonSolverDirichletFast::SolvePoissonEquation (amrex::MultiFab& lhs_mf)
     const int nx = m_stagingArea[0].box().length(0); // initially contiguous
     const int ny = m_stagingArea[0].box().length(1); // contiguous after transpose
 
-    amrex::Real* const pos_arr = m_stagingArea[0].dataPtr();
-    amrex::Real* const fourier_arr = m_tmpSpectralField[0].dataPtr();
-    amrex::Real* const real_arr = m_position_array.dataPtr();
-    amrex::GpuComplex<amrex::Real>* comp_arr = m_fourier_array.dataPtr();
+    Array2<amrex::Real> pos_arr {{m_stagingArea[0].dataPtr(), {0,0,0}, {nx,ny,1}, 1}};
+
+    Array2<amrex::Real> real_arr {{m_position_array.dataPtr(), {0,0,0}, {nx+1,ny,1}, 1}};
+    Array2<amrex::Real> real_arr_t {{m_position_array.dataPtr(), {0,0,0}, {ny+1,nx,1}, 1}};
+
+    Array2<amrex::GpuComplex<amrex::Real>> comp_arr {{ m_fourier_array.dataPtr(), {0,0,0}, {(nx+1)/2+1,ny,1}, 1}};
+    Array2<amrex::GpuComplex<amrex::Real>> comp_arr_t {{ m_fourier_array.dataPtr(), {0,0,0}, {(ny+1)/2+1,nx,1}, 1}};
 
     // 1D DST in x
     ToComplex(pos_arr, comp_arr, nx, ny);
 
     m_x_fft.Execute();
 
-    ToSine(real_arr, pos_arr, nx, ny);
-
-    Transpose(pos_arr, fourier_arr, nx, ny);
-
     // 1D DST in y
-    ToComplex(fourier_arr, comp_arr, ny, nx);
+    ToSine_Transpose_ToComplex(real_arr, comp_arr_t, m_sine_x_factor.dataPtr(), nx, ny);
 
     m_y_fft.Execute();
 
-    ToSine(real_arr, pos_arr, ny, nx);
-
-    Transpose(pos_arr, fourier_arr, ny, nx);
+    // 1D DST in y
+    ToSine_Mult_ToComplex(real_arr_t, comp_arr_t, m_eigenvalue_matrix.array(),
+                          m_sine_y_factor.dataPtr(), ny, nx);
 
-#ifdef AMREX_USE_OMP
-#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
-#endif
-    for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){
-        // Solve Poisson equation in Fourier space:
-        // Multiply `tmpSpectralField` by eigenvalue_matrix
-        Array2<amrex::Real> tmp_cmplx_arr = m_tmpSpectralField.array(mfi);
-        Array2<amrex::Real> eigenvalue_matrix = m_eigenvalue_matrix.array(mfi);
-
-        amrex::ParallelFor( mfi.growntilebox(),
-            [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept {
-                tmp_cmplx_arr(i,j) *= eigenvalue_matrix(i,j);
-            });
-    }
+    m_y_fft.Execute();
 
     // 1D DST in x
-    ToComplex(fourier_arr, comp_arr, nx, ny);
+    ToSine_Transpose_ToComplex(real_arr_t, comp_arr, m_sine_y_factor.dataPtr(), ny, nx);
 
     m_x_fft.Execute();
 
-    ToSine(real_arr, fourier_arr, nx, ny);
+    amrex::Box lhs_bx = lhs_mf[0].box();
+    // shift box to handle ghost cells properly
+    lhs_bx -= m_stagingArea[0].box().smallEnd();
+    Array2<amrex::Real> lhs_arr {{lhs_mf[0].dataPtr(), amrex::begin(lhs_bx), amrex::end(lhs_bx), 1}};
 
-    Transpose(fourier_arr, pos_arr, nx, ny);
-
-    // 1D DST in y
-    ToComplex(pos_arr, comp_arr, ny, nx);
-
-    m_y_fft.Execute();
-
-    ToSine(real_arr, fourier_arr, ny, nx);
-
-    Transpose(fourier_arr, pos_arr, ny, nx);
-
-#ifdef AMREX_USE_OMP
-#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
-#endif
-    for ( amrex::MFIter mfi(m_stagingArea, DfltMfiTlng); mfi.isValid(); ++mfi ){
-        // Copy from the staging area to output array (and normalize)
-        Array2<amrex::Real> tmp_real_arr = m_stagingArea.array(mfi);
-        Array2<amrex::Real> lhs_arr = lhs_mf.array(mfi);
-        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(lhs_mf.size() == 1,
-                                         "Slice MFs must be defined on one box only");
-        amrex::ParallelFor( lhs_mf[mfi].box() & mfi.growntilebox(),
-            [=] AMREX_GPU_DEVICE(int i, int j, int) noexcept {
-                // Copy field
-                lhs_arr(i,j) = tmp_real_arr(i,j);
-            });
-    }
+    ToSine(real_arr, lhs_arr, m_sine_x_factor.dataPtr(), nx, ny);
 }
diff --git a/src/utils/GPUUtil.H b/src/utils/GPUUtil.H
index c482842091..7d9429d03e 100644
--- a/src/utils/GPUUtil.H
+++ b/src/utils/GPUUtil.H
@@ -23,6 +23,7 @@ struct Array2 {
     int ncomp=0;
 #endif
 
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     Array2 (const amrex::Array4<T>& rhs) noexcept
         : p(rhs.p),
           jstride(rhs.jstride),
@@ -34,7 +35,7 @@ struct Array2 {
 #endif
     {
         // slice is only one cell thick if allocated
-        AMREX_ALWAYS_ASSERT(!rhs.p || rhs.begin.z + 1 == rhs.end.z);
+        AMREX_IF_ON_HOST((AMREX_ALWAYS_ASSERT(!rhs.p || rhs.begin.z + 1 == rhs.end.z);));
     }
 
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE