diff --git a/scripts_future_API/bench3d.jl b/scripts_future_API/bench3d.jl
index 9d782c28..b2f13dd4 100644
--- a/scripts_future_API/bench3d.jl
+++ b/scripts_future_API/bench3d.jl
@@ -31,44 +31,18 @@ macro d2_zi(A) esc(:( $A[ix+1, iy+1, iz+2] - $A[ix+1, iy+1, iz+1] - $A[ix+1, iy+
 #     end
 end
 
-function lapl!(A_new, A, h, _dx, _dy, _dz)
-    ix = (workgroupIdx().x - UInt32(1)) * workgroupDim().x + workitemIdx().x
-    iy = (workgroupIdx().y - UInt32(1)) * workgroupDim().y + workitemIdx().y
-    iz = (workgroupIdx().z - UInt32(1)) * workgroupDim().z + workitemIdx().z
-    # if ix ∈ axes(A_new, 1)[2:end-1] && iy ∈ axes(A_new, 2)[2:end-1] && iz ∈ axes(A_new, 3)[2:end-1]
-    #     @inbounds A_new[ix, iy, iz] = A[ix, iy, iz] + h #= * (_dx * _dx * @d2_xi(A) + _dy * _dy * @d2_yi(A) + _dz * _dz * @d2_zi(A)) =#
-    # end
-    if (ix < size(A, 1) - 2 && iy < size(A, 2) - 2 && iz < size(A, 3) - 2)
-        @inbounds @inn(A_new) = @inn(A) + h #= * (_dx * _dx * @d2_xi(A) + _dy * _dy * @d2_yi(A) + _dz * _dz * @d2_zi(A)) =#
-    end
-    return
-end
-
-function compute_ka(hide_comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, iters, me)
+function compute_ka(hide_comm, comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, iters, me)
     (me==0) && print("Starting the time loop 🚀...")
+    MPI.Barrier(comm)
     tic = time_ns()
     for _ = 1:iters
         # copyto!(A, A_new)
         # AMDGPU.synchronize(blocking=false) #KernelAbstractions.synchronize(backend)
-        # hide_comm(diffusion_kernel!(backend, 256), neighbors, ranges, A_new, A, h, _dx, _dy, _dz)
-        # A, A_new = A_new, A
+        hide_comm(diffusion_kernel!(backend, 256), neighbors, ranges, A_new, A, h, _dx, _dy, _dz)
+        A, A_new = A_new, A
 
-        diffusion_kernel!(backend, 256)(A_new, A, h, _dx, _dy, _dz, (1, 1, 1); ndrange=size(A))
-        # diffusion_kernel!(backend, 256)(A_new, A, h, _dx, _dy, _dz, (1, 1, 1); ndrange=size(A) .- 2)
-        # diffusion_kernel!(backend, 256, (size(A) .- 2))(A_new, A, h, _dx, _dy, _dz, (1, 1, 1))
-        AMDGPU.synchronize(blocking=false) #KernelAbstractions.synchronize(backend)
-        # A, A_new = A_new, A
-    end
-    wtime = (time_ns() - tic) * 1e-9
-    (me==0) && println("done")
-    return wtime
-end
-
-function compute_roc(A_new, A, h, _dx, _dy, _dz, iters, nblocks, nthreads, me)
-    (me==0) && print("Starting the time loop 🚀...")
-    tic = time_ns()
-    for _ = 1:iters
-        AMDGPU.@sync @roc gridsize=nblocks groupsize=nthreads lapl!(A_new, A, h, _dx, _dy, _dz)
+        # diffusion_kernel!(backend, 256)(A_new, A, h, _dx, _dy, _dz, (1, 1, 1); ndrange=size(A))
+        # AMDGPU.synchronize(blocking=false) #KernelAbstractions.synchronize(backend)
         # A, A_new = A_new, A
     end
     wtime = (time_ns() - tic) * 1e-9
@@ -83,8 +57,6 @@ function main(backend=CPU(), T::DataType=Float64, dims=(0, 0, 0))
     iters, warmup = 35, 5
     nx, ny, nz = 1024, 1024, 1024
     b_width = (128, 8, 4)
-    nthreads = (256, 1, 1)
-    nblocks = cld.((nx, ny, nz), nthreads)
     dims, comm, me, neighbors, coords, device = init_distributed(dims; init_MPI=true)
     dx, dy, dz = l ./ (nx, ny, nz)
     _dx, _dy, _dz = 1.0 ./ (dx, dy, dz)
@@ -137,22 +109,24 @@ function main(backend=CPU(), T::DataType=Float64, dims=(0, 0, 0))
     # GC.gc()
     # GC.enable(false)
 
-    compute_ka(hide_comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, warmup, me)
-    wtime = compute_ka(hide_comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, (iters - warmup), me)
+    compute_ka(hide_comm, comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, warmup, me)
 
-    # compute_roc(A_new, A, h, _dx, _dy, _dz, warmup, nblocks, nthreads, me)
-    # wtime = compute_roc(A_new, A, h, _dx, _dy, _dz, (iters - warmup), nblocks, nthreads, me)
+    for _ in 1:10
+        wtime = compute_ka(hide_comm, comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, (iters - warmup), me)
 
-    # GC.enable(true)
-    # GC.gc()
-
-    # perf
-    A_eff = 2 / 2^30 * (nx-2) * (ny-2) * (nz-2) * sizeof(Float64)
-    wtime_it = wtime / (iters - warmup)
-    T_eff = A_eff / wtime_it
-    # (me==0) && @printf("Executed %d steps in = %1.3e sec (@ T_eff = %1.2f GB/s) \n", (iters - warmup), wtime, round(T_eff, sigdigits=3))
-    @printf("Executed %d steps in = %1.3e sec (@ T_eff = %1.2f GB/s - device %s) \n", (iters - warmup), wtime, round(T_eff, sigdigits=3), AMDGPU.device_id(AMDGPU.device()))
+        # GC.enable(true)
+        # GC.gc()
+        MPI.Barrier(comm)
+        wtime_min = MPI.Allreduce(wtime, MPI.MIN, comm)
+        wtime_max = MPI.Allreduce(wtime, MPI.MAX, comm)
+        # perf
+        A_eff = 2 / 2^30 * (nx-2) * (ny-2) * (nz-2) * sizeof(Float64)
+        wtime_it = (wtime_min, wtime_max) ./ (iters - warmup)
+        T_eff = A_eff ./ wtime_it
+        (me==0) && @printf("Executed %d steps in = %1.3e sec @ T_eff = %1.2f GB/s (max %1.2f) \n", (iters - warmup), wtime_max, round(T_eff[2], sigdigits=3), round(T_eff[1], sigdigits=3))
+        # @printf("Executed %d steps in = %1.3e sec (@ T_eff = %1.2f GB/s - device %s) \n", (iters - warmup), wtime, round(T_eff, sigdigits=3), AMDGPU.device_id(AMDGPU.device()))
+    end
 
     finalize_distributed(; finalize_MPI=true)
     return
 end
diff --git a/scripts_future_API/mpi_utils2.jl b/scripts_future_API/mpi_utils2.jl
index d97d2c80..16e1f065 100644
--- a/scripts_future_API/mpi_utils2.jl
+++ b/scripts_future_API/mpi_utils2.jl
@@ -24,11 +24,6 @@ function finalize_distributed(; finalize_MPI=true)
     return
 end
 
-@kernel function my_copy!(halo, recv_buf)
-    ix, iy = @index(Global, NTuple)
-    halo[ix, iy] = recv_buf[ix, iy]
-end
-
 # exchanger
 mutable struct Exchanger
     @atomic done::Bool
@@ -75,7 +70,6 @@ mutable struct Exchanger
                 test_send = MPI.Test(send)
                 if test_recv && !flag
                     copyto!(halo, recv_buf)
-                    # my_copy!(backend, 256, size(recv_buf))(halo, recv_buf)
                     flag = true
                 end
                 if test_recv && test_send break end
diff --git a/scripts_future_API/sbatch.sh b/scripts_future_API/sbatch.sh
index 626da474..43fc4487 100644
--- a/scripts_future_API/sbatch.sh
+++ b/scripts_future_API/sbatch.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
-#SBATCH --job-name=scaling_16
+#SBATCH --job-name=scaling_2048
 #SBATCH --account=project_465000557
-#SBATCH --time=00:02:00
-#SBATCH --nodes=4
-#SBATCH --ntasks=16
+#SBATCH --time=00:05:00
+#SBATCH --nodes=512
+#SBATCH --ntasks=2048
 #SBATCH --gpus-per-node=8
 #SBATCH --partition=standard-g