diff --git a/scripts_future_API/bench3d.jl b/scripts_future_API/bench3d.jl
index 9d782c28..b2f13dd4 100644
--- a/scripts_future_API/bench3d.jl
+++ b/scripts_future_API/bench3d.jl
@@ -31,44 +31,18 @@ macro d2_zi(A) esc(:( $A[ix+1, iy+1, iz+2] - $A[ix+1, iy+1, iz+1] - $A[ix+1, iy+
 #     end
 end
 
-function lapl!(A_new, A, h, _dx, _dy, _dz)
-    ix = (workgroupIdx().x - UInt32(1)) * workgroupDim().x + workitemIdx().x
-    iy = (workgroupIdx().y - UInt32(1)) * workgroupDim().y + workitemIdx().y
-    iz = (workgroupIdx().z - UInt32(1)) * workgroupDim().z + workitemIdx().z
-    # if ix ∈ axes(A_new, 1)[2:end-1] && iy ∈ axes(A_new, 2)[2:end-1] && iz ∈ axes(A_new, 3)[2:end-1]
-    #     @inbounds A_new[ix, iy, iz] = A[ix, iy, iz] + h #= * (_dx * _dx * @d2_xi(A) + _dy * _dy * @d2_yi(A) + _dz * _dz * @d2_zi(A)) =#
-    # end
-    if (ix < size(A, 1) - 2 && iy < size(A, 2) - 2 && iz < size(A, 3) - 2)
-        @inbounds @inn(A_new) = @inn(A) + h #= * (_dx * _dx * @d2_xi(A) + _dy * _dy * @d2_yi(A) + _dz * _dz * @d2_zi(A)) =#
-    end
-    return
-end
-
-function compute_ka(hide_comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, iters, me)
+function compute_ka(hide_comm, comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, iters, me)
     (me==0) && print("Starting the time loop 🚀...")
+    MPI.Barrier(comm)
     tic = time_ns()
     for _ = 1:iters
         # copyto!(A, A_new)
         # AMDGPU.synchronize(blocking=false) #KernelAbstractions.synchronize(backend)
-        # hide_comm(diffusion_kernel!(backend, 256), neighbors, ranges, A_new, A, h, _dx, _dy, _dz)
-        # A, A_new = A_new, A
+        hide_comm(diffusion_kernel!(backend, 256), neighbors, ranges, A_new, A, h, _dx, _dy, _dz)
+        A, A_new = A_new, A
 
-        diffusion_kernel!(backend, 256)(A_new, A, h, _dx, _dy, _dz, (1, 1, 1); ndrange=size(A))
-        # diffusion_kernel!(backend, 256)(A_new, A, h, _dx, _dy, _dz, (1, 1, 1); ndrange=size(A) .- 2)
-        # diffusion_kernel!(backend, 256, (size(A) .- 2))(A_new, A, h, _dx, _dy, _dz, (1, 1, 1))
-        AMDGPU.synchronize(blocking=false) #KernelAbstractions.synchronize(backend)
-        # A, A_new = A_new, A
-    end
-    wtime = (time_ns() - tic) * 1e-9
-    (me==0) && println("done")
-    return wtime
-end
-
-function compute_roc(A_new, A, h, _dx, _dy, _dz, iters, nblocks, nthreads, me)
-    (me==0) && print("Starting the time loop 🚀...")
-    tic = time_ns()
-    for _ = 1:iters
-        AMDGPU.@sync @roc gridsize=nblocks groupsize=nthreads lapl!(A_new, A, h, _dx, _dy, _dz)
+        # diffusion_kernel!(backend, 256)(A_new, A, h, _dx, _dy, _dz, (1, 1, 1); ndrange=size(A))
+        # AMDGPU.synchronize(blocking=false) #KernelAbstractions.synchronize(backend)
         # A, A_new = A_new, A
     end
     wtime = (time_ns() - tic) * 1e-9
@@ -83,8 +57,6 @@ function main(backend=CPU(), T::DataType=Float64, dims=(0, 0, 0))
     iters, warmup = 35, 5
     nx, ny, nz = 1024, 1024, 1024
     b_width = (128, 8, 4)
-    nthreads = (256, 1, 1)
-    nblocks = cld.((nx, ny, nz), nthreads)
     dims, comm, me, neighbors, coords, device = init_distributed(dims; init_MPI=true)
     dx, dy, dz = l ./ (nx, ny, nz)
     _dx, _dy, _dz = 1.0 ./ (dx, dy, dz)
@@ -137,22 +109,24 @@ function main(backend=CPU(), T::DataType=Float64, dims=(0, 0, 0))
     # GC.gc()
     # GC.enable(false)
 
-    compute_ka(hide_comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, warmup, me)
-    wtime = compute_ka(hide_comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, (iters - warmup), me)
+    compute_ka(hide_comm, comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, warmup, me)
 
-    # compute_roc(A_new, A, h, _dx, _dy, _dz, warmup, nblocks, nthreads, me)
-    # wtime = compute_roc(A_new, A, h, _dx, _dy, _dz, (iters - warmup), nblocks, nthreads, me)
+    for _ in 1:10
+        wtime = compute_ka(hide_comm, comm, backend, neighbors, ranges, A_new, A, h, _dx, _dy, _dz, (iters - warmup), me)
 
-    # GC.enable(true)
-    # GC.gc()
-
-    # perf
-    A_eff = 2 / 2^30 * (nx-2) * (ny-2) * (nz-2) * sizeof(Float64)
-    wtime_it = wtime / (iters - warmup)
-    T_eff = A_eff / wtime_it
-    # (me==0) && @printf("Executed %d steps in = %1.3e sec (@ T_eff = %1.2f GB/s) \n", (iters - warmup), wtime, round(T_eff, sigdigits=3))
-    @printf("Executed %d steps in = %1.3e sec (@ T_eff = %1.2f GB/s - device %s) \n", (iters - warmup), wtime, round(T_eff, sigdigits=3), AMDGPU.device_id(AMDGPU.device()))
+        # GC.enable(true)
+        # GC.gc()
+        MPI.Barrier(comm)
+        wtime_min = MPI.Allreduce(wtime, MPI.MIN, comm)
+        wtime_max = MPI.Allreduce(wtime, MPI.MAX, comm)
+        # perf
+        A_eff = 2 / 2^30 * (nx-2) * (ny-2) * (nz-2) * sizeof(Float64)
+        wtime_it = (wtime_min, wtime_max) ./ (iters - warmup)
+        T_eff = A_eff ./ wtime_it
+        (me==0) && @printf("Executed %d steps in = %1.3e sec @ T_eff = %1.2f GB/s (max %1.2f) \n", (iters - warmup), wtime_max, round(T_eff[2], sigdigits=3), round(T_eff[1], sigdigits=3))
+        # @printf("Executed %d steps in = %1.3e sec (@ T_eff = %1.2f GB/s - device %s) \n", (iters - warmup), wtime, round(T_eff, sigdigits=3), AMDGPU.device_id(AMDGPU.device()))
+    end
 
     finalize_distributed(; finalize_MPI=true)
     return
 end
diff --git a/scripts_future_API/mpi_utils2.jl b/scripts_future_API/mpi_utils2.jl
index d97d2c80..16e1f065 100644
--- a/scripts_future_API/mpi_utils2.jl
+++ b/scripts_future_API/mpi_utils2.jl
@@ -24,11 +24,6 @@ function finalize_distributed(; finalize_MPI=true)
     return
 end
 
-@kernel function my_copy!(halo, recv_buf)
-    ix, iy = @index(Global, NTuple)
-    halo[ix, iy] = recv_buf[ix, iy]
-end
-
 # exchanger
 mutable struct Exchanger
     @atomic done::Bool
@@ -75,7 +70,6 @@ mutable struct Exchanger
                 test_send = MPI.Test(send)
                 if test_recv && !flag
                     copyto!(halo, recv_buf)
-                    # my_copy!(backend, 256, size(recv_buf))(halo, recv_buf)
                     flag = true
                 end
                 if test_recv && test_send break end
diff --git a/scripts_future_API/sbatch.sh b/scripts_future_API/sbatch.sh
index 626da474..43fc4487 100644
--- a/scripts_future_API/sbatch.sh
+++ b/scripts_future_API/sbatch.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
-#SBATCH --job-name=scaling_16
+#SBATCH --job-name=scaling_2048
 #SBATCH --account=project_465000557
-#SBATCH --time=00:02:00
-#SBATCH --nodes=4
-#SBATCH --ntasks=16
+#SBATCH --time=00:05:00
+#SBATCH --nodes=512
+#SBATCH --ntasks=2048
 #SBATCH --gpus-per-node=8
 #SBATCH --partition=standard-g