Bugfix in distributed GPU tests and Distributed set! #3880

Merged: 55 commits merged into main from ss/fix-gpu-tests on Nov 9, 2024.
Changes shown below are from 39 of the 55 commits.

Commits
5227a13
fix pipeline
simone-silvestri Oct 29, 2024
1225061
mpi test and gpu test
simone-silvestri Oct 29, 2024
1652c6b
do we need to precompile it inside?
simone-silvestri Oct 29, 2024
9323203
precompile inside the node
simone-silvestri Oct 29, 2024
37b17ff
try previous climacommon version
simone-silvestri Oct 29, 2024
2ac8cde
go even more back
simone-silvestri Oct 29, 2024
0eb2720
use the ClimaOcean implementation
simone-silvestri Oct 29, 2024
50d0ec3
using the ClimaOcean implementation
simone-silvestri Oct 29, 2024
6e183bd
see if this test passes
simone-silvestri Oct 30, 2024
bd84d38
Merge branch 'main' into ss/fix-gpu-tests
simone-silvestri Oct 31, 2024
c56b15b
maybe precompiling before...
simone-silvestri Oct 31, 2024
371a45b
Merge branch 'ss/fix-gpu-tests' of github.com:CliMA/Oceananigans.jl i…
simone-silvestri Oct 31, 2024
e30973f
double O0
simone-silvestri Oct 31, 2024
e4cb16e
back to previous clima_common
simone-silvestri Oct 31, 2024
0c1f01c
another quick test
simone-silvestri Nov 1, 2024
bec1cd1
change environment
simone-silvestri Nov 1, 2024
75546af
correct the utils
simone-silvestri Nov 1, 2024
5f49ec0
Merge branch 'main' into ss/fix-gpu-tests
simone-silvestri Nov 1, 2024
9b334af
this should load mpitrampoline
simone-silvestri Nov 4, 2024
f8c6401
Fix formatting
glwagner Nov 4, 2024
1dc42bb
Go back to latest climacommon
glwagner Nov 4, 2024
5a870e7
try adding Manifest
simone-silvestri Nov 5, 2024
9e63f56
Manifest from julia 1.10
simone-silvestri Nov 5, 2024
59548f8
we probably need to initialize on a GPU
simone-silvestri Nov 5, 2024
642cfd9
these options should not create problems
simone-silvestri Nov 5, 2024
4cee49a
let's see if this differs
simone-silvestri Nov 5, 2024
a46b25d
just version infos
simone-silvestri Nov 5, 2024
4dffbe5
fiddling with O0
simone-silvestri Nov 6, 2024
9c3c6cd
why are we using 8 threads?
simone-silvestri Nov 6, 2024
3b28ecb
memory requirements are not this huge
simone-silvestri Nov 6, 2024
7126c7c
speed up the precompilation a bit, to revert later
simone-silvestri Nov 6, 2024
733ab2b
might this be the culprit?
simone-silvestri Nov 6, 2024
2dbf1a0
revert to 8 tasks to precompile
simone-silvestri Nov 6, 2024
a4b129a
final version?
simone-silvestri Nov 6, 2024
29f7d69
return to previous state of affairs
simone-silvestri Nov 6, 2024
b174313
reinclude enzyme
simone-silvestri Nov 6, 2024
0283e6a
set cuda runtime version
simone-silvestri Nov 6, 2024
b4c1f2a
will this help in finding cuda?
simone-silvestri Nov 6, 2024
bc53a97
make sure we don't run OOM
simone-silvestri Nov 6, 2024
811bfdb
bugfix in `set!`
simone-silvestri Nov 7, 2024
cd86a6a
try precompile inside runtests
simone-silvestri Nov 7, 2024
4039299
revert back
simone-silvestri Nov 7, 2024
2c6ad90
recompile everywhere
simone-silvestri Nov 7, 2024
781992c
try nuclear option
simone-silvestri Nov 7, 2024
08949b3
skip all these commands
simone-silvestri Nov 7, 2024
908b31a
some failsafe option
simone-silvestri Nov 7, 2024
466ec0c
increase a bit the memory
simone-silvestri Nov 7, 2024
a27b383
comment
simone-silvestri Nov 7, 2024
eec18c2
whoops unit tests are small
simone-silvestri Nov 7, 2024
62c5834
Merge branch 'main' into ss/fix-gpu-tests
simone-silvestri Nov 8, 2024
8011ef5
increase memory limits
simone-silvestri Nov 8, 2024
0965067
Merge branch 'ss/fix-gpu-tests' of github.com:CliMA/Oceananigans.jl i…
simone-silvestri Nov 8, 2024
cd00381
tests were running on the CPU on sverdrup
simone-silvestri Nov 8, 2024
eebfc04
Merge branch 'main' into ss/fix-gpu-tests
simone-silvestri Nov 8, 2024
8fc903e
Merge branch 'main' into ss/fix-gpu-tests
navidcy Nov 9, 2024
67 changes: 53 additions & 14 deletions .buildkite/distributed/pipeline.yml
@@ -1,7 +1,7 @@
agents:
queue: new-central
slurm_mem: 8G
modules: climacommon/2024_10_09
modules: climacommon/2024_10_08

env:
JULIA_LOAD_PATH: "${JULIA_LOAD_PATH}:${BUILDKITE_BUILD_CHECKOUT_PATH}/.buildkite/distributed"
@@ -16,60 +16,83 @@ steps:
key: "init_central"
env:
TEST_GROUP: "init"
GPU_TEST: "true"
command:
- echo "--- Instantiate project"
- "julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
- "julia -O0 --project -e 'using Pkg; Pkg.instantiate(; verbose=true); Pkg.precompile(; strict=true)'"

# Force the initialization of the CUDA runtime as it is lazily loaded by default
- "julia -O0 --project -e 'using CUDA; CUDA.precompile_runtime(); CUDA.versioninfo()'"
- "julia -O0 --project -e 'using MPI; MPI.versioninfo()'"

- echo "--- Initialize tests"
- "julia -O0 --project -e 'using Pkg; Pkg.test()'"
agents:
slurm_mem: 120G
slurm_gpus: 1
slurm_cpus_per_task: 8
slurm_mem: 8G
slurm_ntasks: 1
slurm_gpus_per_task: 1

- wait

- label: "🐉 cpu distributed unit tests"
key: "distributed_cpu"
env:
TEST_GROUP: "distributed"
MPI_TEST: "true"
commands:
- "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
agents:
slurm_mem: 120G
slurm_mem: 8G
Review comment (Member): why?

Reply (simone-silvestri, collaborator and author, Nov 7, 2024): 120G is much more memory than we need for these tests. After some frustration with tests that were extremely slow to start, I noticed that the agents started much more quickly when I requested a smaller memory allocation. So I deduce that the tests run on shared nodes instead of exclusive ones, and requesting fewer resources allows us to squeeze in when the cluster is busy.

Reply (Member): Good reason. It might warrant a comment.
slurm_ntasks: 4
retry:
automatic:
- exit_status: 1
limit: 1

- label: "🐲 gpu distributed unit tests"
key: "distributed_gpu"
env:
TEST_GROUP: "distributed"
GPU_TEST: "true"
MPI_TEST: "true"
commands:
- "julia -O0 --project -e 'using CUDA; CUDA.precompile_runtime(); CUDA.versioninfo()'"
- "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
agents:
slurm_mem: 120G
slurm_mem: 8G
slurm_ntasks: 4
slurm_gpus_per_task: 1
retry:
automatic:
- exit_status: 1
limit: 1


- label: "🦾 cpu distributed solvers tests"
key: "distributed_solvers_cpu"
env:
TEST_GROUP: "distributed_solvers"
MPI_TEST: "true"
commands:
- "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
agents:
slurm_mem: 120G
slurm_mem: 8G
slurm_ntasks: 4
retry:
automatic:
- exit_status: 1
limit: 1

- label: "🛸 gpu distributed solvers tests"
key: "distributed_solvers_gpu"
env:
TEST_GROUP: "distributed_solvers"
GPU_TEST: "true"
MPI_TEST: "true"
commands:
- "julia -O0 --project -e 'using CUDA; CUDA.precompile_runtime(); CUDA.versioninfo()'"
- "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
agents:
slurm_mem: 120G
slurm_mem: 32G
slurm_ntasks: 4
slurm_gpus_per_task: 1
retry:
@@ -81,20 +104,28 @@ steps:
key: "distributed_hydrostatic_model_cpu"
env:
TEST_GROUP: "distributed_hydrostatic_model"
MPI_TEST: "true"
commands:
- "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
agents:
slurm_mem: 120G
slurm_mem: 32G
slurm_ntasks: 4
retry:
automatic:
- exit_status: 1
limit: 1

- label: "🦏 gpu distributed hydrostatic model tests"
key: "distributed_hydrostatic_model_gpu"
env:
TEST_GROUP: "distributed_hydrostatic_model"
GPU_TEST: "true"
MPI_TEST: "true"
commands:
- "julia -O0 --project -e 'using CUDA; CUDA.precompile_runtime(); CUDA.versioninfo()'"
- "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
agents:
slurm_mem: 120G
slurm_mem: 32G
slurm_ntasks: 4
slurm_gpus_per_task: 1
retry:
@@ -106,20 +137,28 @@ steps:
key: "distributed_nonhydrostatic_regression_cpu"
env:
TEST_GROUP: "distributed_nonhydrostatic_regression"
MPI_TEST: "true"
commands:
- "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
agents:
slurm_mem: 120G
slurm_mem: 32G
slurm_ntasks: 4
retry:
automatic:
- exit_status: 1
limit: 1

- label: "🕺 gpu distributed nonhydrostatic regression"
key: "distributed_nonhydrostatic_regression_gpu"
env:
TEST_GROUP: "distributed_nonhydrostatic_regression"
GPU_TEST: "true"
MPI_TEST: "true"
commands:
- "julia -O0 --project -e 'using CUDA; CUDA.precompile_runtime(); CUDA.versioninfo()'"
- "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'"
agents:
slurm_mem: 120G
slurm_mem: 32G
slurm_ntasks: 4
slurm_gpus_per_task: 1
retry:
7 changes: 4 additions & 3 deletions Project.toml
@@ -50,7 +50,7 @@ CubedSphere = "0.2, 0.3"
Dates = "1.9"
Distances = "0.10"
DocStringExtensions = "0.8, 0.9"
Enzyme = "0.13.3"
Enzyme = "0.13.14"
FFTW = "1"
Glob = "1.3"
IncompleteLU = "0.2"
@@ -77,10 +77,11 @@ julia = "1.9"

[extras]
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TimesDates = "bdfc003b-8df8-5c39-adcd-3a9087f5df4a"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
MPIPreferences = "3da0fdf6-3ccc-4f1b-acd9-58baa6c99267"

[targets]
test = ["DataDeps", "Enzyme", "SafeTestsets", "Test", "TimesDates"]
test = ["DataDeps", "SafeTestsets", "Test", "Enzyme", "MPIPreferences", "TimesDates"]
Review comment (Member): Was this the crucial part?
3 changes: 0 additions & 3 deletions test/runtests.jl
@@ -28,16 +28,13 @@ CUDA.allowscalar() do

# Initialization steps
if group == :init || group == :all
Pkg.instantiate(; verbose=true)
Pkg.precompile(; strict=true)
Pkg.status()

try
MPI.versioninfo()
catch; end

try
CUDA.precompile_runtime()
CUDA.versioninfo()
catch; end
end
4 changes: 1 addition & 3 deletions test/test_distributed_poisson_solvers.jl
@@ -121,9 +121,7 @@ function divergence_free_poisson_tridiagonal_solution(grid_points, ranks, stretc
return Array(interior(∇²ϕ)) ≈ Array(R)
end

@testset "Distributed FFT-based Poisson solver" begin
child_arch = test_child_arch()

@testset "Distributed FFT-based Poisson solver" begin
for topology in ((Periodic, Periodic, Periodic),
(Periodic, Periodic, Bounded),
(Periodic, Bounded, Bounded),
2 changes: 0 additions & 2 deletions test/test_distributed_transpose.jl
@@ -38,8 +38,6 @@ function test_transpose(grid_points, ranks, topo, child_arch)
end

@testset "Distributed Transpose" begin
child_arch = test_child_arch()

for topology in ((Periodic, Periodic, Periodic),
(Periodic, Periodic, Bounded),
(Periodic, Bounded, Bounded),
39 changes: 23 additions & 16 deletions test/utils_for_runtests.jl
@@ -3,21 +3,26 @@ using Oceananigans.DistributedComputations: Distributed, Partition, child_archit

import Oceananigans.Fields: interior

test_child_arch() = CUDA.has_cuda() ? GPU() : CPU()
# Are the tests running on the GPUs?
# Are the tests running in parallel?
child_arch = get(ENV, "GPU_TEST", nothing) == "true" ? GPU() : CPU()
mpi_test = get(ENV, "MPI_TEST", nothing) == "true"

function test_architectures()
child_arch = test_child_arch()

# If MPI is initialized with MPI.Comm_size > 0, we are running in parallel.
# We test several different configurations: `Partition(x = 4)`, `Partition(y = 4)`,
# `Partition(x = 2, y = 2)`, and different fractional subdivisions in x, y and xy
if MPI.Initialized() && MPI.Comm_size(MPI.COMM_WORLD) == 4
return (Distributed(child_arch; partition = Partition(4)),
Distributed(child_arch; partition = Partition(1, 4)),
Distributed(child_arch; partition = Partition(2, 2)),
Distributed(child_arch; partition = Partition(x = Fractional(1, 2, 3, 4))),
Distributed(child_arch; partition = Partition(y = Fractional(1, 2, 3, 4))),
Distributed(child_arch; partition = Partition(x = Fractional(1, 2), y = Equal())))
if mpi_test
if MPI.Initialized() && MPI.Comm_size(MPI.COMM_WORLD) == 4
return (Distributed(child_arch; partition = Partition(4)),
Distributed(child_arch; partition = Partition(1, 4)),
Distributed(child_arch; partition = Partition(2, 2)),
Distributed(child_arch; partition = Partition(x = Fractional(1, 2, 3, 4))),
Distributed(child_arch; partition = Partition(y = Fractional(1, 2, 3, 4))),
Distributed(child_arch; partition = Partition(x = Fractional(1, 2), y = Equal())))
else
return throw("The MPI partitioning is not correctly configured.")
end
else
return tuple(child_arch)
end
@@ -26,15 +31,17 @@ end
# For nonhydrostatic simulations we cannot use `Fractional` at the moment (requirements
# for the transpose are more stringent than for hydrostatic simulations).
function nonhydrostatic_regression_test_architectures()
child_arch = test_child_arch()

# If MPI is initialized with MPI.Comm_size > 0, we are running in parallel.
# We test 3 different configurations: `Partition(x = 4)`, `Partition(y = 4)`
# and `Partition(x = 2, y = 2)`
if MPI.Initialized() && MPI.Comm_size(MPI.COMM_WORLD) == 4
return (Distributed(child_arch; partition = Partition(4)),
Distributed(child_arch; partition = Partition(1, 4)),
Distributed(child_arch; partition = Partition(2, 2)))
if mpi_test
if MPI.Initialized() && MPI.Comm_size(MPI.COMM_WORLD) == 4
return (Distributed(child_arch; partition = Partition(4)),
Distributed(child_arch; partition = Partition(1, 4)),
Distributed(child_arch; partition = Partition(2, 2)))
else
return throw("The MPI partitioning is not correctly configured.")
end
else
return tuple(child_arch)
end
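
For context on how the new switches interact: child_arch and mpi_test are now read from the environment when this file is included, so the process launching the tests decides both the child architecture and whether distributed partitions are exercised. A minimal local sketch, assuming MPI.jl 0.20 or later (where mpiexec() returns a Cmd) and reusing only the variable names that appear in pipeline.yml and utils_for_runtests.jl:

    # Hypothetical local reproduction of the "distributed_cpu" pipeline step.
    # Four ranks are launched because test_architectures() requires
    # MPI.Comm_size(MPI.COMM_WORLD) == 4 before building Distributed architectures.
    using MPI

    test_cmd = `$(mpiexec()) -n 4 julia -O0 --project -e 'using Pkg; Pkg.test()'`

    # addenv keeps the current environment and adds the variables that the
    # Buildkite agents would set; add "GPU_TEST" => "true" to run the GPU variant.
    run(addenv(test_cmd, "TEST_GROUP" => "distributed", "MPI_TEST" => "true"))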