Merge pull request #67 from glotzerlab/fix-deadlock

Fix deadlock and message file retention on Frontier.
glotzerlab · Oct 27, 2023 · 164a106 · 164a106
2 parents b6ab253 + 4e13f89
commit 164a106
Show file tree

Hide file tree

Showing 9 changed files with 135 additions and 98 deletions.
diff --git a/documentation/frontier.md b/documentation/frontier.md
@@ -0,0 +1,38 @@
+# Tips for running on OLCF Frontier
+
+## Recommended configuration
+
+```
+max_cores_sim: 56
+max_cores_submission: 7168
+max_gpus_submission: 256
+max_walltime: 2
+enable_llvm: false
+enable_gpu: true
+```
+
+## Recommended template
+
+```
+{% extends "frontier.sh" %}
+
+{% block header %}
+    {{- super () -}}
+#SBATCH -C nvme
+{% endblock header %}
+{% block custom_content %}
+
+echo "Loading software environment."
+
+export GLOTZERLAB_SOFTWARE_ROOT=/mnt/bb/${USER}/software
+time srun --ntasks-per-node 1 mkdir ${GLOTZERLAB_SOFTWARE_ROOT}
+time srun --ntasks-per-node 1 tar --directory ${GLOTZERLAB_SOFTWARE_ROOT} -xpf ${MEMBERWORK}/mat110/software.tar
+source ${GLOTZERLAB_SOFTWARE_ROOT}/variables.sh
+
+{% endblock custom_content %}
+{% block body %}
+    {{- super () -}}
+
+echo "Completed job in $SECONDS seconds"
+{% endblock body %}
+```
diff --git a/hoomd_validation/alj_2d.py b/hoomd_validation/alj_2d.py
@@ -89,9 +89,9 @@ def alj_2d_create_initial_state(*jobs):
 
     init_diameter = CIRCUMCIRCLE_RADIUS * 2 * 1.15
 
-    device = hoomd.device.CPU(
-        communicator=communicator,
-        message_filename=job.fn('create_initial_state.log'))
+    device = hoomd.device.CPU(communicator=communicator,
+                              message_filename=util.get_message_filename(
+                                  job, 'create_initial_state.log'))
 
     num_particles = job.statepoint['num_particles']
     density = job.statepoint['density']
@@ -136,8 +136,7 @@ def alj_2d_create_initial_state(*jobs):
                           mode='wb')
 
     if communicator.rank == 0:
-        print(f'completed alj_2d_create_initial_state: '
-              f'{job} in {communicator.walltime} s')
+        print(f'completed alj_2d_create_initial_state: {job}')
 
 
 def make_md_simulation(job,
@@ -321,16 +320,15 @@ def alj_2d_nve_md_job(*jobs):
         elif device_name == 'cpu':
             device_cls = hoomd.device.CPU
 
-        device = device_cls(
-            communicator=communicator,
-            message_filename=job.fn(f'{sim_mode}_{device_name}.log'))
+        device = device_cls(communicator=communicator,
+                            message_filename=util.get_message_filename(
+                                job, f'{sim_mode}_{device_name}.log'))
         run_nve_md_sim(job,
                        device,
                        complete_filename=f'{sim_mode}_{device_name}_complete')
 
         if communicator.rank == 0:
-            print(f'completed alj_2d_{sim_mode}_{device_name} '
-                  f'{job} in {communicator.walltime} s')
+            print(f'completed alj_2d_{sim_mode}_{device_name}: {job}')
 
     nve_md_sampling_jobs.append(alj_2d_nve_md_job)
 

diff --git a/hoomd_validation/hard_disk.py b/hoomd_validation/hard_disk.py
@@ -99,9 +99,9 @@ def hard_disk_create_initial_state(*jobs):
     position_2d = list(itertools.product(x, repeat=2))[:num_particles]
 
     # create snapshot
-    device = hoomd.device.CPU(
-        communicator=communicator,
-        message_filename=job.fn('create_initial_state.log'))
+    device = hoomd.device.CPU(communicator=communicator,
+                              message_filename=util.get_message_filename(
+                                  job, 'create_initial_state.log'))
     snap = hoomd.Snapshot(communicator)
 
     if communicator.rank == 0:
@@ -129,8 +129,7 @@ def hard_disk_create_initial_state(*jobs):
                           mode='wb')
 
     if communicator.rank == 0:
-        print(f'completed hard_disk_create_initial_state: '
-              f'{job} in {communicator.walltime} s')
+        print(f'completed hard_disk_create_initial_state: {job}')
 
 
 def make_mc_simulation(job,
@@ -565,16 +564,15 @@ def sampling_operation(*jobs):
         elif device_name == 'cpu':
             device_cls = hoomd.device.CPU
 
-        device = device_cls(
-            communicator=communicator,
-            message_filename=job.fn(f'{mode}_{device_name}.log'))
+        device = device_cls(communicator=communicator,
+                            message_filename=util.get_message_filename(
+                                job, f'{mode}_{device_name}.log'))
 
         globals().get(f'run_{mode}_sim')(
             job, device, complete_filename=f'{mode}_{device_name}_complete')
 
         if communicator.rank == 0:
-            print(f'completed hard_disk_{mode}_{device_name} '
-                  f'{job} in {communicator.walltime} s')
+            print(f'completed hard_disk_{mode}_{device_name}: {job}')
 
     sampling_jobs.append(sampling_operation)
 

diff --git a/hoomd_validation/hard_sphere.py b/hoomd_validation/hard_sphere.py
@@ -94,9 +94,9 @@ def hard_sphere_create_initial_state(*jobs):
     position = list(itertools.product(x, repeat=3))[:num_particles]
 
     # create snapshot
-    device = hoomd.device.CPU(
-        communicator=communicator,
-        message_filename=job.fn('create_initial_state.log'))
+    device = hoomd.device.CPU(communicator=communicator,
+                              message_filename=util.get_message_filename(
+                                  job, 'create_initial_state.log'))
     snap = hoomd.Snapshot(device.communicator)
 
     if device.communicator.rank == 0:
@@ -124,8 +124,7 @@ def hard_sphere_create_initial_state(*jobs):
                           mode='wb')
 
     if communicator.rank == 0:
-        print(f'completed hard_sphere_create_initial_state: '
-              f'{job} in {communicator.walltime} s')
+        print(f'completed hard_sphere_create_initial_state: {job}')
 
 
 def make_mc_simulation(job,
@@ -444,16 +443,15 @@ def sampling_operation(*jobs):
         elif device_name == 'cpu':
             device_cls = hoomd.device.CPU
 
-        device = device_cls(
-            communicator=communicator,
-            message_filename=job.fn(f'run_{mode}_{device_name}.log'))
+        device = device_cls(communicator=communicator,
+                            message_filename=util.get_message_filename(
+                                job, f'run_{mode}_{device_name}.log'))
 
         globals().get(f'run_{mode}_sim')(
             job, device, complete_filename=f'{mode}_{device_name}_complete')
 
         if communicator.rank == 0:
-            print(f'completed hard_sphere_{mode}_{device_name} '
-                  f'{job} in {communicator.walltime} s')
+            print(f'completed hard_sphere_{mode}_{device_name}: {job}')
 
     sampling_jobs.append(sampling_operation)
 

diff --git a/hoomd_validation/lj_fluid.py b/hoomd_validation/lj_fluid.py
@@ -109,9 +109,9 @@ def lj_fluid_create_initial_state(*jobs):
         print('starting lj_fluid_create_initial_state:', job)
 
     sp = job.sp
-    device = hoomd.device.CPU(
-        communicator=communicator,
-        message_filename=job.fn('create_initial_state.log'))
+    device = hoomd.device.CPU(communicator=communicator,
+                              message_filename=util.get_message_filename(
+                                  job, 'create_initial_state.log'))
 
     box_volume = sp["num_particles"] / sp["density"]
     L = box_volume**(1 / 3.)
@@ -152,8 +152,7 @@ def lj_fluid_create_initial_state(*jobs):
                           mode='wb')
 
     if communicator.rank == 0:
-        print(f'completed lj_fluid_create_initial_state: '
-              f'{job} in {communicator.walltime} s')
+        print(f'completed lj_fluid_create_initial_state: {job}')
 
 
 #################################
@@ -408,9 +407,9 @@ def md_sampling_operation(*jobs):
         elif device_name == 'cpu':
             device_cls = hoomd.device.CPU
 
-        device = device_cls(
-            communicator=communicator,
-            message_filename=job.fn(f'{sim_mode}_{device_name}.log'))
+        device = device_cls(communicator=communicator,
+                            message_filename=util.get_message_filename(
+                                job, f'{sim_mode}_{device_name}.log'))
 
         run_md_sim(job,
                    device,
@@ -419,8 +418,7 @@ def md_sampling_operation(*jobs):
                    complete_filename=f'{sim_mode}_{device_name}_complete')
 
         if communicator.rank == 0:
-            print(f'completed lj_fluid_{sim_mode}_{device_name}: '
-                  f'{job} in {communicator.walltime} s')
+            print(f'completed lj_fluid_{sim_mode}_{device_name}: {job}')
 
     md_sampling_jobs.append(md_sampling_operation)
 
@@ -808,16 +806,15 @@ def sampling_operation(*jobs):
         elif device_name == 'cpu':
             device_cls = hoomd.device.CPU
 
-        device = device_cls(
-            communicator=communicator,
-            message_filename=job.fn(f'{mode}_mc_{device_name}.log'))
+        device = device_cls(communicator=communicator,
+                            message_filename=util.get_message_filename(
+                                job, f'{mode}_mc_{device_name}.log'))
 
         globals().get(f'run_{mode}_mc_sim')(
             job, device, complete_filename=f'{mode}_mc_{device_name}_complete')
 
         if communicator.rank == 0:
-            print(f'completed lj_fluid_{mode}_mc_{device_name} '
-                  f'{job} in {communicator.walltime} s')
+            print(f'completed lj_fluid_{mode}_mc_{device_name}: {job}')
 
     mc_sampling_jobs.append(sampling_operation)
 
@@ -1052,19 +1049,22 @@ def lj_fluid_compare_modes(*jobs):
             separate_nvt_npt=True)
 
         if quantity_name == "density":
-            print(f"Average npt_mc_cpu density {num_particles}:",
-                  avg_quantity['npt_mc_cpu'], '+/-',
-                  stderr_quantity['npt_mc_cpu'])
+            if 'npt_mc_cpu' in avg_quantity:
+                print(f"Average npt_mc_cpu density {num_particles}:",
+                      avg_quantity['npt_mc_cpu'], '+/-',
+                      stderr_quantity['npt_mc_cpu'])
             print(f"Average npt_md_cpu density {num_particles}:",
                   avg_quantity['npt_bussi_md_cpu'], '+/-',
                   stderr_quantity['npt_bussi_md_cpu'])
         if quantity_name == "pressure":
-            print(f"Average nvt_mc_cpu pressure {num_particles}:",
-                  avg_quantity['nvt_mc_cpu'], '+/-',
-                  stderr_quantity['nvt_mc_cpu'])
-            print(f"Average npt_mc_cpu pressure {num_particles}:",
-                  avg_quantity['npt_mc_cpu'], '+/-',
-                  stderr_quantity['npt_mc_cpu'])
+            if 'nvt_mc_cpu' in avg_quantity:
+                print(f"Average nvt_mc_cpu pressure {num_particles}:",
+                      avg_quantity['nvt_mc_cpu'], '+/-',
+                      stderr_quantity['nvt_mc_cpu'])
+            if 'npt_mc_cpu' in avg_quantity:
+                print(f"Average npt_mc_cpu pressure {num_particles}:",
+                      avg_quantity['npt_mc_cpu'], '+/-',
+                      stderr_quantity['npt_mc_cpu'])
 
     filename = f'lj_fluid_compare_kT{kT}_density{round(set_density, 2)}_' \
                f'r_cut{round(jobs[0].statepoint.r_cut, 2)}_' \
@@ -1339,17 +1339,16 @@ def lj_fluid_nve_md_job(*jobs):
         elif device_name == 'cpu':
             device_cls = hoomd.device.CPU
 
-        device = device_cls(
-            communicator=communicator,
-            message_filename=job.fn(f'{sim_mode}_{device_name}.log'))
+        device = device_cls(communicator=communicator,
+                            message_filename=util.get_message_filename(
+                                job, f'{sim_mode}_{device_name}.log'))
         run_nve_md_sim(job,
                        device,
                        run_length=run_length,
                        complete_filename=f'{sim_mode}_{device_name}_complete')
 
         if communicator.rank == 0:
-            print(f'completed lj_fluid_{sim_mode}_{device_name} '
-                  f'{job} in {communicator.walltime} s')
+            print(f'completed lj_fluid_{sim_mode}_{device_name} {job}')
 
     nve_md_sampling_jobs.append(lj_fluid_nve_md_job)
 

diff --git a/hoomd_validation/lj_union.py b/hoomd_validation/lj_union.py
@@ -97,9 +97,9 @@ def lj_union_create_initial_state(*jobs):
         print('starting lj_union_create_initial_state:', job)
 
     sp = job.sp
-    device = hoomd.device.CPU(
-        communicator=communicator,
-        message_filename=job.fn('create_initial_state.log'))
+    device = hoomd.device.CPU(communicator=communicator,
+                              message_filename=util.get_message_filename(
+                                  job, 'create_initial_state.log'))
 
     box_volume = sp["num_particles"] / sp["density"]
     L = box_volume**(1 / 3.)
@@ -158,8 +158,7 @@ def lj_union_create_initial_state(*jobs):
                           mode='wb')
 
     if communicator.rank == 0:
-        print(f'completed lj_union_create_initial_state: '
-              f'{job} in {communicator.walltime} s')
+        print(f'completed lj_union_create_initial_state: {job}')
 
 
 #################################
@@ -438,9 +437,9 @@ def md_sampling_operation(*jobs):
         elif device_name == 'cpu':
             device_cls = hoomd.device.CPU
 
-        device = device_cls(
-            communicator=communicator,
-            message_filename=job.fn(f'{sim_mode}_{device_name}.log'))
+        device = device_cls(communicator=communicator,
+                            message_filename=util.get_message_filename(
+                                job, f'{sim_mode}_{device_name}.log'))
 
         run_md_sim(job,
                    device,
@@ -449,8 +448,7 @@ def md_sampling_operation(*jobs):
                    complete_filename=f'{sim_mode}_{device_name}_complete')
 
         if communicator.rank == 0:
-            print(f'completed lj_union_{sim_mode}_{device_name}: '
-                  f'{job} in {communicator.walltime} s')
+            print(f'completed lj_union_{sim_mode}_{device_name}: {job}')
 
     md_sampling_jobs.append(md_sampling_operation)
 
@@ -866,16 +864,15 @@ def sampling_operation(*jobs):
         elif device_name == 'cpu':
             device_cls = hoomd.device.CPU
 
-        device = device_cls(
-            communicator=communicator,
-            message_filename=job.fn(f'{mode}_mc_{device_name}.log'))
+        device = device_cls(communicator=communicator,
+                            message_filename=util.get_message_filename(
+                                job, f'{mode}_mc_{device_name}.log'))
 
         globals().get(f'run_{mode}_mc_sim')(
             job, device, complete_filename=f'{mode}_mc_{device_name}_complete')
 
         if communicator.rank == 0:
-            print(f'completed lj_union_{mode}_mc_{device_name} '
-                  f'{job} in {communicator.walltime} s')
+            print(f'completed lj_union_{mode}_mc_{device_name} {job}')
 
     mc_sampling_jobs.append(sampling_operation)
 
@@ -1107,9 +1104,10 @@ def lj_union_compare_modes(*jobs):
             separate_nvt_npt=True)
 
         if quantity_name == "density":
-            print(f"Average npt_mc_cpu density {num_particles}:",
-                  avg_quantity['npt_mc_cpu'], '+/-',
-                  stderr_quantity['npt_mc_cpu'])
+            if 'npt_mc_cpu' in avg_quantity:
+                print(f"Average npt_mc_cpu density {num_particles}:",
+                      avg_quantity['npt_mc_cpu'], '+/-',
+                      stderr_quantity['npt_mc_cpu'])
             print(f"Average npt_md_cpu density {num_particles}:",
                   avg_quantity['npt_bussi_md_cpu'], '+/-',
                   stderr_quantity['npt_bussi_md_cpu'])
@@ -1421,17 +1419,16 @@ def lj_union_nve_md_job(*jobs):
         elif device_name == 'cpu':
             device_cls = hoomd.device.CPU
 
-        device = device_cls(
-            communicator=communicator,
-            message_filename=job.fn(f'{sim_mode}_{device_name}.log'))
+        device = device_cls(communicator=communicator,
+                            message_filename=util.get_message_filename(
+                                job, f'{sim_mode}_{device_name}.log'))
         run_nve_md_sim(job,
                        device,
                        run_length=run_length,
                        complete_filename=f'{sim_mode}_{device_name}_complete')
 
         if communicator.rank == 0:
-            print(f'completed lj_union_{sim_mode}_{device_name} '
-                  f'{job} in {communicator.walltime} s')
+            print(f'completed lj_union_{sim_mode}_{device_name} {job}')
 
     nve_md_sampling_jobs.append(lj_union_nve_md_job)