Skip to content

Commit

Permalink
Merge pull request #67 from glotzerlab/fix-deadlock
Browse files Browse the repository at this point in the history
Fix deadlock and message file retention on Frontier.
  • Loading branch information
joaander authored Oct 27, 2023
2 parents b6ab253 + 4e13f89 commit 164a106
Show file tree
Hide file tree
Showing 9 changed files with 135 additions and 98 deletions.
38 changes: 38 additions & 0 deletions documentation/frontier.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Tips for running on OLCF Frontier

## Recommended configuration

```
max_cores_sim: 56
max_cores_submission: 7168
max_gpus_submission: 256
max_walltime: 2
enable_llvm: false
enable_gpu: true
```

## Recommended template

```
{% extends "frontier.sh" %}
{% block header %}
{{- super () -}}
#SBATCH -C nvme
{% endblock header %}
{% block custom_content %}
echo "Loading software environment."
export GLOTZERLAB_SOFTWARE_ROOT=/mnt/bb/${USER}/software
time srun --ntasks-per-node 1 mkdir ${GLOTZERLAB_SOFTWARE_ROOT}
time srun --ntasks-per-node 1 tar --directory ${GLOTZERLAB_SOFTWARE_ROOT} -xpf ${MEMBERWORK}/mat110/software.tar
source ${GLOTZERLAB_SOFTWARE_ROOT}/variables.sh
{% endblock custom_content %}
{% block body %}
{{- super () -}}
echo "Completed job in $SECONDS seconds"
{% endblock body %}
```
18 changes: 8 additions & 10 deletions hoomd_validation/alj_2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ def alj_2d_create_initial_state(*jobs):

init_diameter = CIRCUMCIRCLE_RADIUS * 2 * 1.15

device = hoomd.device.CPU(
communicator=communicator,
message_filename=job.fn('create_initial_state.log'))
device = hoomd.device.CPU(communicator=communicator,
message_filename=util.get_message_filename(
job, 'create_initial_state.log'))

num_particles = job.statepoint['num_particles']
density = job.statepoint['density']
Expand Down Expand Up @@ -136,8 +136,7 @@ def alj_2d_create_initial_state(*jobs):
mode='wb')

if communicator.rank == 0:
print(f'completed alj_2d_create_initial_state: '
f'{job} in {communicator.walltime} s')
print(f'completed alj_2d_create_initial_state: {job}')


def make_md_simulation(job,
Expand Down Expand Up @@ -321,16 +320,15 @@ def alj_2d_nve_md_job(*jobs):
elif device_name == 'cpu':
device_cls = hoomd.device.CPU

device = device_cls(
communicator=communicator,
message_filename=job.fn(f'{sim_mode}_{device_name}.log'))
device = device_cls(communicator=communicator,
message_filename=util.get_message_filename(
job, f'{sim_mode}_{device_name}.log'))
run_nve_md_sim(job,
device,
complete_filename=f'{sim_mode}_{device_name}_complete')

if communicator.rank == 0:
print(f'completed alj_2d_{sim_mode}_{device_name} '
f'{job} in {communicator.walltime} s')
print(f'completed alj_2d_{sim_mode}_{device_name}: {job}')

nve_md_sampling_jobs.append(alj_2d_nve_md_job)

Expand Down
18 changes: 8 additions & 10 deletions hoomd_validation/hard_disk.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,9 @@ def hard_disk_create_initial_state(*jobs):
position_2d = list(itertools.product(x, repeat=2))[:num_particles]

# create snapshot
device = hoomd.device.CPU(
communicator=communicator,
message_filename=job.fn('create_initial_state.log'))
device = hoomd.device.CPU(communicator=communicator,
message_filename=util.get_message_filename(
job, 'create_initial_state.log'))
snap = hoomd.Snapshot(communicator)

if communicator.rank == 0:
Expand Down Expand Up @@ -129,8 +129,7 @@ def hard_disk_create_initial_state(*jobs):
mode='wb')

if communicator.rank == 0:
print(f'completed hard_disk_create_initial_state: '
f'{job} in {communicator.walltime} s')
print(f'completed hard_disk_create_initial_state: {job}')


def make_mc_simulation(job,
Expand Down Expand Up @@ -565,16 +564,15 @@ def sampling_operation(*jobs):
elif device_name == 'cpu':
device_cls = hoomd.device.CPU

device = device_cls(
communicator=communicator,
message_filename=job.fn(f'{mode}_{device_name}.log'))
device = device_cls(communicator=communicator,
message_filename=util.get_message_filename(
job, f'{mode}_{device_name}.log'))

globals().get(f'run_{mode}_sim')(
job, device, complete_filename=f'{mode}_{device_name}_complete')

if communicator.rank == 0:
print(f'completed hard_disk_{mode}_{device_name} '
f'{job} in {communicator.walltime} s')
print(f'completed hard_disk_{mode}_{device_name}: {job}')

sampling_jobs.append(sampling_operation)

Expand Down
18 changes: 8 additions & 10 deletions hoomd_validation/hard_sphere.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,9 @@ def hard_sphere_create_initial_state(*jobs):
position = list(itertools.product(x, repeat=3))[:num_particles]

# create snapshot
device = hoomd.device.CPU(
communicator=communicator,
message_filename=job.fn('create_initial_state.log'))
device = hoomd.device.CPU(communicator=communicator,
message_filename=util.get_message_filename(
job, 'create_initial_state.log'))
snap = hoomd.Snapshot(device.communicator)

if device.communicator.rank == 0:
Expand Down Expand Up @@ -124,8 +124,7 @@ def hard_sphere_create_initial_state(*jobs):
mode='wb')

if communicator.rank == 0:
print(f'completed hard_sphere_create_initial_state: '
f'{job} in {communicator.walltime} s')
print(f'completed hard_sphere_create_initial_state: {job}')


def make_mc_simulation(job,
Expand Down Expand Up @@ -444,16 +443,15 @@ def sampling_operation(*jobs):
elif device_name == 'cpu':
device_cls = hoomd.device.CPU

device = device_cls(
communicator=communicator,
message_filename=job.fn(f'run_{mode}_{device_name}.log'))
device = device_cls(communicator=communicator,
message_filename=util.get_message_filename(
job, f'run_{mode}_{device_name}.log'))

globals().get(f'run_{mode}_sim')(
job, device, complete_filename=f'{mode}_{device_name}_complete')

if communicator.rank == 0:
print(f'completed hard_sphere_{mode}_{device_name} '
f'{job} in {communicator.walltime} s')
print(f'completed hard_sphere_{mode}_{device_name}: {job}')

sampling_jobs.append(sampling_operation)

Expand Down
57 changes: 28 additions & 29 deletions hoomd_validation/lj_fluid.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,9 @@ def lj_fluid_create_initial_state(*jobs):
print('starting lj_fluid_create_initial_state:', job)

sp = job.sp
device = hoomd.device.CPU(
communicator=communicator,
message_filename=job.fn('create_initial_state.log'))
device = hoomd.device.CPU(communicator=communicator,
message_filename=util.get_message_filename(
job, 'create_initial_state.log'))

box_volume = sp["num_particles"] / sp["density"]
L = box_volume**(1 / 3.)
Expand Down Expand Up @@ -152,8 +152,7 @@ def lj_fluid_create_initial_state(*jobs):
mode='wb')

if communicator.rank == 0:
print(f'completed lj_fluid_create_initial_state: '
f'{job} in {communicator.walltime} s')
print(f'completed lj_fluid_create_initial_state: {job}')


#################################
Expand Down Expand Up @@ -408,9 +407,9 @@ def md_sampling_operation(*jobs):
elif device_name == 'cpu':
device_cls = hoomd.device.CPU

device = device_cls(
communicator=communicator,
message_filename=job.fn(f'{sim_mode}_{device_name}.log'))
device = device_cls(communicator=communicator,
message_filename=util.get_message_filename(
job, f'{sim_mode}_{device_name}.log'))

run_md_sim(job,
device,
Expand All @@ -419,8 +418,7 @@ def md_sampling_operation(*jobs):
complete_filename=f'{sim_mode}_{device_name}_complete')

if communicator.rank == 0:
print(f'completed lj_fluid_{sim_mode}_{device_name}: '
f'{job} in {communicator.walltime} s')
print(f'completed lj_fluid_{sim_mode}_{device_name}: {job}')

md_sampling_jobs.append(md_sampling_operation)

Expand Down Expand Up @@ -808,16 +806,15 @@ def sampling_operation(*jobs):
elif device_name == 'cpu':
device_cls = hoomd.device.CPU

device = device_cls(
communicator=communicator,
message_filename=job.fn(f'{mode}_mc_{device_name}.log'))
device = device_cls(communicator=communicator,
message_filename=util.get_message_filename(
job, f'{mode}_mc_{device_name}.log'))

globals().get(f'run_{mode}_mc_sim')(
job, device, complete_filename=f'{mode}_mc_{device_name}_complete')

if communicator.rank == 0:
print(f'completed lj_fluid_{mode}_mc_{device_name} '
f'{job} in {communicator.walltime} s')
print(f'completed lj_fluid_{mode}_mc_{device_name}: {job}')

mc_sampling_jobs.append(sampling_operation)

Expand Down Expand Up @@ -1052,19 +1049,22 @@ def lj_fluid_compare_modes(*jobs):
separate_nvt_npt=True)

if quantity_name == "density":
print(f"Average npt_mc_cpu density {num_particles}:",
avg_quantity['npt_mc_cpu'], '+/-',
stderr_quantity['npt_mc_cpu'])
if 'npt_mc_cpu' in avg_quantity:
print(f"Average npt_mc_cpu density {num_particles}:",
avg_quantity['npt_mc_cpu'], '+/-',
stderr_quantity['npt_mc_cpu'])
print(f"Average npt_md_cpu density {num_particles}:",
avg_quantity['npt_bussi_md_cpu'], '+/-',
stderr_quantity['npt_bussi_md_cpu'])
if quantity_name == "pressure":
print(f"Average nvt_mc_cpu pressure {num_particles}:",
avg_quantity['nvt_mc_cpu'], '+/-',
stderr_quantity['nvt_mc_cpu'])
print(f"Average npt_mc_cpu pressure {num_particles}:",
avg_quantity['npt_mc_cpu'], '+/-',
stderr_quantity['npt_mc_cpu'])
if 'nvt_mc_cpu' in avg_quantity:
print(f"Average nvt_mc_cpu pressure {num_particles}:",
avg_quantity['nvt_mc_cpu'], '+/-',
stderr_quantity['nvt_mc_cpu'])
if 'npt_mc_cpu' in avg_quantity:
print(f"Average npt_mc_cpu pressure {num_particles}:",
avg_quantity['npt_mc_cpu'], '+/-',
stderr_quantity['npt_mc_cpu'])

filename = f'lj_fluid_compare_kT{kT}_density{round(set_density, 2)}_' \
f'r_cut{round(jobs[0].statepoint.r_cut, 2)}_' \
Expand Down Expand Up @@ -1339,17 +1339,16 @@ def lj_fluid_nve_md_job(*jobs):
elif device_name == 'cpu':
device_cls = hoomd.device.CPU

device = device_cls(
communicator=communicator,
message_filename=job.fn(f'{sim_mode}_{device_name}.log'))
device = device_cls(communicator=communicator,
message_filename=util.get_message_filename(
job, f'{sim_mode}_{device_name}.log'))
run_nve_md_sim(job,
device,
run_length=run_length,
complete_filename=f'{sim_mode}_{device_name}_complete')

if communicator.rank == 0:
print(f'completed lj_fluid_{sim_mode}_{device_name} '
f'{job} in {communicator.walltime} s')
print(f'completed lj_fluid_{sim_mode}_{device_name} {job}')

nve_md_sampling_jobs.append(lj_fluid_nve_md_job)

Expand Down
43 changes: 20 additions & 23 deletions hoomd_validation/lj_union.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,9 @@ def lj_union_create_initial_state(*jobs):
print('starting lj_union_create_initial_state:', job)

sp = job.sp
device = hoomd.device.CPU(
communicator=communicator,
message_filename=job.fn('create_initial_state.log'))
device = hoomd.device.CPU(communicator=communicator,
message_filename=util.get_message_filename(
job, 'create_initial_state.log'))

box_volume = sp["num_particles"] / sp["density"]
L = box_volume**(1 / 3.)
Expand Down Expand Up @@ -158,8 +158,7 @@ def lj_union_create_initial_state(*jobs):
mode='wb')

if communicator.rank == 0:
print(f'completed lj_union_create_initial_state: '
f'{job} in {communicator.walltime} s')
print(f'completed lj_union_create_initial_state: {job}')


#################################
Expand Down Expand Up @@ -438,9 +437,9 @@ def md_sampling_operation(*jobs):
elif device_name == 'cpu':
device_cls = hoomd.device.CPU

device = device_cls(
communicator=communicator,
message_filename=job.fn(f'{sim_mode}_{device_name}.log'))
device = device_cls(communicator=communicator,
message_filename=util.get_message_filename(
job, f'{sim_mode}_{device_name}.log'))

run_md_sim(job,
device,
Expand All @@ -449,8 +448,7 @@ def md_sampling_operation(*jobs):
complete_filename=f'{sim_mode}_{device_name}_complete')

if communicator.rank == 0:
print(f'completed lj_union_{sim_mode}_{device_name}: '
f'{job} in {communicator.walltime} s')
print(f'completed lj_union_{sim_mode}_{device_name}: {job}')

md_sampling_jobs.append(md_sampling_operation)

Expand Down Expand Up @@ -866,16 +864,15 @@ def sampling_operation(*jobs):
elif device_name == 'cpu':
device_cls = hoomd.device.CPU

device = device_cls(
communicator=communicator,
message_filename=job.fn(f'{mode}_mc_{device_name}.log'))
device = device_cls(communicator=communicator,
message_filename=util.get_message_filename(
job, f'{mode}_mc_{device_name}.log'))

globals().get(f'run_{mode}_mc_sim')(
job, device, complete_filename=f'{mode}_mc_{device_name}_complete')

if communicator.rank == 0:
print(f'completed lj_union_{mode}_mc_{device_name} '
f'{job} in {communicator.walltime} s')
print(f'completed lj_union_{mode}_mc_{device_name} {job}')

mc_sampling_jobs.append(sampling_operation)

Expand Down Expand Up @@ -1107,9 +1104,10 @@ def lj_union_compare_modes(*jobs):
separate_nvt_npt=True)

if quantity_name == "density":
print(f"Average npt_mc_cpu density {num_particles}:",
avg_quantity['npt_mc_cpu'], '+/-',
stderr_quantity['npt_mc_cpu'])
if 'npt_mc_cpu' in avg_quantity:
print(f"Average npt_mc_cpu density {num_particles}:",
avg_quantity['npt_mc_cpu'], '+/-',
stderr_quantity['npt_mc_cpu'])
print(f"Average npt_md_cpu density {num_particles}:",
avg_quantity['npt_bussi_md_cpu'], '+/-',
stderr_quantity['npt_bussi_md_cpu'])
Expand Down Expand Up @@ -1421,17 +1419,16 @@ def lj_union_nve_md_job(*jobs):
elif device_name == 'cpu':
device_cls = hoomd.device.CPU

device = device_cls(
communicator=communicator,
message_filename=job.fn(f'{sim_mode}_{device_name}.log'))
device = device_cls(communicator=communicator,
message_filename=util.get_message_filename(
job, f'{sim_mode}_{device_name}.log'))
run_nve_md_sim(job,
device,
run_length=run_length,
complete_filename=f'{sim_mode}_{device_name}_complete')

if communicator.rank == 0:
print(f'completed lj_union_{sim_mode}_{device_name} '
f'{job} in {communicator.walltime} s')
print(f'completed lj_union_{sim_mode}_{device_name} {job}')

nve_md_sampling_jobs.append(lj_union_nve_md_job)

Expand Down
Loading

0 comments on commit 164a106

Please sign in to comment.