diff --git a/documentation/frontier.md b/documentation/frontier.md new file mode 100644 index 00000000..c7a51992 --- /dev/null +++ b/documentation/frontier.md @@ -0,0 +1,38 @@ +# Tips for running on OLCF Frontier + +## Recommended configuration + +``` +max_cores_sim: 56 +max_cores_submission: 7168 +max_gpus_submission: 256 +max_walltime: 2 +enable_llvm: false +enable_gpu: true +``` + +## Recommended template + +``` +{% extends "frontier.sh" %} + +{% block header %} + {{- super () -}} +#SBATCH -C nvme +{% endblock header %} +{% block custom_content %} + +echo "Loading software environment." + +export GLOTZERLAB_SOFTWARE_ROOT=/mnt/bb/${USER}/software +time srun --ntasks-per-node 1 mkdir ${GLOTZERLAB_SOFTWARE_ROOT} +time srun --ntasks-per-node 1 tar --directory ${GLOTZERLAB_SOFTWARE_ROOT} -xpf ${MEMBERWORK}/mat110/software.tar +source ${GLOTZERLAB_SOFTWARE_ROOT}/variables.sh + +{% endblock custom_content %} +{% block body %} + {{- super () -}} + +echo "Completed job in $SECONDS seconds" +{% endblock body %} +``` diff --git a/hoomd_validation/alj_2d.py b/hoomd_validation/alj_2d.py index f5ea8af6..97a2be53 100644 --- a/hoomd_validation/alj_2d.py +++ b/hoomd_validation/alj_2d.py @@ -89,9 +89,9 @@ def alj_2d_create_initial_state(*jobs): init_diameter = CIRCUMCIRCLE_RADIUS * 2 * 1.15 - device = hoomd.device.CPU( - communicator=communicator, - message_filename=job.fn('create_initial_state.log')) + device = hoomd.device.CPU(communicator=communicator, + message_filename=util.get_message_filename( + job, 'create_initial_state.log')) num_particles = job.statepoint['num_particles'] density = job.statepoint['density'] @@ -136,8 +136,7 @@ def alj_2d_create_initial_state(*jobs): mode='wb') if communicator.rank == 0: - print(f'completed alj_2d_create_initial_state: ' - f'{job} in {communicator.walltime} s') + print(f'completed alj_2d_create_initial_state: {job}') def make_md_simulation(job, @@ -321,16 +320,15 @@ def alj_2d_nve_md_job(*jobs): elif device_name == 'cpu': device_cls = 
hoomd.device.CPU - device = device_cls( - communicator=communicator, - message_filename=job.fn(f'{sim_mode}_{device_name}.log')) + device = device_cls(communicator=communicator, + message_filename=util.get_message_filename( + job, f'{sim_mode}_{device_name}.log')) run_nve_md_sim(job, device, complete_filename=f'{sim_mode}_{device_name}_complete') if communicator.rank == 0: - print(f'completed alj_2d_{sim_mode}_{device_name} ' - f'{job} in {communicator.walltime} s') + print(f'completed alj_2d_{sim_mode}_{device_name}: {job}') nve_md_sampling_jobs.append(alj_2d_nve_md_job) diff --git a/hoomd_validation/hard_disk.py b/hoomd_validation/hard_disk.py index 8a2067a7..1a895b26 100644 --- a/hoomd_validation/hard_disk.py +++ b/hoomd_validation/hard_disk.py @@ -99,9 +99,9 @@ def hard_disk_create_initial_state(*jobs): position_2d = list(itertools.product(x, repeat=2))[:num_particles] # create snapshot - device = hoomd.device.CPU( - communicator=communicator, - message_filename=job.fn('create_initial_state.log')) + device = hoomd.device.CPU(communicator=communicator, + message_filename=util.get_message_filename( + job, 'create_initial_state.log')) snap = hoomd.Snapshot(communicator) if communicator.rank == 0: @@ -129,8 +129,7 @@ def hard_disk_create_initial_state(*jobs): mode='wb') if communicator.rank == 0: - print(f'completed hard_disk_create_initial_state: ' - f'{job} in {communicator.walltime} s') + print(f'completed hard_disk_create_initial_state: {job}') def make_mc_simulation(job, @@ -565,16 +564,15 @@ def sampling_operation(*jobs): elif device_name == 'cpu': device_cls = hoomd.device.CPU - device = device_cls( - communicator=communicator, - message_filename=job.fn(f'{mode}_{device_name}.log')) + device = device_cls(communicator=communicator, + message_filename=util.get_message_filename( + job, f'{mode}_{device_name}.log')) globals().get(f'run_{mode}_sim')( job, device, complete_filename=f'{mode}_{device_name}_complete') if communicator.rank == 0: - print(f'completed 
hard_disk_{mode}_{device_name} ' - f'{job} in {communicator.walltime} s') + print(f'completed hard_disk_{mode}_{device_name}: {job}') sampling_jobs.append(sampling_operation) diff --git a/hoomd_validation/hard_sphere.py b/hoomd_validation/hard_sphere.py index 9c66c2d4..01327f49 100644 --- a/hoomd_validation/hard_sphere.py +++ b/hoomd_validation/hard_sphere.py @@ -94,9 +94,9 @@ def hard_sphere_create_initial_state(*jobs): position = list(itertools.product(x, repeat=3))[:num_particles] # create snapshot - device = hoomd.device.CPU( - communicator=communicator, - message_filename=job.fn('create_initial_state.log')) + device = hoomd.device.CPU(communicator=communicator, + message_filename=util.get_message_filename( + job, 'create_initial_state.log')) snap = hoomd.Snapshot(device.communicator) if device.communicator.rank == 0: @@ -124,8 +124,7 @@ def hard_sphere_create_initial_state(*jobs): mode='wb') if communicator.rank == 0: - print(f'completed hard_sphere_create_initial_state: ' - f'{job} in {communicator.walltime} s') + print(f'completed hard_sphere_create_initial_state: {job}') def make_mc_simulation(job, @@ -444,16 +443,15 @@ def sampling_operation(*jobs): elif device_name == 'cpu': device_cls = hoomd.device.CPU - device = device_cls( - communicator=communicator, - message_filename=job.fn(f'run_{mode}_{device_name}.log')) + device = device_cls(communicator=communicator, + message_filename=util.get_message_filename( + job, f'run_{mode}_{device_name}.log')) globals().get(f'run_{mode}_sim')( job, device, complete_filename=f'{mode}_{device_name}_complete') if communicator.rank == 0: - print(f'completed hard_sphere_{mode}_{device_name} ' - f'{job} in {communicator.walltime} s') + print(f'completed hard_sphere_{mode}_{device_name}: {job}') sampling_jobs.append(sampling_operation) diff --git a/hoomd_validation/lj_fluid.py b/hoomd_validation/lj_fluid.py index 9ca6a645..e51ba2e7 100644 --- a/hoomd_validation/lj_fluid.py +++ b/hoomd_validation/lj_fluid.py @@ -109,9 +109,9 
@@ def lj_fluid_create_initial_state(*jobs): print('starting lj_fluid_create_initial_state:', job) sp = job.sp - device = hoomd.device.CPU( - communicator=communicator, - message_filename=job.fn('create_initial_state.log')) + device = hoomd.device.CPU(communicator=communicator, + message_filename=util.get_message_filename( + job, 'create_initial_state.log')) box_volume = sp["num_particles"] / sp["density"] L = box_volume**(1 / 3.) @@ -152,8 +152,7 @@ def lj_fluid_create_initial_state(*jobs): mode='wb') if communicator.rank == 0: - print(f'completed lj_fluid_create_initial_state: ' - f'{job} in {communicator.walltime} s') + print(f'completed lj_fluid_create_initial_state: {job}') ################################# @@ -408,9 +407,9 @@ def md_sampling_operation(*jobs): elif device_name == 'cpu': device_cls = hoomd.device.CPU - device = device_cls( - communicator=communicator, - message_filename=job.fn(f'{sim_mode}_{device_name}.log')) + device = device_cls(communicator=communicator, + message_filename=util.get_message_filename( + job, f'{sim_mode}_{device_name}.log')) run_md_sim(job, device, @@ -419,8 +418,7 @@ def md_sampling_operation(*jobs): complete_filename=f'{sim_mode}_{device_name}_complete') if communicator.rank == 0: - print(f'completed lj_fluid_{sim_mode}_{device_name}: ' - f'{job} in {communicator.walltime} s') + print(f'completed lj_fluid_{sim_mode}_{device_name}: {job}') md_sampling_jobs.append(md_sampling_operation) @@ -808,16 +806,15 @@ def sampling_operation(*jobs): elif device_name == 'cpu': device_cls = hoomd.device.CPU - device = device_cls( - communicator=communicator, - message_filename=job.fn(f'{mode}_mc_{device_name}.log')) + device = device_cls(communicator=communicator, + message_filename=util.get_message_filename( + job, f'{mode}_mc_{device_name}.log')) globals().get(f'run_{mode}_mc_sim')( job, device, complete_filename=f'{mode}_mc_{device_name}_complete') if communicator.rank == 0: - print(f'completed lj_fluid_{mode}_mc_{device_name} ' - 
f'{job} in {communicator.walltime} s') + print(f'completed lj_fluid_{mode}_mc_{device_name}: {job}') mc_sampling_jobs.append(sampling_operation) @@ -1052,19 +1049,22 @@ def lj_fluid_compare_modes(*jobs): separate_nvt_npt=True) if quantity_name == "density": - print(f"Average npt_mc_cpu density {num_particles}:", - avg_quantity['npt_mc_cpu'], '+/-', - stderr_quantity['npt_mc_cpu']) + if 'npt_mc_cpu' in avg_quantity: + print(f"Average npt_mc_cpu density {num_particles}:", + avg_quantity['npt_mc_cpu'], '+/-', + stderr_quantity['npt_mc_cpu']) print(f"Average npt_md_cpu density {num_particles}:", avg_quantity['npt_bussi_md_cpu'], '+/-', stderr_quantity['npt_bussi_md_cpu']) if quantity_name == "pressure": - print(f"Average nvt_mc_cpu pressure {num_particles}:", - avg_quantity['nvt_mc_cpu'], '+/-', - stderr_quantity['nvt_mc_cpu']) - print(f"Average npt_mc_cpu pressure {num_particles}:", - avg_quantity['npt_mc_cpu'], '+/-', - stderr_quantity['npt_mc_cpu']) + if 'nvt_mc_cpu' in avg_quantity: + print(f"Average nvt_mc_cpu pressure {num_particles}:", + avg_quantity['nvt_mc_cpu'], '+/-', + stderr_quantity['nvt_mc_cpu']) + if 'npt_mc_cpu' in avg_quantity: + print(f"Average npt_mc_cpu pressure {num_particles}:", + avg_quantity['npt_mc_cpu'], '+/-', + stderr_quantity['npt_mc_cpu']) filename = f'lj_fluid_compare_kT{kT}_density{round(set_density, 2)}_' \ f'r_cut{round(jobs[0].statepoint.r_cut, 2)}_' \ @@ -1339,17 +1339,16 @@ def lj_fluid_nve_md_job(*jobs): elif device_name == 'cpu': device_cls = hoomd.device.CPU - device = device_cls( - communicator=communicator, - message_filename=job.fn(f'{sim_mode}_{device_name}.log')) + device = device_cls(communicator=communicator, + message_filename=util.get_message_filename( + job, f'{sim_mode}_{device_name}.log')) run_nve_md_sim(job, device, run_length=run_length, complete_filename=f'{sim_mode}_{device_name}_complete') if communicator.rank == 0: - print(f'completed lj_fluid_{sim_mode}_{device_name} ' - f'{job} in {communicator.walltime} s') 
+ print(f'completed lj_fluid_{sim_mode}_{device_name} {job}') nve_md_sampling_jobs.append(lj_fluid_nve_md_job) diff --git a/hoomd_validation/lj_union.py b/hoomd_validation/lj_union.py index 2bc060d7..64b0d218 100644 --- a/hoomd_validation/lj_union.py +++ b/hoomd_validation/lj_union.py @@ -97,9 +97,9 @@ def lj_union_create_initial_state(*jobs): print('starting lj_union_create_initial_state:', job) sp = job.sp - device = hoomd.device.CPU( - communicator=communicator, - message_filename=job.fn('create_initial_state.log')) + device = hoomd.device.CPU(communicator=communicator, + message_filename=util.get_message_filename( + job, 'create_initial_state.log')) box_volume = sp["num_particles"] / sp["density"] L = box_volume**(1 / 3.) @@ -158,8 +158,7 @@ def lj_union_create_initial_state(*jobs): mode='wb') if communicator.rank == 0: - print(f'completed lj_union_create_initial_state: ' - f'{job} in {communicator.walltime} s') + print(f'completed lj_union_create_initial_state: {job}') ################################# @@ -438,9 +437,9 @@ def md_sampling_operation(*jobs): elif device_name == 'cpu': device_cls = hoomd.device.CPU - device = device_cls( - communicator=communicator, - message_filename=job.fn(f'{sim_mode}_{device_name}.log')) + device = device_cls(communicator=communicator, + message_filename=util.get_message_filename( + job, f'{sim_mode}_{device_name}.log')) run_md_sim(job, device, @@ -449,8 +448,7 @@ def md_sampling_operation(*jobs): complete_filename=f'{sim_mode}_{device_name}_complete') if communicator.rank == 0: - print(f'completed lj_union_{sim_mode}_{device_name}: ' - f'{job} in {communicator.walltime} s') + print(f'completed lj_union_{sim_mode}_{device_name}: {job}') md_sampling_jobs.append(md_sampling_operation) @@ -866,16 +864,15 @@ def sampling_operation(*jobs): elif device_name == 'cpu': device_cls = hoomd.device.CPU - device = device_cls( - communicator=communicator, - message_filename=job.fn(f'{mode}_mc_{device_name}.log')) + device = 
device_cls(communicator=communicator, + message_filename=util.get_message_filename( + job, f'{mode}_mc_{device_name}.log')) globals().get(f'run_{mode}_mc_sim')( job, device, complete_filename=f'{mode}_mc_{device_name}_complete') if communicator.rank == 0: - print(f'completed lj_union_{mode}_mc_{device_name} ' - f'{job} in {communicator.walltime} s') + print(f'completed lj_union_{mode}_mc_{device_name} {job}') mc_sampling_jobs.append(sampling_operation) @@ -1107,9 +1104,10 @@ def lj_union_compare_modes(*jobs): separate_nvt_npt=True) if quantity_name == "density": - print(f"Average npt_mc_cpu density {num_particles}:", - avg_quantity['npt_mc_cpu'], '+/-', - stderr_quantity['npt_mc_cpu']) + if 'npt_mc_cpu' in avg_quantity: + print(f"Average npt_mc_cpu density {num_particles}:", + avg_quantity['npt_mc_cpu'], '+/-', + stderr_quantity['npt_mc_cpu']) print(f"Average npt_md_cpu density {num_particles}:", avg_quantity['npt_bussi_md_cpu'], '+/-', stderr_quantity['npt_bussi_md_cpu']) @@ -1421,17 +1419,16 @@ def lj_union_nve_md_job(*jobs): elif device_name == 'cpu': device_cls = hoomd.device.CPU - device = device_cls( - communicator=communicator, - message_filename=job.fn(f'{sim_mode}_{device_name}.log')) + device = device_cls(communicator=communicator, + message_filename=util.get_message_filename( + job, f'{sim_mode}_{device_name}.log')) run_nve_md_sim(job, device, run_length=run_length, complete_filename=f'{sim_mode}_{device_name}_complete') if communicator.rank == 0: - print(f'completed lj_union_{sim_mode}_{device_name} ' - f'{job} in {communicator.walltime} s') + print(f'completed lj_union_{sim_mode}_{device_name} {job}') nve_md_sampling_jobs.append(lj_union_nve_md_job) diff --git a/hoomd_validation/patchy_particle_pressure.py b/hoomd_validation/patchy_particle_pressure.py index 3a960c6c..d8300057 100644 --- a/hoomd_validation/patchy_particle_pressure.py +++ b/hoomd_validation/patchy_particle_pressure.py @@ -149,6 +149,7 @@ def _single_patch_kern_frenkel_code(delta_rad, 
sq_well_lambda, sigma, kT, return patch_code +@Project.pre(lambda *jobs: CONFIG['enable_llvm']) @Project.post.isfile('patchy_particle_pressure_initial_state.gsd') @Project.operation( directives=dict(executable=CONFIG["executable"], @@ -189,9 +190,9 @@ def patchy_particle_pressure_create_initial_state(*jobs): position = list(itertools.product(x, repeat=3))[:num_particles] # create snapshot - device = hoomd.device.CPU( - communicator=communicator, - message_filename=job.fn('create_initial_state.log')) + device = hoomd.device.CPU(communicator=communicator, + message_filename=util.get_message_filename( + job, 'create_initial_state.log')) snap = hoomd.Snapshot(communicator) if communicator.rank == 0: @@ -599,9 +600,9 @@ def sampling_operation(*jobs): elif device_name == 'cpu': device_cls = hoomd.device.CPU - device = device_cls( - communicator=communicator, - message_filename=job.fn(f'{mode}_{device_name}.log')) + device = device_cls(communicator=communicator, + message_filename=util.get_message_filename( + job, f'{mode}_{device_name}.log')) globals().get(f'run_{mode}_sim')( job, device, complete_filename=f'{mode}_{device_name}_complete') diff --git a/hoomd_validation/simple_polygon.py b/hoomd_validation/simple_polygon.py index e768af17..af1a61e8 100644 --- a/hoomd_validation/simple_polygon.py +++ b/hoomd_validation/simple_polygon.py @@ -110,9 +110,9 @@ def simple_polygon_create_initial_state(*jobs): position_2d = list(itertools.product(x, repeat=2))[:num_particles] # create snapshot - device = hoomd.device.CPU( - communicator=communicator, - message_filename=job.fn('create_initial_state.log')) + device = hoomd.device.CPU(communicator=communicator, + message_filename=util.get_message_filename( + job, 'create_initial_state.log')) snap = hoomd.Snapshot(communicator) if communicator.rank == 0: @@ -146,8 +146,7 @@ def simple_polygon_create_initial_state(*jobs): ) if communicator.rank == 0: - print(f'completed simple_polygon_create_initial_state: ' - f'{job} in 
{communicator.walltime} s') + print(f'completed simple_polygon_create_initial_state: {job}') def make_mc_simulation(job, @@ -487,16 +486,15 @@ def sampling_operation(*jobs): elif device_name == 'cpu': device_cls = hoomd.device.CPU - device = device_cls( - communicator=communicator, - message_filename=job.fn(f'{mode}_{device_name}.log')) + device = device_cls(communicator=communicator, + message_filename=util.get_message_filename( + job, f'{mode}_{device_name}.log')) globals().get(f'run_{mode}_sim')( job, device, complete_filename=f'{mode}_{device_name}_complete') if communicator.rank == 0: - print(f'completed simple_polygon_{mode}_{device_name} ' - f'{job} in {communicator.walltime} s') + print(f'completed simple_polygon_{mode}_{device_name} {job}') sampling_jobs.append(sampling_operation) diff --git a/hoomd_validation/util.py b/hoomd_validation/util.py index 8c6676b8..f442eb7f 100644 --- a/hoomd_validation/util.py +++ b/hoomd_validation/util.py @@ -5,6 +5,7 @@ import numpy import signac +import os def true_all(*jobs, key): @@ -28,6 +29,15 @@ def get_job_filename(sim_mode, device, name, type): return f"{sim_mode}_{suffix}_{name}.{type}" +def get_message_filename(job, filename): + """Get a cluster job unique message filename.""" + cluster_id = os.environ.get('SLURM_JOB_ID', None) + if cluster_id is not None: + return job.fn(f'{cluster_id}-{filename}') + else: + return job.fn(filename) + + def run_up_to_walltime(sim, end_step, steps, walltime_stop): """Run a simulation, stopping early if a walltime limit is reached.