Showing 6 changed files with 153 additions and 15 deletions.
@@ -0,0 +1,34 @@
#!/bin/bash

#SBATCH -N 2                  # Number of nodes
#SBATCH -J PINN               # Job name
#SBATCH -L SCRATCH            # Job requires SCRATCH files
#SBATCH -A m2859_g            # Allocation
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 00:01:00
#SBATCH --gpus-per-node=4
#SBATCH --ntasks-per-gpu=1
#SBATCH --gpus 8
#SBATCH -o %j.out
#SBATCH -e %j.err

# Function to list the GPUs on a given node
function list_gpus {
    node=$1
    echo "Listing GPUs on node $node:"
    srun --nodes=1 --ntasks=1 --gpus-per-task=1 -w "$node" nvidia-smi --list-gpus
    echo ""
}

# Get the allocated nodes, expanded into individual hostnames
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")

# Loop over each node and list its GPUs
for node in $nodes; do
    list_gpus "$node"
done

# Print the GPU devices
# nvidia-smi --list-gpus
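The script above lets scontrol expand the allocation's compressed node list before looping over hosts. The next file re-implements that expansion by hand in Python; for comparison, here is a minimal sketch that instead delegates the parsing to scontrol via subprocess. This assumes scontrol is on PATH inside the job environment, and expand_nodelist is a hypothetical helper name, not part of the commit.

# Sketch: expand $SLURM_JOB_NODELIST by shelling out to scontrol instead of
# parsing the compressed form manually. Assumes scontrol is available in the
# job environment; expand_nodelist is a hypothetical helper name.
import os
import subprocess

def expand_nodelist(nodelist):
    out = subprocess.run(['scontrol', 'show', 'hostnames', nodelist],
                         capture_output=True, text=True, check=True)
    return out.stdout.split()

# e.g. expand_nodelist(os.environ['SLURM_JOB_NODELIST'])
# -> ['nid001234', 'nid001235']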
@@ -0,0 +1,69 @@
"""
To run on NERSC:
    export MASTER_ADDR=$(hostname)
"""
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def get_nodelist():
    """Expand $SLURM_JOB_NODELIST (e.g. "nid[001234-001236]") into a list of hostnames."""
    slurm_job_nodelist = os.environ.get('SLURM_JOB_NODELIST')
    nodes = []
    prefix = 'nid'
    if slurm_job_nodelist:
        # Remove the "nid" prefix and enclosing brackets, then split into individual specs
        slurm_job_nodelist = slurm_job_nodelist.strip('nid').strip('[]').split(',')

        for node_spec in slurm_job_nodelist:
            if '-' in node_spec:
                # Expand node ranges, e.g. "001234-001236" becomes
                # ["nid001234", "nid001235", "nid001236"]
                start_str, end_str = node_spec.split('-')
                start, end = int(start_str), int(end_str)
                # Preserve the zero-padding of the original spec
                width = len(start_str)
                nodes.extend([prefix + str(i).zfill(width) for i in range(start, end + 1)])
            else:
                nodes.append(prefix + node_spec.zfill(6))

    print(nodes)
    return nodes


def init_process(rank, world_size, fn, head_node, backend='nccl'):
    # The head node hosts the rendezvous used by init_process_group
    os.environ['MASTER_ADDR'] = head_node
    os.environ['MASTER_PORT'] = '29510'
    dist.init_process_group(backend=backend,
                            rank=int(os.environ['SLURM_PROCID']),
                            world_size=world_size)
    fn(rank, world_size)  # this will be the run function


def run(rank, world_size):
    # Set the GPU device for this rank
    device = torch.device(f'cuda:{rank}')
    x = torch.Tensor([1]).to(device)
    rank_confirm = dist.get_rank()
    print(f"Hello from process {rank}! Confirming rank {rank_confirm}. "
          f"Running on GPU: {device}. Tensor {x}")


def main():
    # Get the total number of processes
    world_size = 4  # int(os.environ['SLURM_NTASKS'])

    # Initialize the distributed environment
    node_list = get_nodelist()

    # Spawn the processes
    processes = []
    mp.set_start_method("spawn")
    for rank in range(world_size):
        p = mp.Process(target=init_process,
                       args=(rank, world_size, run, node_list[0]))
        p.start()
        processes.append(p)

    # Wait for all processes to finish
    for p in processes:
        p.join()


if __name__ == '__main__':
    main()
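The spawn-and-init pattern above can be sanity-checked without a SLURM allocation by rendezvousing on localhost and having each process report its rank. A minimal sketch follows, using the gloo backend so it runs on a login node without GPUs; the port is an arbitrary, assumed-free choice and not taken from the commit.

# Minimal single-node check of the spawn / init_process_group pattern.
# gloo backend so no GPUs are needed; port 29511 is an assumed-free choice.
import os
import torch.distributed as dist
import torch.multiprocessing as mp

def worker(rank, world_size):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29511'
    dist.init_process_group('gloo', rank=rank, world_size=world_size)
    print(f"rank {dist.get_rank()} of {dist.get_world_size()} is up")
    dist.destroy_process_group()

if __name__ == '__main__':
    world_size = 4
    mp.spawn(worker, args=(world_size,), nprocs=world_size)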
@@ -0,0 +1,22 @@
#!/bin/bash

#SBATCH -N 2                  # Number of nodes
#SBATCH -J PINN               # Job name
#SBATCH -L SCRATCH            # Job requires SCRATCH files
#SBATCH -A m2859_g            # Allocation
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 00:01:00
#SBATCH --gpus-per-node=4
#SBATCH --ntasks-per-gpu=1
#SBATCH --gpus 8
#SBATCH -o %j.out
#SBATCH -e %j.err

export MASTER_ADDR=$(hostname)

echo "jobstart $(date)"; pwd

srun -n 2 python $SCRATCH/PINN/test_dist.py

echo "jobend $(date)"; pwd