From 7deaedf4fff42a101d8507826df67a2cbd731ca4 Mon Sep 17 00:00:00 2001
From: Vidya Ganapati
Date: Fri, 16 Jun 2023 11:15:45 -0700
Subject: [PATCH] debugging distributed computing

---
 README.md      | 14 ++++++++++
 list_gpus.sh   | 34 +++++++++++++++++++++++++
 main.py        | 23 ++++++++---------
 slurm_train.sh |  6 ++---
 test_dist.py   | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++
 test_dist.sh   | 22 ++++++++++++++++
 6 files changed, 153 insertions(+), 15 deletions(-)
 create mode 100644 list_gpus.sh
 create mode 100644 test_dist.py
 create mode 100644 test_dist.sh

diff --git a/README.md b/README.md
index e1e8bf5..f448b60 100644
--- a/README.md
+++ b/README.md
@@ -174,4 +174,18 @@
 https://theaisummer.com/distributed-training-pytorch/
 
 https://towardsdatascience.com/distribute-your-pytorch-model-in-less-than-20-lines-of-code-61a786e6e7b0
 
+Physics-informed neural networks for inverse problems in nano-optics and metamaterials
+Yuyao Chen, Lu Lu, George Em Karniadakis, and Luca Dal Negro
+https://opg.optica.org/oe/fulltext.cfm?uri=oe-28-8-11618&id=429761
+Physics-informed neural networks with hard constraints for inverse design
+Lu Lu, Raphael Pestourie, Wenjie Yao, Zhicheng Wang, Francesc Verdugo, Steven G. Johnson
+https://epubs.siam.org/doi/10.1137/21M1397908
+
+Efficient inversion of multiple-scattering model for optical diffraction tomography
+Emmanuel Soubies, Thanh-An Pham, and Michael Unser
+https://opg.optica.org/oe/fulltext.cfm?uri=oe-25-18-21786&id=371123
+
+Lorenz–Mie theory for 2D scattering and resonance calculations
+Denis Gagnon and Louis J Dubé
+https://iopscience.iop.org/article/10.1088/2040-8978/17/10/103501
\ No newline at end of file
diff --git a/list_gpus.sh b/list_gpus.sh
new file mode 100644
index 0000000..bce673f
--- /dev/null
+++ b/list_gpus.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+#SBATCH -N 2 # Number of nodes
+#SBATCH -J PINN # job name
+#SBATCH -L SCRATCH # job requires SCRATCH files
+#SBATCH -A m2859_g # allocation
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 00:01:00
+#SBATCH --gpus-per-node=4
+#SBATCH --ntasks-per-gpu=1
+#SBATCH --gpus 8
+#SBATCH -o %j.out
+#SBATCH -e %j.err
+
+
+# Function to list GPUs on each node
+function list_gpus {
+    node=$1
+    echo "Listing GPUs on node $node:"
+    srun --nodes=1 --ntasks=1 --gpus-per-task=1 -w $node nvidia-smi --list-gpus
+    echo ""
+}
+
+# Get the allocated nodes
+nodes=$(scontrol show hostname $SLURM_JOB_NODELIST)
+
+# Loop over each node and list GPUs
+for node in $nodes; do
+    list_gpus $node
+done
+
+# Print the GPU devices
+# nvidia-smi --list-gpus
diff --git a/main.py b/main.py
index 09cb902..4c6732a 100644
--- a/main.py
+++ b/main.py
@@ -38,7 +38,7 @@ def get_args():
     parser = argparse.ArgumentParser(description='Get command line args')
 
     parser.add_argument('--bs', type=int, action='store', dest='batch_size',
-                        help='batch size', default = 1600)
+                        help='batch size', default = 8192)
     parser.add_argument('--nb', type=int, action='store', dest='num_basis',
                         help='number of basis functions, N in pde-cl paper', default = 200)
     parser.add_argument('--upc', action='store_true', dest='use_pde_cl',
@@ -96,10 +96,11 @@ def setup(rank, world_size, fn, args, backend='nccl'):
     os.environ['MASTER_PORT'] = '29500'
 
     # Get the SLURM_PROCID for the current process
-    proc_id = int(os.environ['SLURM_PROCID'])
+    # proc_id = int(os.environ['SLURM_PROCID'])
+
+    # print("Hello from " + str(proc_id))
+    # print(get_rank(args.use_dist))
 
-    print("Hello from " + str(proc_id))
-    print(get_rank(args.use_dist))
     # initialize the process group
     dist.init_process_group(backend, rank=rank, world_size=world_size)
     fn(rank,world_size, args) # this will be the run function
@@ -185,13 +186,9 @@ def partition_dataset(args, world_size):
 def run(rank, world_size, args,
         dtype = torch.float,
        ):
+
     if args.use_dist:
-        print("Running on rank " + str(rank) + ".")
-
-        proc_id = int(os.environ['SLURM_PROCID'])
-
-        print("Hello from " + str(proc_id))
-        print(get_rank(args.use_dist))
+        print("Running on rank " + str(rank) + ". Running on rank " + str(get_rank(args.use_dist)))
 
     train_set, train_set_2, test_set, bsz = partition_dataset(args, world_size)
 
@@ -206,8 +203,10 @@ def run(rank, world_size, args,
 
     if args.use_dist:
         # device = rank #{'cuda:%d' % 0: 'cuda:%d' % rank}
-        device = torch.device(rank)
-        model.to(rank)
+        # device = torch.device(rank)
+
+        device = torch.device(f'cuda:{rank}')
+        model.to(device)
         #ddp_model = DDP(model, device_ids=[rank])
     else:
         device = get_device(args)
diff --git a/slurm_train.sh b/slurm_train.sh
index 971b06b..050bfcf 100644
--- a/slurm_train.sh
+++ b/slurm_train.sh
@@ -6,7 +6,7 @@
 #SBATCH -A m2859_g # allocation
 #SBATCH -C gpu
 #SBATCH -q regular
-#SBATCH -t 00:05:00
+#SBATCH -t 04:00:00
 #SBATCH --gpus-per-node=4
 #SBATCH --ntasks-per-gpu=1
 #SBATCH --gpus 4
@@ -14,12 +14,12 @@
 #SBATCH -e %j.err
 
 export MASTER_ADDR=$(hostname)
-export BATCH_SIZE=$1
+export BATCH_SIZE=8192
 export SCRATCH_FOLDER=$SCRATCH/output_PINN/$SLURM_JOB_ID
 mkdir -p $SCRATCH_FOLDER; cd $SCRATCH_FOLDER
 
 echo "jobstart $(date)";pwd
 
-srun -n 4 -c 32 python $SCRATCH/PINN/main.py --upc --2d --dist --bs $BATCH_SIZE --epochs 5
+python $SCRATCH/PINN/main.py --upc --2d --dist --bs $BATCH_SIZE --epochs 500
 
 echo "jobend $(date)";pwd
\ No newline at end of file
diff --git a/test_dist.py b/test_dist.py
new file mode 100644
index 0000000..0c24a0f
--- /dev/null
+++ b/test_dist.py
@@ -0,0 +1,69 @@
+"""
+To run on NERSC:
+export MASTER_ADDR=$(hostname)
+"""
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import os
+
+def get_nodelist():
+    slurm_job_nodelist = os.environ.get('SLURM_JOB_NODELIST')
+    nodes = []
+    prefix='nid'
+    if slurm_job_nodelist:
+        # Remove any enclosing brackets and split into individual nodes
+        slurm_job_nodelist = slurm_job_nodelist.strip('nid').strip('[]').split(',')
+
+        for node_spec in slurm_job_nodelist:
+            if '-' in node_spec:
+                # Expand node ranges, e.g., "001-003" becomes ["nid001", "nid002", "nid003"]
+                node_range = node_spec.split('-')
+                start = int(node_range[0])
+                end = int(node_range[1])
+                nodes.extend([prefix+str(i) for i in range(start, end + 1)])
+            else:
+                nodes.append(prefix+str(node_spec.zfill(6)))
+
+    print(nodes)
+    return nodes
+
+def init_process(rank, world_size, fn, head_node, backend='nccl'):
+    os.environ['MASTER_ADDR'] = head_node
+    os.environ['MASTER_PORT'] = '29510'
+    dist.init_process_group(backend=backend,
+                            rank=int(os.environ['SLURM_PROCID']),
+                            world_size=world_size)
+    fn(rank,world_size) # this will be the run function
+
+def run(rank,world_size):
+    # Set the GPU device for this rank
+    device = torch.device(f'cuda:{rank}')
+    x = torch.Tensor([1]).to(device)
+    rank_confirm = dist.get_rank()
+    print(f"Hello from process {rank}! Confirming rank {rank_confirm}. Running on GPU: {device}. Tensor {x}")
+
+def main():
+    # Get the total number of processes
+    world_size = 4 #int(os.environ['SLURM_NTASKS'])
+
+    """Initialize the distributed environment"""
+    node_list = get_nodelist()
+
+
+    # Spawn the processes
+    processes = []
+    mp.set_start_method("spawn")
+    for rank in range(world_size):
+        p = mp.Process(target=init_process,
+                       args=(rank, world_size, run, node_list[0]))
+        p.start()
+        processes.append(p)
+
+
+    # Wait for all processes to finish
+    for p in processes:
+        p.join()
+
+if __name__ == '__main__':
+    main()
diff --git a/test_dist.sh b/test_dist.sh
new file mode 100644
index 0000000..6ba0bcd
--- /dev/null
+++ b/test_dist.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+#SBATCH -N 2 # Number of nodes
+#SBATCH -J PINN # job name
+#SBATCH -L SCRATCH # job requires SCRATCH files
+#SBATCH -A m2859_g # allocation
+#SBATCH -C gpu
+#SBATCH -q regular
+#SBATCH -t 00:01:00
+#SBATCH --gpus-per-node=4
+#SBATCH --ntasks-per-gpu=1
+#SBATCH --gpus 8
+#SBATCH -o %j.out
+#SBATCH -e %j.err
+
+export MASTER_ADDR=$(hostname)
+
+echo "jobstart $(date)";pwd
+
+srun -n 2 python $SCRATCH/PINN/test_dist.py
+
+echo "jobend $(date)";pwd
\ No newline at end of file