debugging distributed computing
vganapati committed Jun 16, 2023
1 parent 67090af commit 7deaedf
Showing 6 changed files with 153 additions and 15 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -174,4 +174,18 @@ https://theaisummer.com/distributed-training-pytorch/
https://towardsdatascience.com/distribute-your-pytorch-model-in-less-than-20-lines-of-code-61a786e6e7b0


Physics-informed neural networks for inverse problems in nano-optics and metamaterials
Yuyao Chen, Lu Lu, George Em Karniadakis, and Luca Dal Negro
https://opg.optica.org/oe/fulltext.cfm?uri=oe-28-8-11618&id=429761

Physics-informed neural networks with hard constraints for inverse design
Lu Lu, Raphael Pestourie, Wenjie Yao, Zhicheng Wang, Francesc Verdugo, Steven G. Johnson
https://epubs.siam.org/doi/10.1137/21M1397908

Efficient inversion of multiple-scattering model for optical diffraction tomography
Emmanuel Soubies, Thanh-An Pham, and Michael Unser
https://opg.optica.org/oe/fulltext.cfm?uri=oe-25-18-21786&id=371123

Lorenz–Mie theory for 2D scattering and resonance calculations
Denis Gagnon and Louis J Dubé
https://iopscience.iop.org/article/10.1088/2040-8978/17/10/103501
34 changes: 34 additions & 0 deletions list_gpus.sh
@@ -0,0 +1,34 @@
#!/bin/bash

#SBATCH -N 2 # Number of nodes
#SBATCH -J PINN # job name
#SBATCH -L SCRATCH # job requires SCRATCH files
#SBATCH -A m2859_g # allocation
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 00:01:00
#SBATCH --gpus-per-node=4
#SBATCH --ntasks-per-gpu=1
#SBATCH --gpus 8
#SBATCH -o %j.out
#SBATCH -e %j.err


# Function to list GPUs on each node
function list_gpus {
    node=$1
    echo "Listing GPUs on node $node:"
    srun --nodes=1 --ntasks=1 --gpus-per-task=1 -w $node nvidia-smi --list-gpus
    echo ""
}

# Get the allocated nodes
nodes=$(scontrol show hostname $SLURM_JOB_NODELIST)

# Loop over each node and list GPUs
for node in $nodes; do
    list_gpus $node
done

# Print the GPU devices
# nvidia-smi --list-gpus
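
As a cross-check on the GPU inventory, the same information can be collected from inside Python: each srun task reports the node it landed on and the GPUs it can see. A minimal sketch, assuming PyTorch is installed in the job environment and the script (hypothetically check_gpus.py, not part of this commit) is launched inside the same allocation with something like srun -n 8 python check_gpus.py:

# check_gpus.py: hypothetical helper, not part of this commit.
# Each srun task prints its node and the GPUs visible to it.
import os
import torch

def report_visible_gpus():
    proc_id = os.environ.get('SLURM_PROCID', '0')        # global task index set by srun
    node = os.environ.get('SLURMD_NODENAME', 'unknown')  # node this task landed on
    n_gpus = torch.cuda.device_count()
    names = [torch.cuda.get_device_name(i) for i in range(n_gpus)]
    print(f"task {proc_id} on {node}: {n_gpus} visible GPU(s): {names}")

if __name__ == '__main__':
    report_visible_gpus()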
23 changes: 11 additions & 12 deletions main.py
@@ -38,7 +38,7 @@ def get_args():
parser = argparse.ArgumentParser(description='Get command line args')

parser.add_argument('--bs', type=int, action='store', dest='batch_size',
- help='batch size', default = 1600)
+ help='batch size', default = 8192)
parser.add_argument('--nb', type=int, action='store', dest='num_basis',
help='number of basis functions, N in pde-cl paper', default = 200)
parser.add_argument('--upc', action='store_true', dest='use_pde_cl',
@@ -96,10 +96,11 @@ def setup(rank, world_size, fn, args, backend='nccl'):
os.environ['MASTER_PORT'] = '29500'

# Get the SLURM_PROCID for the current process
- proc_id = int(os.environ['SLURM_PROCID'])
+ # proc_id = int(os.environ['SLURM_PROCID'])

+ # print("Hello from " + str(proc_id))
+ # print(get_rank(args.use_dist))

- print("Hello from " + str(proc_id))
- print(get_rank(args.use_dist))
# initialize the process group
dist.init_process_group(backend, rank=rank, world_size=world_size)
fn(rank,world_size, args) # this will be the run function
@@ -185,13 +186,9 @@ def partition_dataset(args, world_size):
def run(rank, world_size, args,
dtype = torch.float,
):

if args.use_dist:
print("Running on rank " + str(rank) + ".")

proc_id = int(os.environ['SLURM_PROCID'])

print("Hello from " + str(proc_id))
print(get_rank(args.use_dist))
print("Running on rank " + str(rank) + ". Running on rank " + str(get_rank(args.use_dist)))

train_set, train_set_2, test_set, bsz = partition_dataset(args, world_size)

@@ -206,8 +203,10 @@

if args.use_dist:
# device = rank #{'cuda:%d' % 0: 'cuda:%d' % rank}
- device = torch.device(rank)
- model.to(rank)
+ # device = torch.device(rank)

+ device = torch.device(f'cuda:{rank}')
+ model.to(device)
#ddp_model = DDP(model, device_ids=[rank])
else:
device = get_device(args)
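
The new mapping device = torch.device(f'cuda:{rank}') is valid as long as the global rank doubles as the GPU index on the node (single-node runs, or runs where each task sees only one GPU). A common way to keep multi-node jobs working is to take the global rank from SLURM_PROCID, the local GPU index from SLURM_LOCALID, and the world size from SLURM_NTASKS. A minimal sketch, not the code in this commit, assuming one srun task per GPU and that MASTER_ADDR and MASTER_PORT are already set in the environment (slurm_train.sh exports MASTER_ADDR; setup() sets MASTER_PORT):

# slurm_ddp_setup.py: hypothetical sketch of SLURM-driven DDP initialization.
import os
import torch
import torch.distributed as dist

def init_from_slurm(backend='nccl'):
    rank = int(os.environ['SLURM_PROCID'])         # global rank across all nodes
    local_rank = int(os.environ['SLURM_LOCALID'])  # task index within this node
    world_size = int(os.environ['SLURM_NTASKS'])   # total number of tasks

    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)

    device = torch.device(f'cuda:{local_rank}')    # index GPUs per node, not globally
    torch.cuda.set_device(device)
    return rank, world_size, device

With that mapping, model.to(device) and DDP(model, device_ids=[local_rank]) would address the correct GPU regardless of which node the rank lands on.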
6 changes: 3 additions & 3 deletions slurm_train.sh
@@ -6,20 +6,20 @@
#SBATCH -A m2859_g # allocation
#SBATCH -C gpu
#SBATCH -q regular
- #SBATCH -t 00:05:00
+ #SBATCH -t 04:00:00
#SBATCH --gpus-per-node=4
#SBATCH --ntasks-per-gpu=1
#SBATCH --gpus 4
#SBATCH -o %j.out
#SBATCH -e %j.err

export MASTER_ADDR=$(hostname)
- export BATCH_SIZE=$1
+ export BATCH_SIZE=8192
export SCRATCH_FOLDER=$SCRATCH/output_PINN/$SLURM_JOB_ID
mkdir -p $SCRATCH_FOLDER; cd $SCRATCH_FOLDER

echo "jobstart $(date)";pwd

- srun -n 4 -c 32 python $SCRATCH/PINN/main.py --upc --2d --dist --bs $BATCH_SIZE --epochs 5
+ python $SCRATCH/PINN/main.py --upc --2d --dist --bs $BATCH_SIZE --epochs 500

echo "jobend $(date)";pwd
69 changes: 69 additions & 0 deletions test_dist.py
@@ -0,0 +1,69 @@
"""
To run on NERSC:
export MASTER_ADDR=$(hostname)
"""
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import os

def get_nodelist():
slurm_job_nodelist = os.environ.get('SLURM_JOB_NODELIST')
nodes = []
prefix='nid'
if slurm_job_nodelist:
# Remove any enclosing brackets and split into individual nodes
slurm_job_nodelist = slurm_job_nodelist.strip('nid').strip('[]').split(',')

for node_spec in slurm_job_nodelist:
if '-' in node_spec:
# Expand node ranges, e.g., "001-003" becomes ["nid001", "nid002", "nid003"]
node_range = node_spec.split('-')
start = int(node_range[0])
end = int(node_range[1])
nodes.extend([prefix+str(i) for i in range(start, end + 1)])
else:
nodes.append(prefix+str(node_spec.zfill(6)))

print(nodes)
return nodes

def init_process(rank, world_size, fn, head_node, backend='nccl'):
os.environ['MASTER_ADDR'] = head_node
os.environ['MASTER_PORT'] = '29510'
dist.init_process_group(backend=backend,
rank=int(os.environ['SLURM_PROCID']),
world_size=world_size)
fn(rank,world_size) # this will be the run function

def run(rank,world_size):
# Set the GPU device for this rank
device = torch.device(f'cuda:{rank}')
x = torch.Tensor([1]).to(device)
rank_confirm = dist.get_rank()
print(f"Hello from process {rank}! Confirming rank {rank_confirm}. Running on GPU: {device}. Tensor {x}")

def main():
# Get the total number of processes
world_size = 4 #int(os.environ['SLURM_NTASKS'])

"""Initialize the distributed environment"""
node_list = get_nodelist()


# Spawn the processes
processes = []
mp.set_start_method("spawn")
for rank in range(world_size):
p = mp.Process(target=init_process,
args=(rank, world_size, run, node_list[0]))
p.start()
processes.append(p)


# Wait for all processes to finish
for p in processes:
p.join()

if __name__ == '__main__':
main()
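
Hand-parsing SLURM_JOB_NODELIST, as get_nodelist() does above, is easy to trip up on bracketed ranges such as nid[001234-001237,002001]. An alternative is to let scontrol expand the hostlist, since it already understands that syntax. A sketch of a drop-in replacement, assuming scontrol is on the PATH inside the job, as it normally is on NERSC systems:

# Hypothetical alternative to get_nodelist(): expand the hostlist with scontrol.
import os
import subprocess

def get_nodelist_scontrol():
    hostlist = os.environ.get('SLURM_JOB_NODELIST', '')
    if not hostlist:
        return []
    result = subprocess.run(['scontrol', 'show', 'hostnames', hostlist],
                            capture_output=True, text=True, check=True)
    return result.stdout.split()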
22 changes: 22 additions & 0 deletions test_dist.sh
@@ -0,0 +1,22 @@
#!/bin/bash

#SBATCH -N 2 # Number of nodes
#SBATCH -J PINN # job name
#SBATCH -L SCRATCH # job requires SCRATCH files
#SBATCH -A m2859_g # allocation
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 00:01:00
#SBATCH --gpus-per-node=4
#SBATCH --ntasks-per-gpu=1
#SBATCH --gpus 8
#SBATCH -o %j.out
#SBATCH -e %j.err

export MASTER_ADDR=$(hostname)

echo "jobstart $(date)";pwd

srun -n 2 python $SCRATCH/PINN/test_dist.py

echo "jobend $(date)";pwd
