debugging distributed computing
vganapati committed Jun 16, 2023
1 parent 67090af commit 7deaedf
Showing 6 changed files with 153 additions and 15 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -174,4 +174,18 @@ https://theaisummer.com/distributed-training-pytorch/
https://towardsdatascience.com/distribute-your-pytorch-model-in-less-than-20-lines-of-code-61a786e6e7b0


Physics-informed neural networks for inverse problems in nano-optics and metamaterials
Yuyao Chen, Lu Lu, George Em Karniadakis, and Luca Dal Negro
https://opg.optica.org/oe/fulltext.cfm?uri=oe-28-8-11618&id=429761

Physics-informed neural networks with hard constraints for inverse design
Lu Lu, Raphael Pestourie, Wenjie Yao, Zhicheng Wang, Francesc Verdugo, Steven G. Johnson
https://epubs.siam.org/doi/10.1137/21M1397908

Efficient inversion of multiple-scattering model for optical diffraction tomography
Emmanuel Soubies, Thanh-An Pham, and Michael Unser
https://opg.optica.org/oe/fulltext.cfm?uri=oe-25-18-21786&id=371123

Lorenz–Mie theory for 2D scattering and resonance calculations
Denis Gagnon and Louis J Dubé
https://iopscience.iop.org/article/10.1088/2040-8978/17/10/103501
34 changes: 34 additions & 0 deletions list_gpus.sh
@@ -0,0 +1,34 @@
#!/bin/bash

#SBATCH -N 2 # Number of nodes
#SBATCH -J PINN # job name
#SBATCH -L SCRATCH # job requires SCRATCH files
#SBATCH -A m2859_g # allocation
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 00:01:00
#SBATCH --gpus-per-node=4
#SBATCH --ntasks-per-gpu=1
#SBATCH --gpus 8
#SBATCH -o %j.out
#SBATCH -e %j.err


# Function to list GPUs on each node
function list_gpus {
    node=$1
    echo "Listing GPUs on node $node:"
    srun --nodes=1 --ntasks=1 --gpus-per-task=1 -w $node nvidia-smi --list-gpus
    echo ""
}

# Get the allocated nodes
nodes=$(scontrol show hostname $SLURM_JOB_NODELIST)

# Loop over each node and list GPUs
for node in $nodes; do
    list_gpus $node
done

# Print the GPU devices
# nvidia-smi --list-gpus
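
As a cross-check on the GPU inventory, the same information can be collected from inside Python: each srun task reports the node it landed on and the GPUs it can see. A minimal sketch, assuming PyTorch is installed in the job environment and the script (hypothetically check_gpus.py, not part of this commit) is launched inside the same allocation with something like srun -n 8 python check_gpus.py:

# check_gpus.py: hypothetical helper, not part of this commit.
# Each srun task prints its node and the GPUs visible to it.
import os
import torch

def report_visible_gpus():
    proc_id = os.environ.get('SLURM_PROCID', '0')        # global task index set by srun
    node = os.environ.get('SLURMD_NODENAME', 'unknown')  # node this task landed on
    n_gpus = torch.cuda.device_count()
    names = [torch.cuda.get_device_name(i) for i in range(n_gpus)]
    print(f"task {proc_id} on {node}: {n_gpus} visible GPU(s): {names}")

if __name__ == '__main__':
    report_visible_gpus()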
23 changes: 11 additions & 12 deletions main.py
@@ -38,7 +38,7 @@ def get_args():
parser = argparse.ArgumentParser(description='Get command line args')

parser.add_argument('--bs', type=int, action='store', dest='batch_size',
- help='batch size', default = 1600)
+ help='batch size', default = 8192)
parser.add_argument('--nb', type=int, action='store', dest='num_basis',
help='number of basis functions, N in pde-cl paper', default = 200)
parser.add_argument('--upc', action='store_true', dest='use_pde_cl',
@@ -96,10 +96,11 @@ def setup(rank, world_size, fn, args, backend='nccl'):
os.environ['MASTER_PORT'] = '29500'

# Get the SLURM_PROCID for the current process
- proc_id = int(os.environ['SLURM_PROCID'])
+ # proc_id = int(os.environ['SLURM_PROCID'])

+ # print("Hello from " + str(proc_id))
+ # print(get_rank(args.use_dist))

- print("Hello from " + str(proc_id))
- print(get_rank(args.use_dist))
# initialize the process group
dist.init_process_group(backend, rank=rank, world_size=world_size)
fn(rank,world_size, args) # this will be the run function
@@ -185,13 +186,9 @@ def partition_dataset(args, world_size):
def run(rank, world_size, args,
dtype = torch.float,
):

if args.use_dist:
print("Running on rank " + str(rank) + ".")

proc_id = int(os.environ['SLURM_PROCID'])

print("Hello from " + str(proc_id))
print(get_rank(args.use_dist))
print("Running on rank " + str(rank) + ". Running on rank " + str(get_rank(args.use_dist)))

train_set, train_set_2, test_set, bsz = partition_dataset(args, world_size)

@@ -206,8 +203,10 @@

if args.use_dist:
# device = rank #{'cuda:%d' % 0: 'cuda:%d' % rank}
- device = torch.device(rank)
- model.to(rank)
+ # device = torch.device(rank)

+ device = torch.device(f'cuda:{rank}')
+ model.to(device)
#ddp_model = DDP(model, device_ids=[rank])
else:
device = get_device(args)
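
The new mapping device = torch.device(f'cuda:{rank}') is valid as long as the global rank doubles as the GPU index on the node (single-node runs, or runs where each task sees only one GPU). A common way to keep multi-node jobs working is to take the global rank from SLURM_PROCID, the local GPU index from SLURM_LOCALID, and the world size from SLURM_NTASKS. A minimal sketch, not the code in this commit, assuming one srun task per GPU and that MASTER_ADDR and MASTER_PORT are already set in the environment (slurm_train.sh exports MASTER_ADDR; setup() sets MASTER_PORT):

# slurm_ddp_setup.py: hypothetical sketch of SLURM-driven DDP initialization.
import os
import torch
import torch.distributed as dist

def init_from_slurm(backend='nccl'):
    rank = int(os.environ['SLURM_PROCID'])         # global rank across all nodes
    local_rank = int(os.environ['SLURM_LOCALID'])  # task index within this node
    world_size = int(os.environ['SLURM_NTASKS'])   # total number of tasks

    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)

    device = torch.device(f'cuda:{local_rank}')    # index GPUs per node, not globally
    torch.cuda.set_device(device)
    return rank, world_size, device

With that mapping, model.to(device) and DDP(model, device_ids=[local_rank]) would address the correct GPU regardless of which node the rank lands on.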
6 changes: 3 additions & 3 deletions slurm_train.sh
@@ -6,20 +6,20 @@
#SBATCH -A m2859_g # allocation
#SBATCH -C gpu
#SBATCH -q regular
- #SBATCH -t 00:05:00
+ #SBATCH -t 04:00:00
#SBATCH --gpus-per-node=4
#SBATCH --ntasks-per-gpu=1
#SBATCH --gpus 4
#SBATCH -o %j.out
#SBATCH -e %j.err

export MASTER_ADDR=$(hostname)
- export BATCH_SIZE=$1
+ export BATCH_SIZE=8192
export SCRATCH_FOLDER=$SCRATCH/output_PINN/$SLURM_JOB_ID
mkdir -p $SCRATCH_FOLDER; cd $SCRATCH_FOLDER

echo "jobstart $(date)";pwd

- srun -n 4 -c 32 python $SCRATCH/PINN/main.py --upc --2d --dist --bs $BATCH_SIZE --epochs 5
+ python $SCRATCH/PINN/main.py --upc --2d --dist --bs $BATCH_SIZE --epochs 500

echo "jobend $(date)";pwd
69 changes: 69 additions & 0 deletions test_dist.py
@@ -0,0 +1,69 @@
"""
To run on NERSC:
export MASTER_ADDR=$(hostname)
"""
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import os

def get_nodelist():
slurm_job_nodelist = os.environ.get('SLURM_JOB_NODELIST')
nodes = []
prefix='nid'
if slurm_job_nodelist:
# Remove any enclosing brackets and split into individual nodes
slurm_job_nodelist = slurm_job_nodelist.strip('nid').strip('[]').split(',')

for node_spec in slurm_job_nodelist:
if '-' in node_spec:
# Expand node ranges, e.g., "001-003" becomes ["nid001", "nid002", "nid003"]
node_range = node_spec.split('-')
start = int(node_range[0])
end = int(node_range[1])
nodes.extend([prefix+str(i) for i in range(start, end + 1)])
else:
nodes.append(prefix+str(node_spec.zfill(6)))

print(nodes)
return nodes

def init_process(rank, world_size, fn, head_node, backend='nccl'):
os.environ['MASTER_ADDR'] = head_node
os.environ['MASTER_PORT'] = '29510'
dist.init_process_group(backend=backend,
rank=int(os.environ['SLURM_PROCID']),
world_size=world_size)
fn(rank,world_size) # this will be the run function

def run(rank,world_size):
# Set the GPU device for this rank
device = torch.device(f'cuda:{rank}')
x = torch.Tensor([1]).to(device)
rank_confirm = dist.get_rank()
print(f"Hello from process {rank}! Confirming rank {rank_confirm}. Running on GPU: {device}. Tensor {x}")

def main():
# Get the total number of processes
world_size = 4 #int(os.environ['SLURM_NTASKS'])

"""Initialize the distributed environment"""
node_list = get_nodelist()


# Spawn the processes
processes = []
mp.set_start_method("spawn")
for rank in range(world_size):
p = mp.Process(target=init_process,
args=(rank, world_size, run, node_list[0]))
p.start()
processes.append(p)


# Wait for all processes to finish
for p in processes:
p.join()

if __name__ == '__main__':
main()
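
Hand-parsing SLURM_JOB_NODELIST, as get_nodelist() does above, is easy to trip up on bracketed ranges such as nid[001234-001237,002001]. An alternative is to let scontrol expand the hostlist, since it already understands that syntax. A sketch of a drop-in replacement, assuming scontrol is on the PATH inside the job, as it normally is on NERSC systems:

# Hypothetical alternative to get_nodelist(): expand the hostlist with scontrol.
import os
import subprocess

def get_nodelist_scontrol():
    hostlist = os.environ.get('SLURM_JOB_NODELIST', '')
    if not hostlist:
        return []
    result = subprocess.run(['scontrol', 'show', 'hostnames', hostlist],
                            capture_output=True, text=True, check=True)
    return result.stdout.split()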
22 changes: 22 additions & 0 deletions test_dist.sh
@@ -0,0 +1,22 @@
#!/bin/bash

#SBATCH -N 2 # Number of nodes
#SBATCH -J PINN # job name
#SBATCH -L SCRATCH # job requires SCRATCH files
#SBATCH -A m2859_g # allocation
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 00:01:00
#SBATCH --gpus-per-node=4
#SBATCH --ntasks-per-gpu=1
#SBATCH --gpus 8
#SBATCH -o %j.out
#SBATCH -e %j.err

export MASTER_ADDR=$(hostname)

echo "jobstart $(date)";pwd

srun -n 2 python $SCRATCH/PINN/test_dist.py

echo "jobend $(date)";pwd
