From 79cd9f418044aff663af5ed52d840d71692cf041 Mon Sep 17 00:00:00 2001
From: Sikan Li
Date: Fri, 28 Jun 2024 13:20:25 -0500
Subject: [PATCH] n_gpus

---
 README.md                      | 2 +-
 slurm_scripts/launch_helper.sh | 2 +-
 slurm_scripts/launch_train.sh  | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 4615a85..af351a2 100644
--- a/README.md
+++ b/README.md
@@ -288,7 +288,7 @@
 GNS can be trained in parallel on multiple nodes with multiple GPUs.
 
 ### Usage
 
 ```shell
-mpiexec.hydra -np $NNODES -ppn 1 ../slurm_scripts/launch_helper.sh $DOCKER_IMG_LOCATION
+mpiexec.hydra -np $NNODES -ppn 1 ../slurm_scripts/launch_helper.sh $DOCKER_IMG_LOCATION $n_gpu_per_node
 ```
diff --git a/slurm_scripts/launch_helper.sh b/slurm_scripts/launch_helper.sh
index cf96af2..90719c3 100755
--- a/slurm_scripts/launch_helper.sh
+++ b/slurm_scripts/launch_helper.sh
@@ -15,7 +15,7 @@
 fi
 
 PRELOAD="/opt/apps/tacc-apptainer/1.1.8/bin/apptainer exec --nv $1 "
-CMD="torchrun --nproc_per_node 4 --nnodes $NNODES --node_rank=$LOCAL_RANK --master_addr=$MAIN_RANK --master_port=1234 train.py"
+CMD="torchrun --nproc_per_node $2 --nnodes $NNODES --node_rank=$LOCAL_RANK --master_addr=$MAIN_RANK --master_port=1234 train.py"
 
 FULL_CMD="$PRELOAD $CMD"
 echo "Training command: $FULL_CMD"
diff --git a/slurm_scripts/launch_train.sh b/slurm_scripts/launch_train.sh
index 8a7a187..77dde0a 100755
--- a/slurm_scripts/launch_train.sh
+++ b/slurm_scripts/launch_train.sh
@@ -16,4 +16,5 @@
 scontrol show hostnames > $NODEFILE
 NNODES=$(< $NODEFILE wc -l)
 CONTAINER=$1
-mpiexec.hydra -np $NNODES -ppn 1 ../slurm_scripts/launch_helper.sh $CONTAINER
+n_gpu_per_node=$2
+mpiexec.hydra -np $NNODES -ppn 1 ../slurm_scripts/launch_helper.sh $CONTAINER $n_gpu_per_node
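After this patch, the per-node GPU count is threaded through as the second positional argument at every level: launch_train.sh forwards it to launch_helper.sh, which passes it to torchrun's --nproc_per_node. A minimal usage sketch, assuming the command is run from inside a Slurm allocation; the node count, container image path, and GPU count are placeholder values:

```shell
# Placeholder values: adjust the node count, image path, and
# per-node GPU count to match your allocation and hardware.
NNODES=2
DOCKER_IMG_LOCATION=/path/to/gns_image.sif
n_gpu_per_node=4

# Direct invocation, matching the updated README snippet: one helper
# process per node, each spawning $n_gpu_per_node torchrun workers.
mpiexec.hydra -np $NNODES -ppn 1 ../slurm_scripts/launch_helper.sh $DOCKER_IMG_LOCATION $n_gpu_per_node
```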