Create pretrain_starcoder2_1b.slurm #82

Open · wants to merge 4 commits into base branch multi-query-attention
examples/pretrain_starcoder2_1b.slurm (new file: 144 additions, 0 deletions)
@@ -0,0 +1,144 @@
#!/bin/bash
#SBATCH --job-name=1B_starcoder2
#SBATCH --nodes=32
#SBATCH --ntasks-per-node=1          # crucial - only 1 task (the distributed launcher) per node!
#SBATCH --gres=gpu:8
#SBATCH --exclusive
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/loubna/logs/starcoder2_1b/%x-%j.out
#SBATCH --qos=high
#SBATCH --array=1-3%1
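# The job array 1-3%1 queues three runs with at most one active at a time: if a run
# crashes or hits the time limit, the next array task resumes from the latest checkpoint
# (the script passes --load with the same checkpoint directory below).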

set -x -e
source /admin/home/loubna/.bashrc

# a100
export CUDA_HOME=/usr/local/cuda-11.7

export NCCL_ASYNC_ERROR_HANDLING=1

# AWS specific
export NCCL_PROTO=simple
export RDMAV_FORK_SAFE=1
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_LOG_LEVEL=1
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ens
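# The settings above route NCCL traffic over AWS EFA via libfabric (FI_PROVIDER=efa),
# disable NCCL's InfiniBand transport, and pin bootstrap sockets to the ens* interfaces.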

conda activate megatron_bigcode_a100

echo "START TIME: $(date)"

# File Path setup
SCRIPT_REPO=/fsx/loubna/bigcode_2/code/pr/Megatron-LM
pushd $SCRIPT_REPO
export CUDA_DEVICE_MAX_CONNECTIONS=1
LOG_PATH=$SCRIPT_REPO/main_log.txt

# Training setup
GPUS_PER_NODE=8
# so processes know who to talk to
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
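# 32 nodes x 8 GPUs = 256 ranks. Note that torch.distributed.run derives per-rank values
# itself at launch time, so NODE_RANK and WORLD_SIZE here are informational only.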

# File path setup
CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints # Adjust: Directory to store the checkpoints
# Starcoder2 tokenizer and data paths in /fsx/bigcode
TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json
WEIGHTS_TRAIN=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/train_data_paths.txt.tmp
WEIGHTS_VALID=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/valid_data_paths.txt.tmp

mkdir -p $CHECKPOINT_PATH/tensorboard

GPT_ARGS="\
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 2048 \
--num-attention-heads 16 \
--attention-head-type multiquery \
--init-method-std 0.02209 \
--seq-length 4096 \
--max-position-embeddings 4096 \
--use-rotary-position-embeddings \
--rotary-theta 100000 \
--attention-dropout 0.1 \
--hidden-dropout 0.1 \
--micro-batch-size 4 \
--global-batch-size 1024 \
--lr 0.0004 \
--min-lr 0.00004 \
--train-iters 500000 \
--lr-decay-iters 500000 \
--lr-decay-style cosine \
--lr-warmup-iters 2000 \
--weight-decay .1 \
--adam-beta2 .95 \
--clip-grad 1.0 \
--bf16 \
--use-flash-attn \
--fim-rate 0.5 \
--fim-split-sample \"<file_sep>\" \
--fragment-fim-rate 0.5 \
--log-interval 10 \
--save-interval 10000 \
--eval-interval 10000 \
--eval-iters 2 \
--valid-num-workers 0 \
"

TENSORBOARD_ARGS="--tensorboard-dir /fsx/bigcode/experiments/pretraining/starcoder2-1B/tensorboard"

CMD=" \
$SCRIPT_REPO/pretrain_gpt.py \
$GPT_ARGS \
--tokenizer-type TokenizerFromFile \
--tokenizer-file $TOKENIZER_FILE \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--train-weighted-split-paths-path $WEIGHTS_TRAIN \
--valid-weighted-split-paths-path $WEIGHTS_VALID \
--structured-logs \
--structured-logs-dir $CHECKPOINT_PATH/logs \
$TENSORBOARD_ARGS \
--wandb-entity-name loubnabnl \
--wandb-project-name starcoder2-1B \
"

export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"

echo $CMD

# hide duplicated errors using this hack - will be properly fixed in pt-1.12
# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json

# This is needed for torch 1.12.1, otherwise it doesn't link correctly; not sure what the issue was.
#export PATH="/usr/local/cuda-11.6/bin:$PATH"
#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH"
#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so
#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
--wait=60 \
--kill-on-bad-exit=1 \
"

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
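# Note the escaped \$SLURM_PROCID and \$SLURMD_NODENAME below: they must be expanded
# per node inside srun, not at submit time, so each node reports its own rank and name.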
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH

echo "END TIME: $(date)"
examples/pretrain_starcoder2_1b_fix_rope.slurm (new file: 146 additions, 0 deletions)
@@ -0,0 +1,146 @@
#!/bin/bash
#SBATCH --job-name=1B_test
#SBATCH --nodes=32
#SBATCH --ntasks-per-node=1          # crucial - only 1 task (the distributed launcher) per node!
#SBATCH --gres=gpu:8
#SBATCH --exclusive
#SBATCH --partition=production-cluster
#SBATCH --output=/fsx/loubna/logs/starcoder2_1b/%x-%j.out
#SBATCH --qos=high
#SBATCH --array=1-3%1

set -x -e
source /admin/home/loubna/.bashrc

# a100
export CUDA_HOME=/usr/local/cuda-11.7

export NCCL_ASYNC_ERROR_HANDLING=1

# AWS specific
export NCCL_PROTO=simple
export RDMAV_FORK_SAFE=1
export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1
export FI_PROVIDER=efa
export FI_LOG_LEVEL=1
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ens

conda activate megatron_bigcode_a100

echo "START TIME: $(date)"

# File Path setup
SCRIPT_REPO=/fsx/loubna/bigcode_2/code/pr/Megatron-LM
pushd $SCRIPT_REPO
export CUDA_DEVICE_MAX_CONNECTIONS=1
LOG_PATH=$SCRIPT_REPO/main_log.txt

# Training setup
GPUS_PER_NODE=8
# so processes know who to talk to
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
NNODES=$SLURM_NNODES
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

# File path setup
CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints_fix_rope # Adjust: Directory to store the checkpoints
# Starcoder2 tokenizer and data paths in /fsx/bigcode
TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json
WEIGHTS_TRAIN=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/train_data_paths.txt.tmp
WEIGHTS_VALID=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/valid_data_paths.txt.tmp

mkdir -p $CHECKPOINT_PATH/tensorboard

GPT_ARGS="\
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 2048 \
--num-attention-heads 16 \
--attention-head-type multiquery \
--init-method-std 0.02209 \
--seq-length 4096 \
--max-position-embeddings 4096 \
--use-rotary-position-embeddings \
--rotary-theta 100000 \
--position-embedding-type rotary \
--no-position-embedding \
--attention-dropout 0.1 \
--hidden-dropout 0.1 \
--micro-batch-size 4 \
--global-batch-size 1024 \
--lr 0.0004 \
--min-lr 0.00004 \
--train-iters 500000 \
--lr-decay-iters 500000 \
--lr-decay-style cosine \
--lr-warmup-iters 2000 \
--weight-decay .1 \
--adam-beta2 .95 \
--clip-grad 1.0 \
--bf16 \
--use-flash-attn \
--fim-rate 0.5 \
--fim-split-sample \"<file_sep>\" \
--fragment-fim-rate 0.5 \
--log-interval 10 \
--save-interval 10000 \
--eval-interval 10000 \
--eval-iters 2 \
--valid-num-workers 0 \
"

TENSORBOARD_ARGS="--tensorboard-dir /fsx/bigcode/experiments/pretraining/starcoder2-1B/tensorboard"

CMD=" \
$SCRIPT_REPO/pretrain_gpt.py \
$GPT_ARGS \
--tokenizer-type TokenizerFromFile \
--tokenizer-file $TOKENIZER_FILE \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--train-weighted-split-paths-path $WEIGHTS_TRAIN \
--valid-weighted-split-paths-path $WEIGHTS_VALID \
--structured-logs \
--structured-logs-dir $CHECKPOINT_PATH/logs \
$TENSORBOARD_ARGS \
--wandb-entity-name loubnabnl \
--wandb-project-name starcoder2-1B \
"

export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"

echo $CMD

# hide duplicated errors using this hack - will be properly fixed in pt-1.12
# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json

# This is needed for torch 1.12.1, otherwise it doesn't link correctly; not sure what the issue was.
#export PATH="/usr/local/cuda-11.6/bin:$PATH"
#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH"
#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so
#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH

# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
--wait=60 \
--kill-on-bad-exit=1 \
"

# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH

echo "END TIME: $(date)"