Skip to content

Commit

Permalink
Refactor code to improve performance and readability.
Browse files Browse the repository at this point in the history
  • Loading branch information
Luodian committed Nov 6, 2023
1 parent e0b0bb0 commit 65ade75
Showing 1 changed file with 65 additions and 0 deletions.
65 changes: 65 additions & 0 deletions shared_scripts/Demo_OtterMPT.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash
# Environment setup for a multi-node training launch under SLURM.
# Requires: SLURM_JOB_NODELIST set by the scheduler and `scontrol` on PATH.

# Everything below uses paths relative to the repository root (PYTHONPATH=.,
# config files, the training script), so abort if the root cannot be entered.
cd /root/of/Otter || {
  echo "error: cannot cd to /root/of/Otter" >&2
  exit 1
}

export PYTHONPATH=.

# The variables exported below are inherited by every child process.
# Query SLURM once and assign separately from `export`, so that a failing
# scontrol is detected instead of being masked by export's own exit status.
HOSTNAMES=$(scontrol show hostnames "${SLURM_JOB_NODELIST}") || {
  echo "error: scontrol could not expand SLURM_JOB_NODELIST" >&2
  exit 1
}
if [[ -z "${HOSTNAMES}" ]]; then
  echo "error: scontrol returned an empty host list" >&2
  exit 1
fi
export HOSTNAMES

# One array element per host line; the first host acts as the master and the
# element count is the number of nodes in the job.
mapfile -t slurm_nodes <<<"${HOSTNAMES}"
export MASTER_ADDR=${slurm_nodes[0]}
export MASTER_PORT=12955
export COUNT_NODE=${#slurm_nodes[@]}
# NOTE(review): presumably selects the InfiniBand transport for NCCL - confirm.
export NCCL_NET=IB

# Log the resolved topology (host list joined on a single line).
echo "HOSTNAMES = ${slurm_nodes[*]}"
echo "hostname = $(hostname)"
echo "MASTER_ADDR= ${MASTER_ADDR}"
echo "MASTER_PORT= ${MASTER_PORT}"

# Derive process/worker counts and this machine's rank within the job.
# Reads:  COUNT_NODE (number of nodes), HOSTNAMES (whitespace/newline-separated
#         list of job hosts).
# Writes: GPU, WORKERS, RUN_NAME, H, THEID (exported).
: "${COUNT_NODE:?COUNT_NODE must be set to the number of nodes in the job}"

# Eight processes and eight workers per node.
# NOTE(review): assumes 8 GPUs per node - confirm for the target cluster.
GPU=$((COUNT_NODE * 8))
WORKERS=$((COUNT_NODE * 8))

# Cap the worker count at 112.
if ((WORKERS > 112)); then
  WORKERS=112
fi

RUN_NAME="RunNamePlaceHolder"

echo "GPU=${GPU}"
echo "COUNT_NODE=${COUNT_NODE}"
# Report the computed value rather than a hard-coded number.
echo "WORKERS=${WORKERS}"
echo "Running ${RUN_NAME}"

# The rank of this machine is the zero-based position of its hostname in the
# job's host list.
H=$(hostname)
read -r -a job_nodes <<<"${HOSTNAMES//$'\n'/ }"
THEID=""
for node_idx in "${!job_nodes[@]}"; do
  if [[ "${job_nodes[node_idx]}" == "${H}" ]]; then
    THEID=${node_idx}
    break
  fi
done

# An empty rank would later be passed to the launcher as a missing argument,
# so stop here with a clear message instead.
if [[ -z "${THEID}" ]]; then
  echo "error: host '${H}' not found in HOSTNAMES (${job_nodes[*]})" >&2
  exit 1
fi
export THEID
echo "${THEID}"

# Launch the training run through `accelerate launch`.
# Reads: THEID, MASTER_ADDR, MASTER_PORT, COUNT_NODE, GPU, RUN_NAME, WORKERS.
# Fail early with a clear message if any of them is missing; an empty value
# would otherwise shift the launcher's arguments.
: "${THEID:?THEID (machine rank) must be set}"
: "${MASTER_ADDR:?MASTER_ADDR must be set}"
: "${MASTER_PORT:?MASTER_PORT must be set}"
: "${COUNT_NODE:?COUNT_NODE must be set}"
: "${GPU:?GPU (total process count) must be set}"
: "${RUN_NAME:?RUN_NAME must be set}"
: "${WORKERS:?WORKERS must be set}"

# Signals every process whose name matches "python" that this user may signal.
# NOTE(review): presumably clears leftovers from a previous run - confirm that
# no unrelated Python jobs share these nodes. A non-zero status (nothing
# matched) is expected and ignored.
pkill python

# Options for the launcher itself.
launcher_opts=(
  --config_file=./pipeline/accelerate_configs/accelerate_config_zero2.yaml
  --machine_rank "${THEID}"
  --main_process_ip "${MASTER_ADDR}"
  --main_process_port "${MASTER_PORT}"
  --num_machines="${COUNT_NODE}"
  --num_processes="${GPU}"
)

# Options forwarded to the training script.
train_opts=(
  --pretrained_model_name_or_path=adept/fuyu-8b
  --training_data_yaml=./Demo_Data.yaml
  --model_name=otter
  --instruction_format=simple
  --batch_size=8
  --gradient_accumulation_steps=2
  --num_epochs=3
  --report_to_wandb
  --wandb_entity=libo0013
  --external_save_dir=./checkpoints
  --run_name="${RUN_NAME}"
  --wandb_project=Fuyu
  --workers="${WORKERS}"
  --lr_scheduler=cosine
  --learning_rate=1e-5
  --warmup_steps_ratio=0.03
  --save_hf_model
  --max_seq_len=2048
  --logging_steps=1000
  --keep_symbols
  --save_ckpt_each_epoch
  --with_task_description
)

# Kept as the final command so the script exits with the launcher's status.
accelerate launch "${launcher_opts[@]}" \
  pipeline/train/instruction_following.py \
  "${train_opts[@]}"

0 comments on commit 65ade75

Please sign in to comment.