# Launch Llama 2 Small Fast #115
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Launches a short Llama "small fast" training run on a TPU after the Docker
# TPU images have been built, then deletes the queued TPU resource.
name: Launch Llama 2 Small Fast

on:
  # Fires whenever the image-build workflow finishes (success OR failure) on
  # the listed branches; the job-level `if` below filters on the conclusion.
  workflow_run:
    workflows: ["Build and Push Docker TPU Images"]
    types:
      - completed
    branches: [main, "experiment/*"]
  # pull_request:
  workflow_dispatch:

jobs:
  test:
    # The previous guard only inspected `github.event.pull_request`, which is
    # absent for both active triggers, so the job could never run. Now:
    #   - manual dispatch always runs;
    #   - workflow_run runs only when the upstream build succeeded;
    #   - pull_request (if re-enabled above) still runs only for same-repo
    #     branches, matching the original fork check.
    if: >-
      github.event_name == 'workflow_dispatch'
      || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success')
      || (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository)
    runs-on: ubuntu-latest
    env:
      TPU_ZONE: "us-central2-b"
      TPU_TYPE: "v4-32"

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v1
        with:
          project_id: ${{ secrets.GCP_PROJECT_ID }}

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v1
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Configure Google Cloud
        env:
          # Passed through the environment instead of being spliced into the
          # script text, so the shell never re-parses the secret's contents.
          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
        run: |
          gcloud config set project "${GCP_PROJECT_ID}"
          # Drop the trailing "-<suffix>" of the zone (us-central2-b -> us-central2).
          REGION=${TPU_ZONE%-*}
          echo "$REGION"
          gcloud auth configure-docker "${REGION}-docker.pkg.dev"

      - name: Install locally
        run: |
          python -m pip install --upgrade pip
          # ".[test]" is quoted so bash cannot treat the brackets as a glob.
          pip install -e ".[test]" "jax[cpu]==0.4.30"

      - name: Launch Small Fast TPU Train LM job
        env:
          # Step-level env vars are exported to the script and its children,
          # which is what the former `export` lines did.
          TPU_NAME: small-fast-${{ github.run_id }}
          RUN_ID: small_fast_${{ github.run_id }}
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # Write the .config file in the working directory before invoking
          # infra/launch.py. The heredoc is unquoted on purpose so the two
          # secret variables are expanded into the file.
          cat > .config <<EOF
          env:
            WANDB_API_KEY: ${WANDB_API_KEY}
            WANDB_ENTITY: stanford-mercury
            WANDB_PROJECT: levanter
            HF_TOKEN: ${HF_TOKEN}
          EOF
          # Every line of the command except the last must end in a backslash;
          # a missing one previously turned --trainer.num_train_steps into a
          # separate (failing) shell command instead of a train_lm argument.
          python infra/launch.py -e CI 1 --foreground \
            --tpu_name "${TPU_NAME}" \
            --run_id "${RUN_ID}" \
            --zone "${TPU_ZONE}" \
            --tpu_type "${TPU_TYPE}" \
            --preemptible -- \
            python -m levanter.main.train_lm \
            --config_path config/llama_small_fast.yaml \
            --trainer.checkpointer.base_path gs://levanter-checkpoints/llama-itest/ \
            --trainer.checkpointer.save_interval 10m \
            --trainer.num_train_steps 10000

      - name: Cleanup
        # always(): run even when an earlier step failed or was cancelled.
        if: ${{ always() }}
        env:
          # Must match the TPU_NAME used by the launch step.
          TPU_NAME: small-fast-${{ github.run_id }}
        run: |
          gcloud compute tpus queued-resources delete "${TPU_NAME}" --zone "${TPU_ZONE}" --quiet --force