Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

radiation/cloud ML: workflow #2235

Draft
wants to merge 17 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions projects/cloud_ml/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Run the Python script named after this target from inside scripts/.
# $@ expands to the target name, so the command is
# `python ./surface_reference_zarr.py`.
surface_reference_zarr:
	cd scripts && python ./$@.py
.PHONY: surface_reference_zarr

# Apply the argo resources (deploy), then call run.sh from argo/prognostic-run.
# The run.sh argument is the target name without its `_run` suffix and with
# underscores turned into hyphens, i.e. `nudge-to-fine-baseline`.
nudge_to_fine_baseline_run: deploy
	cd argo/prognostic-run && ./run.sh $(subst _,-,$(@:_run=))
.PHONY: nudge_to_fine_baseline_run

# Apply the argo resources (deploy), then call extend.sh from
# argo/prognostic-run.
#
# Usage: make extend_prognostic_run URL=<url> SEGMENTS=<n>
#   URL       first argument handed to extend.sh (required)
#   SEGMENTS  second argument handed to extend.sh
#
# Without the guard below an unset URL would let SEGMENTS slide into the first
# positional slot (or call extend.sh with no arguments at all), so fail with a
# usage message instead.
# NOTE(review): extend.sh is not visible here; confirm it has no no-argument mode.
.PHONY: extend_prognostic_run
extend_prognostic_run: deploy
	@: $(if $(strip $(URL)),,$(error URL is required, usage: make $@ URL=<url> SEGMENTS=<n>))
	cd argo/prognostic-run && ./extend.sh $(URL) $(SEGMENTS)

# Run the Python script named after this target from inside scripts/, passing
# training_data_config.yaml as its only argument.
# $@ expands to the target name, so the script is ./training_data_zarr.py.
training_data_zarr:
	cd scripts && python ./$@.py training_data_config.yaml
.PHONY: training_data_zarr

# Prescribed-cloud prognostic runs.
#
# Every target listed here depends on deploy and then calls run.sh from
# argo/prognostic-run. The run.sh argument is the target name without its
# `_run` suffix and with underscores turned into hyphens, e.g.
#   prescribed_cloud_cc_decorr_run -> ./run.sh prescribed-cloud-cc-decorr
PRESCRIBED_CLOUD_RUN_TARGETS := \
  prescribed_cloud_cc_decorr_run \
  prescribed_cloud_cc_max_random_run \
  prescribed_cloud_cc_random_run \
  prescribed_cloud_decorr_run \
  prescribed_cloud_max_random_run \
  prescribed_cloud_random_run

.PHONY: $(PRESCRIBED_CLOUD_RUN_TARGETS)
# Static pattern rule: $* is the part of the target name matched by %.
$(PRESCRIBED_CLOUD_RUN_TARGETS): %_run: deploy
	cd argo/prognostic-run && ./run.sh $(subst _,-,$*)

# Apply the argo resources (deploy), then call run_random_seed.sh from
# argo/training with the arguments `dense` and `4`.
train_cloud_ml_dense_seed: deploy
	cd argo/training && ./run_random_seed.sh dense 4
.PHONY: train_cloud_ml_dense_seed

# Call upload.sh from scripts/upload-squashed-models with the arguments `4`
# and the train-cloud-ml-dense GCS location. No deploy step is required.
upload_squashed_models:
	cd scripts/upload-squashed-models && ./upload.sh 4 gs://vcm-ml-experiments/cloud-ml/2023-06-07/train-cloud-ml-dense
.PHONY: upload_squashed_models

# Apply the argo resources (deploy), then call run_seed_squash_threshold.sh
# from argo/prognostic-run with `prescribed-cloud-dense`, `4` and `39`.
prescribed_cloud_dense_seed_squash_threshold: deploy
	cd argo/prognostic-run && ./run_seed_squash_threshold.sh prescribed-cloud-dense 4 39
.PHONY: prescribed_cloud_dense_seed_squash_threshold

# Run scripts/offline_predictions_zarr.py from inside scripts/ with
# offline_predictions_config.yaml as its only argument.
offline_cloud_predictions_zarr:
	cd scripts && python ./offline_predictions_zarr.py offline_predictions_config.yaml
.PHONY: offline_cloud_predictions_zarr

# Run scripts/offline_predictions_zarr.py from inside scripts/ with
# offline_predictions_squashed_config.yaml as its only argument.
offline_cloud_predictions_squashed_zarr:
	cd scripts && python ./offline_predictions_zarr.py offline_predictions_squashed_config.yaml
.PHONY: offline_cloud_predictions_squashed_zarr

# Render the kustomization in argo/ with the local ./kustomize binary and pipe
# the output to `kubectl apply`. The binary is provided by the argo/kustomize
# prerequisite.
deploy: argo/kustomize
	cd argo && ./kustomize build . | kubectl apply -f -
.PHONY: deploy

# File target with no prerequisites: make runs this recipe only when
# argo/kustomize does not exist yet. It calls install_kustomize.sh from argo/
# with the version argument 3.10.0.
# NOTE(review): presumably the script writes the binary to argo/kustomize; confirm.
argo/kustomize:
	cd argo && ./install_kustomize.sh 3.10.0
24 changes: 0 additions & 24 deletions projects/cloud_ml/argo/Makefile

This file was deleted.

51 changes: 51 additions & 0 deletions projects/cloud_ml/argo/fine-restarts-to-zarr/pod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Standalone Pod that submits the fv3net.pipelines.restarts_to_zarr job
# through dataflow.sh (runner: DataFlow).
apiVersion: v1
kind: Pod
metadata:
  name: fine-restarts-to-zarr
spec:
  # Run once; do not restart the container when it exits.
  restartPolicy: Never
  # Allow scheduling on nodes tainted dedicated=med-sim-pool:NoSchedule.
  tolerations:
    - key: dedicated
      value: med-sim-pool
      effect: NoSchedule
  volumes:
    # Backed by the "gcp-key" secret; mounted into the container below.
    - name: gcp-credentials-user-gcp-sa
      secret:
        secretName: gcp-key
  containers:
    - name: main
      image: us.gcr.io/vcm-ml/fv3net:400c970a0af572973f5c6b8e970aa261b4baf8a2
      workingDir: /home/jovyan/fv3net/workflows/dataflow
      # bash -x traces each command; -c executes the script given in args.
      command:
        - bash
        - "-x"
        - "-c"
      args:
        # One shell command split over backslash-continued lines. The job name
        # is the pod name plus a random hex suffix from openssl.
        - |
          ./dataflow.sh submit -m fv3net.pipelines.restarts_to_zarr \
          gs://vcm-ml-raw-flexible-retention/2023-05-22-PIRE-like-C3072-ccnorm-true-simulation/C3072-to-C48-restarts \
          gs://vcm-ml-intermediate/2023-05-22-PIRE-like-C3072-ccnorm-true-simulation-restarts.zarr \
          --no-coarse-suffix \
          --job_name $POD_NAME-$(openssl rand -hex 6) \
          --project vcm-ml \
          --region us-central1 \
          --runner DataFlow \
          --temp_location gs://vcm-ml-scratch/tmp_dataflow \
          --num_workers 64 \
          --autoscaling_algorithm=NONE
      env:
        # Both credential variables point at the key file from the mounted secret.
        - name: GOOGLE_APPLICATION_CREDENTIALS
          value: /secret/gcp-credentials/key.json
        - name: CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE
          value: /secret/gcp-credentials/key.json
        # Downward API: exposes this pod's metadata.name to the script above.
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
      volumeMounts:
        - name: gcp-credentials-user-gcp-sa
          mountPath: /secret/gcp-credentials
      resources:
        requests:
          cpu: "1000m"
          memory: 1Gi
        limits:
          cpu: "1000m"
          memory: 2Gi
8 changes: 5 additions & 3 deletions projects/cloud_ml/argo/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ resources:
kind: Kustomization
images:
- name: us.gcr.io/vcm-ml/fv3net
newTag: 762dfc1e945720f59f5b9e9ab68e10e82a0594a9
newTag: b9decb4be05133c569a79223b50879e2fd292bd0
- name: us.gcr.io/vcm-ml/post_process_run
newTag: 762dfc1e945720f59f5b9e9ab68e10e82a0594a9
newTag: b9decb4be05133c569a79223b50879e2fd292bd0
- name: us.gcr.io/vcm-ml/prognostic_run
newTag: 762dfc1e945720f59f5b9e9ab68e10e82a0594a9
newTag: b9decb4be05133c569a79223b50879e2fd292bd0
- name: us.gcr.io/vcm-ml/fv3fit
newTag: b9decb4be05133c569a79223b50879e2fd292bd0
13 changes: 0 additions & 13 deletions projects/cloud_ml/argo/nudge-to-fine-run/extend.sh

This file was deleted.

Loading