Skip to content

Commit

Permalink
Modifications to Helm template and values.yaml to read in a COSI secr…
Browse files Browse the repository at this point in the history
…et, lookup the secret in the cluster and extract the required details from its JSON object to populate the type, bucket, accessKey, secretKey and endpointUrl fields in master-config.yaml.

Minor modifications to MNIST pytorch example to run the CNN model and enable checkpointing!
  • Loading branch information
gauriKrishnan committed Dec 13, 2024
1 parent 577ba8b commit ff85fc9
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 27 deletions.
3 changes: 3 additions & 0 deletions examples/tutorials/mnist_pytorch/const.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# Experiment configuration hunk for the mnist_pytorch tutorial (constant hyperparameters).
# NOTE(review): nesting indentation is not visible in this view; the key grouping
# described below is inferred from key order — confirm against the real file.
name: mnist_pytorch_const
# NOTE(review): save_trial_latest presumably caps how many of the most recent
# checkpoints are kept per trial — confirm against the checkpoint_storage reference.
checkpoint_storage:
save_trial_latest: 5
# Fixed hyperparameter values for this run.
hyperparameters:
learning_rate: 1.0
n_filters1: 32
n_filters2: 64
dropout1: 0.25
dropout2: 0.5
# NOTE(review): checkpoint_freq is not read by any code visible in this hunk;
# presumably consumed by the training script — confirm its unit (batches vs. epochs).
checkpoint_freq: 100
# Searcher settings; this mapping may continue past the end of the visible hunk.
searcher:
name: single
metric: validation_loss
Expand Down
5 changes: 4 additions & 1 deletion examples/tutorials/mnist_pytorch/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
import filelock
from torchvision import datasets, transforms


import tarfile
def get_dataset(data_dir: pathlib.Path, train: bool) -> Any:
data_dir.mkdir(parents=True, exist_ok=True)
tar = tarfile.open('./data/MNIST.tar.gz', 'r:gz')
tar.extractall('./data')
tar.close()

# Use a file lock so that only one worker on each node downloads.
with filelock.FileLock(str(data_dir / "lock")):
Expand Down
Binary file not shown.
37 changes: 36 additions & 1 deletion helm/charts/determined/templates/master-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,41 @@ stringData:
color: {{ .Values.logColor | default true }}
checkpoint_storage:
{{- if not (empty .Values.checkpointStorage.cosiSecretName) }}
{{- /*
COSI-backed checkpoint storage.

Reads the Secret named by checkpointStorage.cosiSecretName from
checkpointStorage.secretNamespace, decodes its base64 `BucketInfo` entry as JSON
and maps the fields under `spec` onto the checkpoint_storage settings:

  spec.bucketName                -> bucket
  spec.protocols[0]              -> type
  spec.secretS3.accessKeyID      -> access_key   (emitted only when present)
  spec.secretS3.accessSecretKey  -> secret_key   (emitted only when present)
  spec.secretS3.endpoint         -> endpoint_url (emitted only when present)

Rendering fails with an explicit message when the namespace is unset, the Secret
cannot be found, or the Secret carries no BucketInfo / bucketName / protocols.

NOTE(review): per the Helm documentation `lookup` returns an empty map when no
cluster is contacted (`helm template`, client-side dry runs), so this branch
fails in those modes — confirm that is acceptable for this chart.
*/}}
{{- if empty .Values.checkpointStorage.secretNamespace }}
{{- fail "The namespace of the COSI secret must be specified." }}
{{- end }}
{{- $secret := (lookup "v1" "Secret" .Values.checkpointStorage.secretNamespace .Values.checkpointStorage.cosiSecretName) }}
{{- if empty $secret }}
{{- fail "Failed to find COSI secret." }}
{{- end }}
{{- /* Guard each level so a malformed Secret yields a clear error instead of a nil dereference. */}}
{{- $data := $secret.data | default dict }}
{{- $raw := required "The COSI secret does not contain a BucketInfo entry." (get $data "BucketInfo") }}
{{- $info := $raw | b64dec | fromJson }}
{{- $spec := $info.spec | default dict }}
{{- $s3 := $spec.secretS3 | default dict }}
{{- $protocols := $spec.protocols | default list }}
{{- /* Every value is quoted so bucket names or keys that look like numbers/booleans stay strings. */}}
bucket: {{ required "COSI BucketInfo is missing spec.bucketName." $spec.bucketName | quote }}
{{- /* NOTE(review): lower-cased on the assumption that the master expects lower-case storage types (e.g. "s3") — confirm. */}}
type: {{ required "COSI BucketInfo is missing spec.protocols." (first $protocols) | lower | quote }}
{{- with $s3.accessKeyID }}
access_key: {{ . | quote }}
{{- end }}
{{- with $s3.accessSecretKey }}
secret_key: {{ . | quote }}
{{- end }}
{{- with $s3.endpoint }}
endpoint_url: {{ . | quote }}
{{- end }}
{{- else }}
type: {{ required "A valid Values.checkpointStorage.type entry is required!" .Values.checkpointStorage.type | quote}}
{{- if eq .Values.checkpointStorage.type "shared_fs" }}
host_path: {{ required "A valid Values.checkpointStorage.hostPath entry is required!" .Values.checkpointStorage.hostPath | quote }}
Expand Down Expand Up @@ -378,7 +413,7 @@ stringData:
schedule: {{ .Values.retentionPolicy.schedule | quote }}
{{- end }}
{{- end }}
{{- if .Values.logging }}
{{- if .Values.logging.security }}
{{- if .Values.logging.security.tls }}
Expand Down
54 changes: 29 additions & 25 deletions helm/charts/determined/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ defaultImages:
kubeScheduler: "k8s.gcr.io/scheduler-plugins/kube-scheduler:v0.18.9"

# default images for CPU and GPU environments
cpuImage: "determinedai/pytorch-ngc-dev:0736b6d"
gpuImage: "determinedai/pytorch-ngc-dev:0736b6d"
cpuImage: "determinedai/pytorch-ngc:0.35.0"
gpuImage: "determinedai/pytorch-ngc:0.35.0"
rocmImage: "determinedai/environments:rocm-5.6-pytorch-1.3-tf-2.10-rocm-mpich-0736b6d"

# Install Determined enterprise edition.
Expand All @@ -54,7 +54,7 @@ masterPort: 8080
#
# WARNING
# The first installation must run with the createNonNamespacedObjects flag set to true to ensure
# the non-namespaced objects are created.
# the non-namespaced objects are created.
# When deploying multiple namespaces within the same shared cluster, this should be set to false
# with helm overrides.
createNonNamespacedObjects: true
Expand Down Expand Up @@ -176,11 +176,11 @@ db:
# Only used for Determined DB deployment. Configures the size of the PersistentVolumeClaim for the
# Determined deployed database, as well as the CPU and memory requirements. Should be adjusted for
# scale.
storageSize: 30Gi
cpuRequest: 2
memRequest: 8Gi
# cpuLimit: 2
# memLimit: 8Gi
storageSize: 5Gi
cpuRequest: 500m
memRequest: 1Gi
cpuLimit: 1
memLimit: 2Gi

# useNodePortForDB configures whether ClusterIP or NodePort service type is used for the
# Determined deployed DB. By default ClusterIP is used.
Expand Down Expand Up @@ -212,8 +212,8 @@ db:
# snapshotSuffix:

# restoreSnapshotSuffix refers to the volume snapshot name suffix which you wish to restore the database to.
# During an upgrade, a new persistent volume & claim will be created, named `determined-db-pvc-{{ .Values.restoreSnapshotSuffix}}`,
# restoring the data from `determined-db-snapshot-{{ .Values.restoreSnapshotSuffix}}`. MUST NOT be the name
# During an upgrade, a new persistent volume & claim will be created, named `determined-db-pvc-{{ .Values.restoreSnapshotSuffix}}`,
# restoring the data from `determined-db-snapshot-{{ .Values.restoreSnapshotSuffix}}`. MUST NOT be the name
# of a persistent volume claim that already exists, e.g. the {{Release.Name}}.
# restoreSnapshotSuffix:

Expand All @@ -238,13 +238,17 @@ checkpointStorage:
# bucket: <bucket_name>
# prefix: <prefix>

# For storing in a bucket provisioned through a COSI driver.
# cosiSecretName is the Secret that the COSI driver creates when it grants access to the bucket;
# secretNamespace is the namespace that Secret lives in and is required whenever cosiSecretName
# is set. Leave both commented out unless COSI is used: when cosiSecretName is non-empty the
# chart takes the bucket, type and credentials from that Secret instead of the `type`-based
# settings in this section, and rendering fails if the Secret cannot be found.
# cosiSecretName: <cosi_secret_name>
# secretNamespace: <secret_namespace>

# For storing in S3.
# type: s3
# bucket: <bucket_name>
# accessKey: <access_key>
# secretKey: <secret_key>
# endpointUrl: <endpoint_url>
# prefix: <prefix>
# type: s3
# bucket: <bucket_name>
# accessKey: <access_key>
# secretKey: <secret_key>
# endpointUrl: <endpoint_url>

# For storing in Azure Blob Storage with a connection string.
# Do NOT use if already using Azure Blob Storage with account URL
Expand All @@ -266,21 +270,21 @@ checkpointStorage:
# Distributed tasks with sizes that are not divisible by `maxSlotsPerPod` are never scheduled. If
# you have a cluster of different size nodes (e.g., 4 and 8 GPUs per node), set `maxSlotsPerPod` to
# the greatest common divisor of all the sizes (4, in that case).
maxSlotsPerPod:
maxSlotsPerPod: 1

## For CPU-only clusters, use `slotType: cpu`, and make sure to set `slotResourceRequests` below.
# slotType: cpu
# slotResourceRequests:
slotType: cpu
slotResourceRequests:
## Number of cpu units requested for compute slots. Note: since kubernetes may schedule some
## system tasks on the nodes which take up some resources, 8-core node may not always fit
## a `cpu: 8` task container.
# cpu: 7
cpu: 1

# Memory and CPU requirements for the master instance. Should be adjusted for scale.
masterCpuRequest: 2
masterMemRequest: 8Gi
# masterCpuLimit: 2
# masterMemLimit: 8Gi
masterCpuRequest: 250m
masterMemRequest: 250Mi
masterCpuLimit: 500m
masterMemLimit: 750Mi

## Configure the task container defaults. Tasks include trials, commands, TensorBoards, notebooks,
## and shells. For all task containers, shm_size_bytes and network_mode are configurable. For
Expand Down Expand Up @@ -411,7 +415,7 @@ resourcePools:
# defaultComputeResourcePool: default

## Configure the initial user password for the cluster.
## Do not commit a real password here; supply it at install time instead, e.g.
## `--set initialUserPassword=<password>` or a values file kept out of version control.
# initialUserPassword:

# determinedMasterHost configures the hostname that tasks launched by the primary resource manager use when
# communicating with our API server. This is useful when installations span multiple Kubernetes clusters and when there
Expand Down

0 comments on commit ff85fc9

Please sign in to comment.