From 50b7690487769e9f79c80d4d3374ec0637e2b01a Mon Sep 17 00:00:00 2001 From: Jerry G <124635598+jgongd@users.noreply.github.com> Date: Wed, 6 Nov 2024 10:19:01 -0500 Subject: [PATCH] chore: 0.38.0 environment images (#10197) --- .circleci/real_config.yml | 14 ++++----- .circleci/scripts/pull_image_daemonset.yaml | 2 +- .../prepare-container/custom-env.rst | 8 ++--- .../prepare-container/tensorflow-support.rst | 2 +- .../deploy/helm-config-reference.rst | 4 +-- .../deploy/master-config-reference.rst | 4 +-- .../reference/experiment-config-reference.rst | 4 +-- docs/reference/job-config-reference.rst | 4 +-- .../deploy-cluster/slurm/singularity.rst | 4 +-- docs/setup-cluster/gcp/install-gcp.rst | 4 +-- docs/setup-cluster/slurm/singularity.rst | 4 +-- .../slurm/slurm-requirements.rst | 2 +- e2e_tests/tests/config.py | 12 ++++---- .../tests/fixtures/ports-proxy/config.yaml | 2 +- .../iris_tf_keras/adaptive.yaml | 4 +-- .../computer_vision/iris_tf_keras/const.yaml | 4 +-- .../iris_tf_keras/distributed.yaml | 4 +-- examples/deepspeed/dcgan/mnist.yaml | 2 +- .../hf_image_classification/deepspeed.yaml | 2 +- .../hf_language_modeling/deepspeed.yaml | 2 +- .../0.17.6-keras/metadata.json | 4 +-- .../0.17.6-pytorch/metadata.json | 4 +-- harness/tests/fixtures/checkpoint.json | 4 +-- helm/charts/determined/values.yaml | 4 +-- master/pkg/schemas/expconf/const.go | 4 +-- schemas/test_cases/v0/experiment.yaml | 4 +-- tools/scripts/bumpenvs.yaml | 29 +++++++++---------- tools/scripts/environments-target.txt | 2 +- .../non-scalar-metrics-4078.json | 4 +-- .../responses/experiment-details/set-a.json | 8 ++--- .../old-trial-config-noop-adaptive.json | 4 +-- 31 files changed, 78 insertions(+), 81 deletions(-) diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml index 26628678a93..8a46115d37a 100644 --- a/.circleci/real_config.yml +++ b/.circleci/real_config.yml @@ -291,7 +291,7 @@ commands: - when: condition: <> steps: - - run: docker pull determinedai/pytorch-ngc-dev:0736b6d + - run: docker pull determinedai/pytorch-ngc:0.38.0 login-docker: parameters: @@ -2479,7 +2479,7 @@ jobs: test-unit-harness-gpu-tf: docker: - - image: determinedai/tensorflow-ngc-dev:0736b6d + - image: determinedai/tensorflow-ngc:0.38.0 resource_class: determined-ai/container-runner-gpu steps: - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts @@ -2506,7 +2506,7 @@ jobs: test-unit-harness-pytorch2-gpu: docker: - - image: determinedai/pytorch-ngc-dev:0736b6d + - image: determinedai/pytorch-ngc:0.38.0 resource_class: determined-ai/container-runner-gpu steps: - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts @@ -2533,7 +2533,7 @@ jobs: test-unit-harness-pytorch2-cpu: docker: - - image: determinedai/pytorch-ngc-dev:0736b6d + - image: determinedai/pytorch-ngc:0.38.0 steps: - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts - checkout @@ -2559,7 +2559,7 @@ jobs: test-unit-harness-gpu-parallel: docker: - - image: determinedai/pytorch-ngc-dev:0736b6d + - image: determinedai/pytorch-ngc:0.38.0 resource_class: determined-ai/container-runner-multi-gpu steps: - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts @@ -2586,7 +2586,7 @@ jobs: test-unit-harness-gpu-deepspeed: docker: - - image: determinedai/pytorch-ngc-dev:0736b6d + - image: determinedai/pytorch-ngc:0.38.0 resource_class: determined-ai/container-runner-gpu steps: - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts @@ -3648,7 +3648,7 @@ jobs: type: string default: "1" environment-image: - default: determinedai/pytorch-ngc-dev:0736b6d + default: determinedai/pytorch-ngc:0.38.0 type: string accel-node-taints: type: string diff --git a/.circleci/scripts/pull_image_daemonset.yaml b/.circleci/scripts/pull_image_daemonset.yaml index f7ae90448fc..ea12a3248b2 100644 --- a/.circleci/scripts/pull_image_daemonset.yaml +++ b/.circleci/scripts/pull_image_daemonset.yaml @@ -13,7 +13,7 @@ spec: spec: containers: - name: pull-docker-daemonset - image: determinedai/pytorch-ngc-dev:0736b6d + image: determinedai/pytorch-ngc:0.38.0 command: ["/bin/bash"] args: ["echo", "test"] resources: diff --git a/docs/model-dev-guide/prepare-container/custom-env.rst b/docs/model-dev-guide/prepare-container/custom-env.rst index 3056f7854ae..e0e9772cc36 100644 --- a/docs/model-dev-guide/prepare-container/custom-env.rst +++ b/docs/model-dev-guide/prepare-container/custom-env.rst @@ -114,9 +114,9 @@ Default Images - - Environment - File Name - - CPUs - - ``determinedai/pytorch-ngc-dev:0736b6d`` + - ``determinedai/pytorch-ngc:0.38.0`` - - NVIDIA GPUs - - ``determinedai/pytorch-ngc-dev:0736b6d`` + - ``determinedai/pytorch-ngc:0.38.0`` - - AMD GPUs - ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` @@ -155,7 +155,7 @@ Example Dockerfile that installs custom ``conda``-, ``pip``-, and ``apt``-based .. code:: bash # Determined Image - FROM determinedai/tensorflow-ngc-dev:0736b6d + FROM determinedai/tensorflow-ngc:0.38.0 # Custom Configuration RUN apt-get update && \ @@ -216,7 +216,7 @@ environments using :ref:`custom images `: .. code:: bash # Determined Image - FROM determinedai/pytorch-ngc-dev:0736b6d + FROM determinedai/pytorch-ngc:0.38.0 # Create a virtual environment RUN conda create -n myenv python=3.8 diff --git a/docs/model-dev-guide/prepare-container/tensorflow-support.rst b/docs/model-dev-guide/prepare-container/tensorflow-support.rst index f3db541c07f..c0d611e0285 100644 --- a/docs/model-dev-guide/prepare-container/tensorflow-support.rst +++ b/docs/model-dev-guide/prepare-container/tensorflow-support.rst @@ -20,7 +20,7 @@ Determined supports both TensorFlow 1 and 2. The version of TensorFlow used for experiment is controlled by the configured container image. Determined provides prebuilt Docker images that include TensorFlow 2+, 1.15, and 2.8, respectively: -- ``determinedai/tensorflow-ngc-dev:0736b6d`` +- ``determinedai/tensorflow-ngc:0.38.0`` - ``determinedai/environments:cuda-10.2-pytorch-1.7-tf-1.15-gpu-0.21.2`` - ``determinedai/environments:cuda-11.2-tf-2.8-gpu-0.29.1`` diff --git a/docs/reference/deploy/helm-config-reference.rst b/docs/reference/deploy/helm-config-reference.rst index 5548c9ce93f..4a68384900a 100644 --- a/docs/reference/deploy/helm-config-reference.rst +++ b/docs/reference/deploy/helm-config-reference.rst @@ -239,13 +239,13 @@ - ``cpuImage``: Sets the default Docker image for all non-GPU tasks. If a Docker image is specified in the :ref:`experiment config ` this default is overriden. - Defaults to: ``determinedai/pytorch-ngc-dev:0736b6d``. + Defaults to: ``determinedai/pytorch-ngc:0.38.0``. - ``startupHook``: An optional inline script that will be executed as part of task set up. - ``gpuImage``: Sets the default Docker image for all GPU tasks. If a Docker image is specified in the :ref:`experiment config ` this default is overriden. Defaults - to: ``determinedai/pytorch-ngc-dev:0736b6d``. + to: ``determinedai/pytorch-ngc:0.38.0``. - ``logPolicies``: Sets log policies for trials. For details, visit :ref:`log_policies `. diff --git a/docs/reference/deploy/master-config-reference.rst b/docs/reference/deploy/master-config-reference.rst index 7175f8cea69..6c7fc8ca13a 100644 --- a/docs/reference/deploy/master-config-reference.rst +++ b/docs/reference/deploy/master-config-reference.rst @@ -89,12 +89,12 @@ configure different container images for NVIDIA GPU tasks using the ``cuda`` key Determined 0.17.6), CPU tasks using ``cpu`` key, and ROCm (AMD GPU) tasks using the ``rocm`` key. Default values: -- ``determinedai/pytorch-ngc-dev:0736b6d`` for NVIDIA GPUs and for CPUs. +- ``determinedai/pytorch-ngc:0.38.0`` for NVIDIA GPUs and for CPUs. - ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` for ROCm. For TensorFlow users, we provide an image that must be referenced in the experiment configuration: -- ``determinedai/tensorflow-ngc-dev:0736b6d`` for NVIDIA GPUs and for CPUs. +- ``determinedai/tensorflow-ngc:0.38.0`` for NVIDIA GPUs and for CPUs. ``environment_variables`` ========================= diff --git a/docs/reference/experiment-config-reference.rst b/docs/reference/experiment-config-reference.rst index 6b3222b2581..cfea6224dd3 100644 --- a/docs/reference/experiment-config-reference.rst +++ b/docs/reference/experiment-config-reference.rst @@ -1294,12 +1294,12 @@ Optional. The Docker image to use when executing the workload. This image must b container images for NVIDIA GPU tasks using ``cuda`` key (``gpu`` prior to 0.17.6), CPU tasks using ``cpu`` key, and ROCm (AMD GPU) tasks using ``rocm`` key. Default values: -- ``determinedai/pytorch-ngc-dev:0736b6d`` for NVIDIA GPUs and for CPUs. +- ``determinedai/pytorch-ngc:0.38.0`` for NVIDIA GPUs and for CPUs. - ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` for ROCm. For TensorFlow users, we provide an image that must be referenced in the experiment configuration: -- ``determinedai/tensorflow-ngc-dev:0736b6d`` for NVIDIA GPUs and for CPUs. +- ``determinedai/tensorflow-ngc:0.38.0`` for NVIDIA GPUs and for CPUs. When the cluster is configured with :ref:`resource_manager.type: slurm ` and ``container_run_type: singularity``, images are executed using diff --git a/docs/reference/job-config-reference.rst b/docs/reference/job-config-reference.rst index ffd4c990cd0..bd9f191ac6b 100644 --- a/docs/reference/job-config-reference.rst +++ b/docs/reference/job-config-reference.rst @@ -45,13 +45,13 @@ The following configuration settings are supported: different container images for NVIDIA GPU tasks using ``cuda`` key (``gpu`` prior to 0.17.6), CPU tasks using ``cpu`` key, and ROCm (AMD GPU) tasks using ``rocm`` key. Default values: - - ``determinedai/pytorch-ngc-dev:0736b6d`` for NVIDIA GPUs and for CPUs. + - ``determinedai/pytorch-ngc:0.38.0`` for NVIDIA GPUs and for CPUs. - ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` for ROCm. For TensorFlow users, we provide an image that must be referenced in the experiment configuration: - - ``determinedai/tensorflow-ngc-dev:0736b6d`` for NVIDIA GPUs and for CPUs. + - ``determinedai/tensorflow-ngc:0.38.0`` for NVIDIA GPUs and for CPUs. - ``force_pull_image``: Forcibly pull the image from the Docker registry and bypass the Docker cache. Defaults to ``false``. diff --git a/docs/setup-cluster/deploy-cluster/slurm/singularity.rst b/docs/setup-cluster/deploy-cluster/slurm/singularity.rst index 4bdff8193e4..cdcadbd4b94 100644 --- a/docs/setup-cluster/deploy-cluster/slurm/singularity.rst +++ b/docs/setup-cluster/deploy-cluster/slurm/singularity.rst @@ -30,9 +30,9 @@ by default in this version of Determined are described below. - - Environment - File Name - - CPUs - - ``determinedai/pytorch-ngc-dev:0736b6d`` + - ``determinedai/pytorch-ngc:0.38.0`` - - NVIDIA GPUs - - ``determinedai/pytorch-ngc-dev:0736b6d`` + - ``determinedai/pytorch-ngc:0.38.0`` - - AMD GPUs - ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512`` diff --git a/docs/setup-cluster/gcp/install-gcp.rst b/docs/setup-cluster/gcp/install-gcp.rst index 3356df73756..71afb1f1036 100644 --- a/docs/setup-cluster/gcp/install-gcp.rst +++ b/docs/setup-cluster/gcp/install-gcp.rst @@ -406,5 +406,5 @@ This command line will spin up a cluster of up to 2 A100s in the ``us-central1-c --compute-agent-instance-type a2-highgpu-1g --gpu-num 1 \ --gpu-type nvidia-tesla-a100 \ --region us-central1 --zone us-central1-c \ - --gpu-env-image determinedai/pytorch-ngc-dev:0736b6d \ - --cpu-env-image determinedai/pytorch-ngc-dev:0736b6d + --gpu-env-image determinedai/pytorch-ngc:0.38.0 \ + --cpu-env-image determinedai/pytorch-ngc:0.38.0 diff --git a/docs/setup-cluster/slurm/singularity.rst b/docs/setup-cluster/slurm/singularity.rst index 035de2d5d19..4f893ef9848 100644 --- a/docs/setup-cluster/slurm/singularity.rst +++ b/docs/setup-cluster/slurm/singularity.rst @@ -30,9 +30,9 @@ by default in this version of Determined are described below. - - Environment - File Name - - CPUs - - ``determinedai/pytorch-ngc-dev:0736b6d`` + - ``determinedai/pytorch-ngc:0.38.0`` - - NVIDIA GPUs - - ``determinedai/pytorch-ngc-dev:0736b6d`` + - ``determinedai/pytorch-ngc:0.38.0`` - - AMD GPUs - ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512`` diff --git a/docs/setup-cluster/slurm/slurm-requirements.rst b/docs/setup-cluster/slurm/slurm-requirements.rst index a8e82de7e27..47d7c06c3fa 100644 --- a/docs/setup-cluster/slurm/slurm-requirements.rst +++ b/docs/setup-cluster/slurm/slurm-requirements.rst @@ -438,7 +438,7 @@ platform. There may be additional per-user configuration that is required. .. code:: bash - image=determinedai/pytorch-ngc-dev:0736b6d + image=determinedai/pytorch-ngc:0.38.0 cd /shared/enroot/images enroot import docker://$image enroot create /shared/enroot/images/${image//[\/:]/\+}.sqsh diff --git a/e2e_tests/tests/config.py b/e2e_tests/tests/config.py index cfac33d5fd9..f40a9345dcc 100644 --- a/e2e_tests/tests/config.py +++ b/e2e_tests/tests/config.py @@ -14,12 +14,12 @@ MAX_TRIAL_BUILD_SECS = 90 -DEFAULT_TF2_CPU_IMAGE = "determinedai/tensorflow-ngc-dev:0736b6d" -DEFAULT_TF2_GPU_IMAGE = "determinedai/tensorflow-ngc-dev:0736b6d" -DEFAULT_PT_CPU_IMAGE = "determinedai/pytorch-tensorflow-cpu-dev:0736b6d" -DEFAULT_PT_GPU_IMAGE = "determinedai/pytorch-tensorflow-cuda-dev:0736b6d" -DEFAULT_PT2_CPU_IMAGE = "determinedai/pytorch-ngc-dev:0736b6d" -DEFAULT_PT2_GPU_IMAGE = "determinedai/pytorch-ngc-dev:0736b6d" +DEFAULT_TF2_CPU_IMAGE = "determinedai/tensorflow-ngc:0.38.0" +DEFAULT_TF2_GPU_IMAGE = "determinedai/tensorflow-ngc:0.38.0" +DEFAULT_PT_CPU_IMAGE = "determinedai/pytorch-tensorflow-cpu:0.38.0" +DEFAULT_PT_GPU_IMAGE = "determinedai/pytorch-tensorflow-cuda:0.38.0" +DEFAULT_PT2_CPU_IMAGE = "determinedai/pytorch-ngc:0.38.0" +DEFAULT_PT2_GPU_IMAGE = "determinedai/pytorch-ngc:0.38.0" TF2_CPU_IMAGE = os.environ.get("TF2_CPU_IMAGE") or DEFAULT_TF2_CPU_IMAGE TF2_GPU_IMAGE = os.environ.get("TF2_GPU_IMAGE") or DEFAULT_TF2_GPU_IMAGE diff --git a/e2e_tests/tests/fixtures/ports-proxy/config.yaml b/e2e_tests/tests/fixtures/ports-proxy/config.yaml index 738df5c36b0..3aa1c81874b 100644 --- a/e2e_tests/tests/fixtures/ports-proxy/config.yaml +++ b/e2e_tests/tests/fixtures/ports-proxy/config.yaml @@ -22,7 +22,7 @@ max_restarts: 0 # Hardcode the image because the new image has a bug. TODO fix this when the image bug is fixed. environment: - image: determinedai/pytorch-tensorflow-cpu-dev:0736b6d + image: determinedai/pytorch-tensorflow-cpu:0.38.0 proxy_ports: - proxy_port: 8000 proxy_tcp: false diff --git a/examples/computer_vision/iris_tf_keras/adaptive.yaml b/examples/computer_vision/iris_tf_keras/adaptive.yaml index c29d38067b2..a2a4124bc35 100644 --- a/examples/computer_vision/iris_tf_keras/adaptive.yaml +++ b/examples/computer_vision/iris_tf_keras/adaptive.yaml @@ -1,8 +1,8 @@ name: iris_tf_keras_adaptive_search environment: image: - cpu: determinedai/tensorflow-ngc-dev:0736b6d - gpu: determinedai/tensorflow-ngc-dev:0736b6d + cpu: determinedai/tensorflow-ngc:0.38.0 + gpu: determinedai/tensorflow-ngc:0.38.0 hyperparameters: learning_rate: type: log diff --git a/examples/computer_vision/iris_tf_keras/const.yaml b/examples/computer_vision/iris_tf_keras/const.yaml index 37fd8de1e66..2423eb4fcb7 100644 --- a/examples/computer_vision/iris_tf_keras/const.yaml +++ b/examples/computer_vision/iris_tf_keras/const.yaml @@ -1,8 +1,8 @@ name: iris_tf_keras_const environment: image: - cpu: determinedai/tensorflow-ngc-dev:0736b6d - gpu: determinedai/tensorflow-ngc-dev:0736b6d + cpu: determinedai/tensorflow-ngc:0.38.0 + gpu: determinedai/tensorflow-ngc:0.38.0 hyperparameters: learning_rate: 1.0e-4 learning_rate_decay: 1.0e-6 diff --git a/examples/computer_vision/iris_tf_keras/distributed.yaml b/examples/computer_vision/iris_tf_keras/distributed.yaml index 35ee042f776..e7677468169 100644 --- a/examples/computer_vision/iris_tf_keras/distributed.yaml +++ b/examples/computer_vision/iris_tf_keras/distributed.yaml @@ -1,8 +1,8 @@ name: iris_tf_keras_distributed environment: image: - cpu: determinedai/tensorflow-ngc-dev:0736b6d - gpu: determinedai/tensorflow-ngc-dev:0736b6d + cpu: determinedai/tensorflow-ngc:0.38.0 + gpu: determinedai/tensorflow-ngc:0.38.0 hyperparameters: learning_rate: 1.0e-4 learning_rate_decay: 1.0e-6 diff --git a/examples/deepspeed/dcgan/mnist.yaml b/examples/deepspeed/dcgan/mnist.yaml index fb996c55532..ee3a5a10563 100644 --- a/examples/deepspeed/dcgan/mnist.yaml +++ b/examples/deepspeed/dcgan/mnist.yaml @@ -13,7 +13,7 @@ environment: environment_variables: - NCCL_DEBUG=INFO - NCCL_SOCKET_IFNAME=ens,eth,ib - image: determinedai/pytorch-ngc-dev:0736b6d + image: determinedai/pytorch-ngc:0.38.0 bind_mounts: - host_path: /tmp container_path: /data diff --git a/examples/hf_trainer_api/hf_image_classification/deepspeed.yaml b/examples/hf_trainer_api/hf_image_classification/deepspeed.yaml index 698d68f8bba..66fc6d5297a 100644 --- a/examples/hf_trainer_api/hf_image_classification/deepspeed.yaml +++ b/examples/hf_trainer_api/hf_image_classification/deepspeed.yaml @@ -6,7 +6,7 @@ environment: # You may need to modify this to match your network configuration. - NCCL_SOCKET_IFNAME=ens,eth,ib image: - gpu: determinedai/pytorch-ngc-dev:0736b6d + gpu: determinedai/pytorch-ngc:0.38.0 resources: slots_per_trial: 2 searcher: diff --git a/examples/hf_trainer_api/hf_language_modeling/deepspeed.yaml b/examples/hf_trainer_api/hf_language_modeling/deepspeed.yaml index 8facb3c47ac..66c199299a2 100644 --- a/examples/hf_trainer_api/hf_language_modeling/deepspeed.yaml +++ b/examples/hf_trainer_api/hf_language_modeling/deepspeed.yaml @@ -6,7 +6,7 @@ environment: # You may need to modify this to match your network configuration. - NCCL_SOCKET_IFNAME=ens,eth,ib image: - gpu: determinedai/pytorch-ngc-dev:0736b6d + gpu: determinedai/pytorch-ngc:0.38.0 resources: slots_per_trial: 2 searcher: diff --git a/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-keras/metadata.json b/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-keras/metadata.json index 35498150bbb..9bbcc4649be 100644 --- a/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-keras/metadata.json +++ b/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-keras/metadata.json @@ -39,8 +39,8 @@ }, "force_pull_image": false, "image": { - "cpu": "determinedai/tensorflow-ngc-dev:0736b6d", - "cuda": "determinedai/tensorflow-ngc-dev:0736b6d", + "cpu": "determinedai/tensorflow-ngc:0.38.0", + "cuda": "determinedai/tensorflow-ngc:0.38.0", "rocm": "determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512" }, "pod_spec": null, diff --git a/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-pytorch/metadata.json b/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-pytorch/metadata.json index 7c49c7151c7..99cc52df23b 100644 --- a/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-pytorch/metadata.json +++ b/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-pytorch/metadata.json @@ -38,8 +38,8 @@ }, "force_pull_image": false, "image": { - "cpu": "determinedai/tensorflow-ngc-dev:0736b6d", - "cuda": "determinedai/tensorflow-ngc-dev:0736b6d", + "cpu": "determinedai/tensorflow-ngc:0.38.0", + "cuda": "determinedai/tensorflow-ngc:0.38.0", "rocm": "determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512" }, "pod_spec": null, diff --git a/harness/tests/fixtures/checkpoint.json b/harness/tests/fixtures/checkpoint.json index 72c23eabd41..81383f5248a 100644 --- a/harness/tests/fixtures/checkpoint.json +++ b/harness/tests/fixtures/checkpoint.json @@ -69,8 +69,8 @@ }, "force_pull_image":false, "image":{ - "cpu":"determinedai/pytorch-ngc-dev:0736b6d", - "cuda":"determinedai/pytorch-ngc-dev:0736b6d", + "cpu":"determinedai/pytorch-ngc:0.38.0", + "cuda":"determinedai/pytorch-ngc:0.38.0", "rocm":"determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512" }, "pod_spec":null, diff --git a/helm/charts/determined/values.yaml b/helm/charts/determined/values.yaml index 1104896c9bc..ae2250e0313 100644 --- a/helm/charts/determined/values.yaml +++ b/helm/charts/determined/values.yaml @@ -27,8 +27,8 @@ defaultImages: kubeScheduler: "k8s.gcr.io/scheduler-plugins/kube-scheduler:v0.18.9" # default images for CPU and GPU environments - cpuImage: "determinedai/pytorch-ngc-dev:0736b6d" - gpuImage: "determinedai/pytorch-ngc-dev:0736b6d" + cpuImage: "determinedai/pytorch-ngc:0.38.0" + gpuImage: "determinedai/pytorch-ngc:0.38.0" rocmImage: "determinedai/environments:rocm-5.6-pytorch-1.3-tf-2.10-rocm-mpich-0736b6d" # Install Determined enterprise edition. diff --git a/master/pkg/schemas/expconf/const.go b/master/pkg/schemas/expconf/const.go index 48c3d0a20ec..51eb93beff2 100644 --- a/master/pkg/schemas/expconf/const.go +++ b/master/pkg/schemas/expconf/const.go @@ -8,8 +8,8 @@ const ( // Default task environment docker image names. const ( - CPUImage = "determinedai/pytorch-ngc-dev:0736b6d" - CUDAImage = "determinedai/pytorch-ngc-dev:0736b6d" + CPUImage = "determinedai/pytorch-ngc:0.38.0" + CUDAImage = "determinedai/pytorch-ngc:0.38.0" ROCMImage = "determinedai/environments:rocm-5.6-pytorch-1.3-tf-2.10-rocm-mpich-0736b6d" ) diff --git a/schemas/test_cases/v0/experiment.yaml b/schemas/test_cases/v0/experiment.yaml index 6f12c8e6631..dd6c47007a7 100644 --- a/schemas/test_cases/v0/experiment.yaml +++ b/schemas/test_cases/v0/experiment.yaml @@ -47,8 +47,8 @@ environment_variables: {} force_pull_image: false image: - cpu: determinedai/pytorch-ngc-dev:0736b6d - cuda: determinedai/pytorch-ngc-dev:0736b6d + cpu: determinedai/pytorch-ngc:0.38.0 + cuda: determinedai/pytorch-ngc:0.38.0 rocm: determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512 pod_spec: null ports: diff --git a/tools/scripts/bumpenvs.yaml b/tools/scripts/bumpenvs.yaml index 770b987540a..2e03491db58 100644 --- a/tools/scripts/bumpenvs.yaml +++ b/tools/scripts/bumpenvs.yaml @@ -14,7 +14,7 @@ deepspeed_0_hashed: {new: determinedai/environments:cuda-11.3-pytorch-1.10-deeps old: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-079eb6d} deepspeed_0_versioned: {new: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.31.1, old: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.30.1} -deepspeed_gpt_neox_0_hashed: {new: determinedai/deepspeed-cuda-gpt-neox:0736b6d, old: determinedai/deepspeed-cuda-gpt-neox:5432424} +deepspeed_gpt_neox_0_hashed: {new: determinedai/deepspeed-cuda-gpt-neox:0.38.0, old: determinedai/deepspeed-cuda-gpt-neox:0736b6d} deepspeed_gpu_0_hashed: {new: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-2196775, old: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-f66cbce} deepspeed_gpu_0_versioned: {new: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.29.1, @@ -105,21 +105,18 @@ pytorch20_tf210_rocm56_1_versioned: {new: determinedai/environments:rocm-5.6-pyt old: determinedai/environments:rocm-5.6-pytorch-2.0-tf-2.10-rocm-ompi-0.27.1} pytorch20_tf210_rocm61_0_hashed: {new: determinedai/environments:rocm-6.1-pytorch-2.0-tf-2.10-rocm-mpich-0736b6d} pytorch20_tf210_rocm61_0_versioned: {new: determinedai/environments:rocm-6.1-pytorch-2.0-tf-2.10-rocm-mpich-0.35.1} -pytorch_cpu_0_hashed: {new: determinedai/pytorch-cpu-dev:0736b6d, old: determinedai/pytorch-cpu-dev:5432424} -pytorch_cpu_1_hashed: {new: determinedai/pytorch-cpu-hpc-dev:0736b6d, old: determinedai/pytorch-cpu-hpc-dev:5432424} -pytorch_cuda_0_hashed: {new: determinedai/pytorch-cuda-dev:0736b6d, old: determinedai/pytorch-cuda-dev:5432424} -pytorch_cuda_1_hashed: {new: determinedai/pytorch-cuda-hpc-dev:0736b6d, old: determinedai/pytorch-cuda-hpc-dev:5432424} -pytorch_ngc_hashed: {new: determinedai/pytorch-ngc-dev:0736b6d, old: determinedai/pytorch-ngc-dev:5432424} -pytorch_ngc_hpc_hashed: {new: determinedai/pytorch-ngc-hpc-dev:0736b6d, old: determinedai/pytorch-ngc-hpc-dev:5432424} -tensorflow_cpu_0_hashed: {new: determinedai/pytorch-tensorflow-cpu-dev:0736b6d, old: determinedai/pytorch-tensorflow-cpu-dev:5432424} -tensorflow_cpu_1_hashed: {new: determinedai/pytorch-tensorflow-cpu-hpc-dev:0736b6d, - old: determinedai/pytorch-tensorflow-cpu-hpc-dev:5432424} -tensorflow_cuda_0_hashed: {new: determinedai/pytorch-tensorflow-cuda-dev:0736b6d, - old: determinedai/pytorch-tensorflow-cuda-dev:5432424} -tensorflow_cuda_1_hashed: {new: determinedai/pytorch-tensorflow-cuda-hpc-dev:0736b6d, - old: determinedai/pytorch-tensorflow-cuda-hpc-dev:5432424} -tensorflow_ngc_hashed: {new: determinedai/tensorflow-ngc-dev:0736b6d, old: determinedai/tensorflow-ngc-dev:5432424} -tensorflow_ngc_hpc_hashed: {new: determinedai/tensorflow-ngc-hpc-dev:0736b6d, old: determinedai/tensorflow-ngc-hpc-dev:5432424} +pytorch_cpu_0_hashed: {new: determinedai/pytorch-cpu:0.38.0, old: determinedai/pytorch-cpu-dev:0736b6d} +pytorch_cpu_1_hashed: {new: determinedai/pytorch-cpu-hpc:0.38.0, old: determinedai/pytorch-cpu-hpc-dev:0736b6d} +pytorch_cuda_0_hashed: {new: determinedai/pytorch-cuda:0.38.0, old: determinedai/pytorch-cuda-dev:0736b6d} +pytorch_cuda_1_hashed: {new: determinedai/pytorch-cuda-hpc:0.38.0, old: determinedai/pytorch-cuda-hpc-dev:0736b6d} +pytorch_ngc_hashed: {new: determinedai/pytorch-ngc:0.38.0, old: determinedai/pytorch-ngc-dev:0736b6d} +pytorch_ngc_hpc_hashed: {new: determinedai/pytorch-ngc-hpc:0.38.0, old: determinedai/pytorch-ngc-hpc-dev:0736b6d} +tensorflow_cpu_0_hashed: {new: determinedai/pytorch-tensorflow-cpu:0.38.0, old: determinedai/pytorch-tensorflow-cpu-dev:0736b6d} +tensorflow_cpu_1_hashed: {new: determinedai/pytorch-tensorflow-cpu-hpc:0.38.0, old: determinedai/pytorch-tensorflow-cpu-hpc-dev:0736b6d} +tensorflow_cuda_0_hashed: {new: determinedai/pytorch-tensorflow-cuda:0.38.0, old: determinedai/pytorch-tensorflow-cuda-dev:0736b6d} +tensorflow_cuda_1_hashed: {new: determinedai/pytorch-tensorflow-cuda-hpc:0.38.0, old: determinedai/pytorch-tensorflow-cuda-hpc-dev:0736b6d} +tensorflow_ngc_hashed: {new: determinedai/tensorflow-ngc:0.38.0, old: determinedai/tensorflow-ngc-dev:0736b6d} +tensorflow_ngc_hpc_hashed: {new: determinedai/tensorflow-ngc-hpc:0.38.0, old: determinedai/tensorflow-ngc-hpc-dev:0736b6d} tf24_cpu_0_hashed: {new: determinedai/environments:py-3.8-pytorch-1.9-tf-2.4-cpu-24586f0, old: determinedai/environments-dev:py-3.8-pytorch-1.9-tf-2.4-cpu-1c769fb} tf24_cpu_0_versioned: {new: determinedai/environments:py-3.8-pytorch-1.9-tf-2.4-cpu-0.19.10, diff --git a/tools/scripts/environments-target.txt b/tools/scripts/environments-target.txt index 12461fa7720..ca75280b09b 100644 --- a/tools/scripts/environments-target.txt +++ b/tools/scripts/environments-target.txt @@ -1 +1 @@ -0736b6d +0.38.0 diff --git a/webui/react/src/fixtures/responses/experiment-details/non-scalar-metrics-4078.json b/webui/react/src/fixtures/responses/experiment-details/non-scalar-metrics-4078.json index 48ce35dabb8..786d1b19624 100644 --- a/webui/react/src/fixtures/responses/experiment-details/non-scalar-metrics-4078.json +++ b/webui/react/src/fixtures/responses/experiment-details/non-scalar-metrics-4078.json @@ -29,8 +29,8 @@ "name": "Fork of Fork of mnist_tp_to_estimator_const", "environment": { "image": { - "cpu": "determinedai/pytorch-ngc-dev:0736b6d", - "gpu": "determinedai/pytorch-ngc-dev:0736b6d" + "cpu": "determinedai/pytorch-ngc:0.38.0", + "gpu": "determinedai/pytorch-ngc:0.38.0" }, "ports": null, "pod_spec": null, diff --git a/webui/react/src/fixtures/responses/experiment-details/set-a.json b/webui/react/src/fixtures/responses/experiment-details/set-a.json index d8d619c7239..6ac4d0f4cfa 100644 --- a/webui/react/src/fixtures/responses/experiment-details/set-a.json +++ b/webui/react/src/fixtures/responses/experiment-details/set-a.json @@ -679,8 +679,8 @@ "environment_variables": {}, "force_pull_image": false, "image": { - "cpu": "determinedai/pytorch-ngc-dev:0736b6d", - "gpu": "determinedai/pytorch-ngc-dev:0736b6d" + "cpu": "determinedai/pytorch-ngc:0.38.0", + "gpu": "determinedai/pytorch-ngc:0.38.0" }, "pod_spec": null, "ports": null @@ -820,8 +820,8 @@ "environment_variables": {}, "force_pull_image": false, "image": { - "cpu": "determinedai/tensorflow-ngc-dev:0736b6d", - "gpu": "determinedai/tensorflow-ngc-dev:0736b6d" + "cpu": "determinedai/tensorflow-ngc:0.38.0", + "gpu": "determinedai/tensorflow-ngc:0.38.0" }, "pod_spec": { "metadata": { diff --git a/webui/react/src/fixtures/responses/trial-details/old-trial-config-noop-adaptive.json b/webui/react/src/fixtures/responses/trial-details/old-trial-config-noop-adaptive.json index 2d798dac0a4..13874845ce2 100644 --- a/webui/react/src/fixtures/responses/trial-details/old-trial-config-noop-adaptive.json +++ b/webui/react/src/fixtures/responses/trial-details/old-trial-config-noop-adaptive.json @@ -30,8 +30,8 @@ "name": "noop_adaptive", "environment": { "image": { - "cpu": "determinedai/pytorch-ngc-dev:0736b6d", - "gpu": "determinedai/pytorch-ngc-dev:0736b6d" + "cpu": "determinedai/pytorch-ngc:0.38.0", + "gpu": "determinedai/pytorch-ngc:0.38.0" }, "ports": null, "force_pull_image": false,