Merge branch 'master' into fil_rest
tarukumar authored Dec 4, 2024
2 parents 9134b31 + cc96d97 commit a60778a
Showing 20 changed files with 664 additions and 515 deletions.
ods_ci/tests/Resources/CLI/MustGather/MustGather.resource (4 changes: 2 additions & 2 deletions)
@@ -7,7 +7,7 @@ Resource ../../Common.robot


*** Keywords ***
-Get must-gather Logs
+Get Must-Gather Logs
[Documentation] Runs the must-gather image and obtains the ODH/RHOAI logs
${output}= Run process tests/Resources/CLI/MustGather/get-must-gather-logs.sh shell=yes
Should Be Equal As Integers ${output.rc} 0
@@ -27,6 +27,6 @@ Verify Logs For ${namespace}
${log_files}= Run find ${namespaces_log_dir}/${namespace}/pods -type f -name "*.log"
Should Not Be Equal ${log_files} ${EMPTY}

-Cleanup must-gather Logs
+Cleanup Must-Gather Logs
[Documentation] Deletes the folder with the must-gather logs
Run Keyword If "${must_gather_dir}" != "${EMPTY}" Remove Directory ${must_gather_dir} recursive=True
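
Note: "Get Must-Gather Logs" simply runs the helper script and asserts a zero exit code. A minimal Python sketch of the equivalent check, using only what the resource file shows (the script path and the rc assertion); the wrapper itself is illustrative:

import subprocess

# Equivalent of: Run Process tests/Resources/CLI/MustGather/get-must-gather-logs.sh shell=yes
output = subprocess.run(
    "tests/Resources/CLI/MustGather/get-must-gather-logs.sh",
    shell=True, capture_output=True, text=True,
)
# Equivalent of: Should Be Equal As Integers ${output.rc} 0
assert output.returncode == 0, f"must-gather script failed: {output.stderr}"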
==== next file (path not shown) ====
@@ -10,7 +10,7 @@ spec:
name: caikit
containers:
- name: kserve-container
-image: quay.io/opendatahub/caikit-nlp:stable
+image: quay.io/modh/caikit-nlp@sha256:3c33185fda84d7bac6715c8743c446a6713cdbc0cb0ed831acc0df89bd8bab6b
command: ["python", "-m", "caikit.runtime"]
env:
- name: RUNTIME_LOCAL_MODELS_DIR
==== next file (path not shown) ====
@@ -10,7 +10,7 @@ spec:
name: caikit
containers:
- name: kserve-container
-image: quay.io/opendatahub/caikit-nlp:stable
+image: quay.io/modh/caikit-nlp@sha256:3c33185fda84d7bac6715c8743c446a6713cdbc0cb0ed831acc0df89bd8bab6b
command: ["python", "-m", "caikit.runtime"]
env:
- name: RUNTIME_LOCAL_MODELS_DIR
==== next file (path not shown) ====
@@ -12,7 +12,7 @@ spec:
- '--served-model-name={{.Name}}'
- '--distributed-executor-backend=mp'
- '--chat-template=/app/data/template/template_chatml.jinja'
-image: quay.io/modh/vllm@sha256:c86ff1e89c86bc9821b75d7f2bbc170b3c13e3ccf538bf543b1110f23e056316
+image: ${runtime_image}
name: kserve-container
command:
- python3
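
Swapping the pinned vLLM digest for a ${runtime_image} placeholder lets one template serve whichever runtime image a test run selects. A minimal sketch of how such a placeholder could be rendered before the manifest is applied; string.Template understands the ${...} syntax, but the file names and the substitution mechanism are assumptions here, not necessarily what ods-ci uses (the digest shown is the one this change removes):

from pathlib import Path
from string import Template

template_text = Path("vllm_runtime_template.yaml").read_text()  # hypothetical path
rendered = Template(template_text).safe_substitute(
    runtime_image="quay.io/modh/vllm@sha256:c86ff1e89c86bc9821b75d7f2bbc170b3c13e3ccf538bf543b1110f23e056316"
)
# safe_substitute only touches ${...} tokens, so the Go-style {{.Name}} fields survive.
Path("vllm_runtime.yaml").write_text(rendered)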
==== next file (path not shown) ====
@@ -3,7 +3,7 @@

# Runtime: Pytorch with ROCm and Python 3.9 (UBI 9)
common_base_image = (
"quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10"
"quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852"
)


@@ -14,11 +14,9 @@ def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_li
kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")


-@dsl.component(
-base_image=common_base_image
-)
+@dsl.component(base_image=common_base_image)
def verify_gpu_availability(gpu_toleration: bool):
-import torch
+import torch  # noqa: PLC0415

cuda_available = torch.cuda.is_available()
device_count = torch.cuda.device_count()
@@ -30,7 +28,7 @@ def verify_gpu_availability(gpu_toleration: bool):
if gpu_toleration:
assert torch.cuda.is_available()
assert torch.cuda.device_count() > 0
-t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
+t = torch.tensor([5, 5, 5], dtype=torch.int64, device="cuda")
else:
assert not torch.cuda.is_available()
assert torch.cuda.device_count() == 0
==== next file (path not shown) ====
@@ -42,18 +42,18 @@ deploymentSpec:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
\ print(\"------------------------------\")\n print(\"GPU availability\"\
)\n print(\"------------------------------\")\n print(f\"cuda available:\
\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
)\n print(\"GPU availability\")\n print(\"------------------------------\"\
)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852
exec-verify-gpu-availability-2:
container:
args:
@@ -80,18 +80,18 @@
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
\ print(\"------------------------------\")\n print(\"GPU availability\"\
)\n print(\"------------------------------\")\n print(f\"cuda available:\
\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
)\n print(\"GPU availability\")\n print(\"------------------------------\"\
)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852
resources:
accelerator:
count: '1'
==== next file (path not shown) ====
@@ -2,8 +2,9 @@
from kfp.dsl import PipelineTask

# Runtime: Pytorch with CUDA and Python 3.9 (UBI 9)
+# Images for each release can be found here (in the branch for the release)
common_base_image = (
-"quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a"
+"quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd"
)


@@ -14,11 +15,9 @@ def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_li
kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")


-@dsl.component(
-base_image=common_base_image
-)
+@dsl.component(base_image=common_base_image)
def verify_gpu_availability(gpu_toleration: bool):
-import torch
+import torch  # noqa: PLC0415

cuda_available = torch.cuda.is_available()
device_count = torch.cuda.device_count()
@@ -30,7 +29,7 @@ def verify_gpu_availability(gpu_toleration: bool):
if gpu_toleration:
assert torch.cuda.is_available()
assert torch.cuda.device_count() > 0
-t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
+t = torch.tensor([5, 5, 5], dtype=torch.int64, device="cuda")
else:
assert not torch.cuda.is_available()
assert torch.cuda.device_count() == 0
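
For context, a minimal sketch of how this component and the toleration helper might be wired into a pipeline. The pipeline name and accelerator values are illustrative, and the helper's full signature is assumed to be (task, accelerator_type, accelerator_limit), completing the truncated hunk header above:

from kfp import dsl

@dsl.pipeline(name="verify-gpu-availability")  # hypothetical pipeline name
def gpu_check_pipeline():
    # GPU leg: tolerate the accelerator taint so the pod can schedule on a GPU node.
    with_gpu = verify_gpu_availability(gpu_toleration=True)
    add_gpu_toleration(with_gpu, accelerator_type="nvidia.com/gpu", accelerator_limit=1)

    # CPU leg: no toleration, so the component's asserts expect CUDA to be absent.
    verify_gpu_availability(gpu_toleration=False)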
==== next file (path not shown) ====
@@ -42,18 +42,18 @@ deploymentSpec:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
\ print(\"------------------------------\")\n print(\"GPU availability\"\
)\n print(\"------------------------------\")\n print(f\"cuda available:\
\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
)\n print(\"GPU availability\")\n print(\"------------------------------\"\
)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd
exec-verify-gpu-availability-2:
container:
args:
@@ -80,18 +80,18 @@
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
\ print(\"------------------------------\")\n print(\"GPU availability\"\
)\n print(\"------------------------------\")\n print(f\"cuda available:\
\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
)\n print(\"GPU availability\")\n print(\"------------------------------\"\
)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd
resources:
accelerator:
count: '1'
==== next file (path not shown) ====
@@ -10,11 +10,26 @@ ${CODEFLARE-SDK-RELEASE-TAG-3.9} adjustments-release-0.21.1
${CODEFLARE-SDK_DIR} codeflare-sdk
${CODEFLARE-SDK_REPO_URL} %{CODEFLARE-SDK_REPO_URL=https://github.com/project-codeflare/codeflare-sdk.git}
${DISTRIBUTED_WORKLOADS_RELEASE_ASSETS} https://github.com/opendatahub-io/distributed-workloads/releases/latest/download
-${RAY_IMAGE_3.11}    quay.io/modh/ray@sha256:db667df1bc437a7b0965e8031e905d3ab04b86390d764d120e05ea5a5c18d1b4
-${RAY_IMAGE_3.9}    quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
+# Corresponds to quay.io/modh/ray:2.35.0-py311-cu121
+${RAY_CUDA_IMAGE_3.11}    quay.io/modh/ray@sha256:db667df1bc437a7b0965e8031e905d3ab04b86390d764d120e05ea5a5c18d1b4
+# Corresponds to quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26
+${RAY_TORCH_CUDA_IMAGE_3.11}    quay.io/rhoai/ray@sha256:5077f9bb230dfa88f34089fecdfcdaa8abc6964716a8a8325c7f9dcdf11bbbb3
+# Corresponds to quay.io/modh/ray:2.35.0-py311-rocm61
+${RAY_ROCM_IMAGE_3.11}    quay.io/modh/ray@sha256:f8b4f2b1c954187753c1f5254f7bb6a4286cec5a4f1b43def7ef4e009f2d28cb
+# Corresponds to quay.io/modh/ray:2.35.0-py39-cu121
+${RAY_CUDA_IMAGE_3.9}    quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
+# Corresponds to quay.io/rhoai/ray:2.35.0-py39-cu121-torch24-fa26
+${RAY_TORCH_CUDA_IMAGE_3.9}    quay.io/rhoai/ray@sha256:158b481b8e9110008d60ac9fb8d156eadd71cb057ac30382e62e3a231ceb39c0
+# Corresponds to quay.io/modh/fms-hf-tuning:v2.1.2
+${FMS_HF_TUNING_IMAGE}    quay.io/modh/fms-hf-tuning@sha256:6f98907f9095db72932caa54094438eae742145f4b66c28d15887d5303ff1186
+# Corresponds to quay.io/modh/training:py311-cuda121-torch241
+${CUDA_TRAINING_IMAGE}    quay.io/modh/training@sha256:b98e373a972ff6f896a9dc054d56920e915675339c02ea7fa123e0f4bbef4d74
+# Corresponds to quay.io/modh/training:py311-rocm61-torch241
+${ROCM_TRAINING_IMAGE}    quay.io/modh/training@sha256:2efb6efba4ec08e63847d701e3062a5f6ddf51c91af5fbcef6378b9e6520a3bb
+# Corresponds to quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111
+${NOTEBOOK_IMAGE_3.11}    quay.io/modh/odh-generic-data-science-notebook@sha256:7c1a4ca213b71d342a2d1366171304e469da06d5f15710fab5dd3ce013aa1b73
+# Corresponds to quay.io/modh/odh-generic-data-science-notebook:v2-2024a-20241108
+${NOTEBOOK_IMAGE_3.9}    quay.io/modh/odh-generic-data-science-notebook@sha256:b1066204611b4bcfa6172c3115650a8e8393089d5606458fa0d8c53633d2ce17
${NOTEBOOK_USER_NAME} ${TEST_USER_3.USERNAME}
${NOTEBOOK_USER_PASSWORD} ${TEST_USER_3.PASSWORD}
${KFTO_CORE_BINARY_NAME} kfto
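
Each digest-pinned variable above keeps a comment naming the human-readable tag it was resolved from. A minimal sketch for spot-checking that a comment still matches its digest, assuming skopeo is installed; the tag/digest pair is the ${RAY_CUDA_IMAGE_3.11} entry from this file:

import subprocess

def resolve_digest(image_ref: str) -> str:
    """Ask the registry for the manifest digest of a tagged image."""
    out = subprocess.run(
        ["skopeo", "inspect", "--format", "{{.Digest}}", f"docker://{image_ref}"],
        capture_output=True, text=True, check=True,
    )
    return out.stdout.strip()

expected = "sha256:db667df1bc437a7b0965e8031e905d3ab04b86390d764d120e05ea5a5c18d1b4"
assert resolve_digest("quay.io/modh/ray:2.35.0-py311-cu121") == expected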
==== next file (path not shown) ====
@@ -16,10 +16,10 @@ Verify that the must-gather image provides RHODS logs and info
... MustGather
... ExcludeOnODH
... ExcludeOnDisconnected
-Get must-gather Logs
-Verify logs for ${APPLICATIONS_NAMESPACE}
+Get Must-Gather Logs
+Verify Logs For ${APPLICATIONS_NAMESPACE}
IF "${PRODUCT}" == "RHODS"
Verify Logs For ${OPERATOR_NAMESPACE}
-Run Keyword If RHODS Is Managed Verify logs for ${MONITORING_NAMESPACE}
+Run Keyword If RHODS Is Managed Verify Logs For ${MONITORING_NAMESPACE}
END
[Teardown] Cleanup must-gather Logs