From 40d3509868f184b3198546141525f2ba0d35b449 Mon Sep 17 00:00:00 2001 From: RAGHUL M Date: Tue, 3 Dec 2024 14:39:57 +0530 Subject: [PATCH 1/2] Smoke Test failure - Name fix for Runtime template (#2103) * Name fix for Runtime template * Name fix for Runtime YAML template * uncommented teardown --------- Co-authored-by: Tarun Kumar --- .../1003__model_serving_customruntimes.robot | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ods_ci/tests/Tests/1000__model_serving/1003__model_serving_customruntimes.robot b/ods_ci/tests/Tests/1000__model_serving/1003__model_serving_customruntimes.robot index 7cd957f57..6bb59ee08 100644 --- a/ods_ci/tests/Tests/1000__model_serving/1003__model_serving_customruntimes.robot +++ b/ods_ci/tests/Tests/1000__model_serving/1003__model_serving_customruntimes.robot @@ -13,6 +13,7 @@ Test Tags Dashboard ${RESOURCES_DIRPATH}= tests/Resources/Files ${OVMS_RUNTIME_FILEPATH}= ${RESOURCES_DIRPATH}/ovms_servingruntime.yaml ${UPLOADED_OVMS_DISPLAYED_NAME}= ODS-CI Custom OpenVINO Model Server +${UPLOADED_OVMS_YAML_NAME}= ovms-ods-ci ${PRJ_TITLE}= CustomServingRuntimesProject ${PRJ_DESCRIPTION}= ODS-CI DS Project for testing of Custom Serving Runtimes ${MODEL_SERVER_NAME}= ODS-CI CustomServingRuntime Server @@ -25,7 +26,7 @@ Verify RHODS Admins Can Import A Custom Serving Runtime Template By Uploading A Open Dashboard Settings settings_page=Serving runtimes Upload Serving Runtime Template runtime_filepath=${OVMS_RUNTIME_FILEPATH} ... serving_platform=multi runtime_protocol=gRPC - Serving Runtime Template Should Be Listed displayed_name=${UPLOADED_OVMS_DISPLAYED_NAME} + Serving Runtime Template Should Be Listed displayed_name=${UPLOADED_OVMS_YAML_NAME} ... serving_platform=multi Verify RHODS Admins Can Delete A Custom Serving Runtime Template From 0e5a93e0d5d123f62d1b1a1cb08b020c9942266a Mon Sep 17 00:00:00 2001 From: Jorge Date: Tue, 3 Dec 2024 10:18:38 +0100 Subject: [PATCH 2/2] Update images used in nvidia and rocm pipeline testing for 2.16 (master) (#2086) Update images used in nvidia and rocm pipeline testing for 2.16 Use the workbench images available in 2.16 RC2 Signed-off-by: Jorge Garcia Oncins --- .../pytorch/pytorch_amd_gpu_availability.py | 10 ++-- ...pytorch_amd_gpu_availability_compiled.yaml | 48 +++++++++---------- .../pytorch_nvidia_gpu_availability.py | 11 ++--- ...orch_nvidia_gpu_availability_compiled.yaml | 48 +++++++++---------- 4 files changed, 57 insertions(+), 60 deletions(-) diff --git a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability.py b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability.py index bd9b74b69..52c6d83d2 100644 --- a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability.py +++ b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability.py @@ -3,7 +3,7 @@ # Runtime: Pytorch with ROCm and Python 3.9 (UBI 9) common_base_image = ( - "quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10" + "quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852" ) @@ -14,11 +14,9 @@ def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_li kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule") -@dsl.component( - base_image=common_base_image -) 
+@dsl.component(base_image=common_base_image) def verify_gpu_availability(gpu_toleration: bool): - import torch + import torch # noqa: PLC0415 cuda_available = torch.cuda.is_available() device_count = torch.cuda.device_count() @@ -30,7 +28,7 @@ def verify_gpu_availability(gpu_toleration: bool): if gpu_toleration: assert torch.cuda.is_available() assert torch.cuda.device_count() > 0 - t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda') + t = torch.tensor([5, 5, 5], dtype=torch.int64, device="cuda") else: assert not torch.cuda.is_available() assert torch.cuda.device_count() == 0 diff --git a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability_compiled.yaml b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability_compiled.yaml index 8652d23c5..d3f158ecd 100644 --- a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability_compiled.yaml +++ b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability_compiled.yaml @@ -42,18 +42,18 @@ deploymentSpec: ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\ - \n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\ - \ print(\"------------------------------\")\n print(\"GPU availability\"\ - )\n print(\"------------------------------\")\n print(f\"cuda available:\ - \ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \ - \ if gpu_toleration:\n assert torch.cuda.is_available()\n \ - \ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\ - \ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\ - \ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\ - \ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\ - \ availability test: PASS\")\n\n" - image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10 + \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\ + \ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \ + \ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\ + )\n print(\"GPU availability\")\n print(\"------------------------------\"\ + )\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\ + \ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\ + \ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\ + \ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\ + \ not torch.cuda.is_available()\n assert torch.cuda.device_count()\ + \ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\ + tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n" + image: quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852 exec-verify-gpu-availability-2: container: args: @@ -80,18 +80,18 @@ deploymentSpec: ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\ - \n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\ - \ print(\"------------------------------\")\n print(\"GPU availability\"\ - )\n print(\"------------------------------\")\n print(f\"cuda 
available:\ - \ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \ - \ if gpu_toleration:\n assert torch.cuda.is_available()\n \ - \ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\ - \ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\ - \ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\ - \ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\ - \ availability test: PASS\")\n\n" - image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10 + \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\ + \ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \ + \ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\ + )\n print(\"GPU availability\")\n print(\"------------------------------\"\ + )\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\ + \ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\ + \ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\ + \ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\ + \ not torch.cuda.is_available()\n assert torch.cuda.device_count()\ + \ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\ + tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n" + image: quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852 resources: accelerator: count: '1' diff --git a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability.py b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability.py index fa32cd9b0..d593a8c5c 100644 --- a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability.py +++ b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability.py @@ -2,8 +2,9 @@ from kfp.dsl import PipelineTask # Runtime: Pytorch with CUDA and Python 3.9 (UBI 9) +# Images for each release can be found here (in the branch for the release) common_base_image = ( - "quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a" + "quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd" ) @@ -14,11 +15,9 @@ def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_li kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule") -@dsl.component( - base_image=common_base_image -) +@dsl.component(base_image=common_base_image) def verify_gpu_availability(gpu_toleration: bool): - import torch + import torch # noqa: PLC0415 cuda_available = torch.cuda.is_available() device_count = torch.cuda.device_count() @@ -30,7 +29,7 @@ def verify_gpu_availability(gpu_toleration: bool): if gpu_toleration: assert torch.cuda.is_available() assert torch.cuda.device_count() > 0 - t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda') + t = torch.tensor([5, 5, 5], dtype=torch.int64, device="cuda") else: assert not torch.cuda.is_available() assert torch.cuda.device_count() == 0 diff --git a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability_compiled.yaml b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability_compiled.yaml 
index d66218962..95cbebf16 100644 --- a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability_compiled.yaml +++ b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability_compiled.yaml @@ -42,18 +42,18 @@ deploymentSpec: ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\ - \n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\ - \ print(\"------------------------------\")\n print(\"GPU availability\"\ - )\n print(\"------------------------------\")\n print(f\"cuda available:\ - \ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \ - \ if gpu_toleration:\n assert torch.cuda.is_available()\n \ - \ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\ - \ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\ - \ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\ - \ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\ - \ availability test: PASS\")\n\n" - image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a + \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\ + \ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \ + \ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\ + )\n print(\"GPU availability\")\n print(\"------------------------------\"\ + )\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\ + \ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\ + \ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\ + \ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\ + \ not torch.cuda.is_available()\n assert torch.cuda.device_count()\ + \ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\ + tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n" + image: quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd exec-verify-gpu-availability-2: container: args: @@ -80,18 +80,18 @@ deploymentSpec: ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\ - \n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\ - \ print(\"------------------------------\")\n print(\"GPU availability\"\ - )\n print(\"------------------------------\")\n print(f\"cuda available:\ - \ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \ - \ if gpu_toleration:\n assert torch.cuda.is_available()\n \ - \ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\ - \ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\ - \ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\ - \ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\ - \ availability test: PASS\")\n\n" - image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a + \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\ + \ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \ + \ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\ + )\n print(\"GPU 
availability\")\n print(\"------------------------------\"\ + )\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\ + \ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\ + \ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\ + \ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\ + \ not torch.cuda.is_available()\n assert torch.cuda.device_count()\ + \ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\ + tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n" + image: quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd resources: accelerator: count: '1'