Update images used in nvidia and rocm pipeline testing for 2.16 (backport to releases/2.16.0) (red-hat-data-services#2087)

Update images used in nvidia and rocm pipeline testing for 2.16

Use the workbench images available in 2.16 RC2

Signed-off-by: Jorge Garcia Oncins <[email protected]>
jgarciao authored Dec 3, 2024
1 parent fd60ad0 commit db89f0a
Showing 4 changed files with 57 additions and 60 deletions.
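Each pipeline is defined in a Python file and checked in next to the YAML that KFP compiles from it, which is why one digest bump touches four files: two sources and two compiled specs. A minimal sketch of the recompile step, assuming hypothetical module, pipeline, and output names (the real file names are not shown on this page):

# After editing common_base_image in the pipeline source, regenerate the
# compiled spec so the embedded image digests stay in sync with it.
from kfp import compiler

from pytorch_gpu_availability import gpu_availability_pipeline  # hypothetical names

compiler.Compiler().compile(
    pipeline_func=gpu_availability_pipeline,
    package_path="pytorch_gpu_availability.yaml",  # hypothetical output path
)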
File 1 of 4: ROCm pipeline test definition (Python)
@@ -3,7 +3,7 @@
 
 # Runtime: Pytorch with ROCm and Python 3.9 (UBI 9)
 common_base_image = (
-    "quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10"
+    "quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852"
 )
 
 
@@ -14,11 +14,9 @@ def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_li
     kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")
 
 
-@dsl.component(
-    base_image=common_base_image
-)
+@dsl.component(base_image=common_base_image)
 def verify_gpu_availability(gpu_toleration: bool):
-    import torch
+    import torch  # noqa: PLC0415
 
     cuda_available = torch.cuda.is_available()
     device_count = torch.cuda.device_count()
@@ -30,7 +28,7 @@ def verify_gpu_availability(gpu_toleration: bool):
     if gpu_toleration:
         assert torch.cuda.is_available()
         assert torch.cuda.device_count() > 0
-        t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
+        t = torch.tensor([5, 5, 5], dtype=torch.int64, device="cuda")
     else:
         assert not torch.cuda.is_available()
         assert torch.cuda.device_count() == 0
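The rest of this file is collapsed above, so the pipeline wiring is not visible here. A sketch of how a component like this is typically attached to a pipeline, assuming the truncated third parameter of add_gpu_toleration is an accelerator limit and that "amd.com/gpu" is the resource key for ROCm nodes (both assumptions):

from kfp import dsl

@dsl.pipeline(name="rocm-gpu-availability")  # hypothetical pipeline name
def rocm_gpu_availability_pipeline():
    # Task expected to see a GPU: request the accelerator and tolerate the taint.
    with_gpu = verify_gpu_availability(gpu_toleration=True)
    add_gpu_toleration(with_gpu, "amd.com/gpu", 1)

    # Control task scheduled on a regular node: must see no GPU.
    verify_gpu_availability(gpu_toleration=False)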
File 2 of 4: compiled KFP spec for the ROCm pipeline (YAML)
@@ -42,18 +42,18 @@ deploymentSpec:
 '
 - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
-\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
-\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
-\ print(\"------------------------------\")\n print(\"GPU availability\"\
-)\n print(\"------------------------------\")\n print(f\"cuda available:\
-\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
-\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
-\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
-\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
-\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
-\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
-\ availability test: PASS\")\n\n"
-image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10
+\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
+\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
+\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
+)\n print(\"GPU availability\")\n print(\"------------------------------\"\
+)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
+\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
+\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
+\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
+\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
+\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
+tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
+image: quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852
 exec-verify-gpu-availability-2:
   container:
     args:
@@ -80,18 +80,18 @@ deploymentSpec:
 '
 - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
-\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
-\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
-\ print(\"------------------------------\")\n print(\"GPU availability\"\
-)\n print(\"------------------------------\")\n print(f\"cuda available:\
-\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
-\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
-\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
-\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
-\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
-\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
-\ availability test: PASS\")\n\n"
-image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10
+\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
+\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
+\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
+)\n print(\"GPU availability\")\n print(\"------------------------------\"\
+)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
+\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
+\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
+\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
+\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
+\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
+tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
+image: quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852
 resources:
   accelerator:
     count: '1'
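The compiled spec embeds both the component source (as an escaped string) and the image digest, so each must match the Python file. A quick consistency check one could run, assuming a hypothetical file name, that common_base_image is imported from the pipeline source, and the kfp v2 layout visible above (deploymentSpec.executors.*.container.image):

import yaml

with open("rocm_gpu_availability_pipeline.yaml") as f:  # hypothetical file name
    docs = list(yaml.safe_load_all(f))  # a compiled spec can hold more than one YAML document

images = {
    executor["container"]["image"]
    for doc in docs
    if doc and "deploymentSpec" in doc
    for executor in doc["deploymentSpec"]["executors"].values()
    if "container" in executor
}
# Every executor should be pinned to the same digest as the Python source.
assert images == {common_base_image}, images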
File 3 of 4: CUDA pipeline test definition (Python)
@@ -2,8 +2,9 @@
 from kfp.dsl import PipelineTask
 
 # Runtime: Pytorch with CUDA and Python 3.9 (UBI 9)
+# Images for each release can be found here (in the branch for the release)
 common_base_image = (
-    "quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a"
+    "quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd"
 )
 
 
@@ -14,11 +15,9 @@ def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_li
     kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")
 
 
-@dsl.component(
-    base_image=common_base_image
-)
+@dsl.component(base_image=common_base_image)
 def verify_gpu_availability(gpu_toleration: bool):
-    import torch
+    import torch  # noqa: PLC0415
 
     cuda_available = torch.cuda.is_available()
     device_count = torch.cuda.device_count()
@@ -30,7 +29,7 @@ def verify_gpu_availability(gpu_toleration: bool):
     if gpu_toleration:
         assert torch.cuda.is_available()
         assert torch.cuda.device_count() > 0
-        t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
+        t = torch.tensor([5, 5, 5], dtype=torch.int64, device="cuda")
     else:
         assert not torch.cuda.is_available()
         assert torch.cuda.device_count() == 0
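The signature of add_gpu_toleration is truncated in the hunk header above, and only its kubernetes.add_toleration line is shown. A plausible reconstruction, not necessarily the author's exact code (the accelerator_limit name and the set_accelerator_* calls are assumptions based on the public kfp API):

from kfp import kubernetes
from kfp.dsl import PipelineTask

def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_limit: int):
    # Request the accelerator on the task's container (assumed; standard kfp calls)...
    task.set_accelerator_type(accelerator_type)
    task.set_accelerator_limit(accelerator_limit)
    # ...and tolerate the matching node taint (this line is verbatim from the diff above).
    kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")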
File 4 of 4: compiled KFP spec for the CUDA pipeline (YAML)
@@ -42,18 +42,18 @@ deploymentSpec:
 '
 - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
-\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
-\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
-\ print(\"------------------------------\")\n print(\"GPU availability\"\
-)\n print(\"------------------------------\")\n print(f\"cuda available:\
-\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
-\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
-\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
-\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
-\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
-\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
-\ availability test: PASS\")\n\n"
-image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a
+\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
+\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
+\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
+)\n print(\"GPU availability\")\n print(\"------------------------------\"\
+)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
+\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
+\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
+\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
+\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
+\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
+tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
+image: quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd
 exec-verify-gpu-availability-2:
   container:
     args:
@@ -80,18 +80,18 @@ deploymentSpec:
 '
 - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
-\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
-\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
-\ print(\"------------------------------\")\n print(\"GPU availability\"\
-)\n print(\"------------------------------\")\n print(f\"cuda available:\
-\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
-\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
-\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
-\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
-\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
-\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
-\ availability test: PASS\")\n\n"
-image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a
+\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
+\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
+\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
+)\n print(\"GPU availability\")\n print(\"------------------------------\"\
+)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
+\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
+\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
+\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
+\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
+\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
+tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
+image: quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd
 resources:
   accelerator:
     count: '1'
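The resources.accelerator block with count: '1' at the end of the hunk is how the compiled spec records a GPU request. In the kfp DSL it would come from calls like the following sketch (the resource key for NVIDIA nodes is an assumption):

# Hypothetical DSL calls that produce the accelerator block above:
task = verify_gpu_availability(gpu_toleration=True)
task.set_accelerator_type("nvidia.com/gpu")  # assumed resource key for NVIDIA nodes
task.set_accelerator_limit(1)                # rendered as count: '1' in the spec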
