Merge branch 'master' into fil_rest
tarukumar authored Dec 4, 2024
2 parents 9134b31 + cc96d97 commit a60778a
Showing 20 changed files with 664 additions and 515 deletions.
ods_ci/tests/Resources/CLI/MustGather/MustGather.resource (4 changes: 2 additions & 2 deletions)
@@ -7,7 +7,7 @@ Resource ../../Common.robot


*** Keywords ***
-Get must-gather Logs
+Get Must-Gather Logs
[Documentation] Runs the must-gather image and obtains the ODH/RHOAI logs
${output}= Run process tests/Resources/CLI/MustGather/get-must-gather-logs.sh shell=yes
Should Be Equal As Integers ${output.rc} 0
@@ -27,6 +27,6 @@ Verify Logs For ${namespace}
${log_files}= Run find ${namespaces_log_dir}/${namespace}/pods -type f -name "*.log"
Should Not Be Equal ${log_files} ${EMPTY}

-Cleanup must-gather Logs
+Cleanup Must-Gather Logs
[Documentation] Deletes the folder with the must-gather logs
Run Keyword If "${must_gather_dir}" != "${EMPTY}" Remove Directory ${must_gather_dir} recursive=True
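
Note: "Get Must-Gather Logs" simply runs the helper script and asserts a zero exit code. A minimal Python sketch of the equivalent check, using only what the resource file shows (the script path and the rc assertion); the wrapper itself is illustrative:

import subprocess

# Equivalent of: Run Process tests/Resources/CLI/MustGather/get-must-gather-logs.sh shell=yes
output = subprocess.run(
    "tests/Resources/CLI/MustGather/get-must-gather-logs.sh",
    shell=True, capture_output=True, text=True,
)
# Equivalent of: Should Be Equal As Integers ${output.rc} 0
assert output.returncode == 0, f"must-gather script failed: {output.stderr}"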
==== next file (path not shown) ====
@@ -10,7 +10,7 @@ spec:
name: caikit
containers:
- name: kserve-container
-image: quay.io/opendatahub/caikit-nlp:stable
+image: quay.io/modh/caikit-nlp@sha256:3c33185fda84d7bac6715c8743c446a6713cdbc0cb0ed831acc0df89bd8bab6b
command: ["python", "-m", "caikit.runtime"]
env:
- name: RUNTIME_LOCAL_MODELS_DIR
==== next file (path not shown) ====
@@ -10,7 +10,7 @@ spec:
name: caikit
containers:
- name: kserve-container
-image: quay.io/opendatahub/caikit-nlp:stable
+image: quay.io/modh/caikit-nlp@sha256:3c33185fda84d7bac6715c8743c446a6713cdbc0cb0ed831acc0df89bd8bab6b
command: ["python", "-m", "caikit.runtime"]
env:
- name: RUNTIME_LOCAL_MODELS_DIR
==== next file (path not shown) ====
@@ -12,7 +12,7 @@ spec:
- '--served-model-name={{.Name}}'
- '--distributed-executor-backend=mp'
- '--chat-template=/app/data/template/template_chatml.jinja'
-image: quay.io/modh/vllm@sha256:c86ff1e89c86bc9821b75d7f2bbc170b3c13e3ccf538bf543b1110f23e056316
+image: ${runtime_image}
name: kserve-container
command:
- python3
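
Swapping the pinned vLLM digest for a ${runtime_image} placeholder lets one template serve whichever runtime image a test run selects. A minimal sketch of how such a placeholder could be rendered before the manifest is applied; string.Template understands the ${...} syntax, but the file names and the substitution mechanism are assumptions here, not necessarily what ods-ci uses (the digest shown is the one this change removes):

from pathlib import Path
from string import Template

template_text = Path("vllm_runtime_template.yaml").read_text()  # hypothetical path
rendered = Template(template_text).safe_substitute(
    runtime_image="quay.io/modh/vllm@sha256:c86ff1e89c86bc9821b75d7f2bbc170b3c13e3ccf538bf543b1110f23e056316"
)
# safe_substitute only touches ${...} tokens, so the Go-style {{.Name}} fields survive.
Path("vllm_runtime.yaml").write_text(rendered)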
==== next file (path not shown) ====
@@ -3,7 +3,7 @@

# Runtime: Pytorch with ROCm and Python 3.9 (UBI 9)
common_base_image = (
"quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10"
"quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852"
)


@@ -14,11 +14,9 @@ def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_li
kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")


-@dsl.component(
-base_image=common_base_image
-)
+@dsl.component(base_image=common_base_image)
def verify_gpu_availability(gpu_toleration: bool):
-import torch
+import torch  # noqa: PLC0415

cuda_available = torch.cuda.is_available()
device_count = torch.cuda.device_count()
@@ -30,7 +28,7 @@ def verify_gpu_availability(gpu_toleration: bool):
if gpu_toleration:
assert torch.cuda.is_available()
assert torch.cuda.device_count() > 0
-t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
+t = torch.tensor([5, 5, 5], dtype=torch.int64, device="cuda")
else:
assert not torch.cuda.is_available()
assert torch.cuda.device_count() == 0
==== next file (path not shown) ====
@@ -42,18 +42,18 @@ deploymentSpec:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
\ print(\"------------------------------\")\n print(\"GPU availability\"\
)\n print(\"------------------------------\")\n print(f\"cuda available:\
\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
)\n print(\"GPU availability\")\n print(\"------------------------------\"\
)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852
exec-verify-gpu-availability-2:
container:
args:
@@ -80,18 +80,18 @@
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
\ print(\"------------------------------\")\n print(\"GPU availability\"\
)\n print(\"------------------------------\")\n print(f\"cuda available:\
\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:a1cfb7bfcff3b2aae2b20b17da83b6683d632403f674a51af6efdfe809a6fc10
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
)\n print(\"GPU availability\")\n print(\"------------------------------\"\
)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:6340efaa92bc54bcede518e890492db626fb9fe96f028c2cd5251f286b2b2852
resources:
accelerator:
count: '1'
==== next file (path not shown) ====
@@ -2,8 +2,9 @@
from kfp.dsl import PipelineTask

# Runtime: Pytorch with CUDA and Python 3.9 (UBI 9)
+# Images for each release can be found here (in the branch for the release)
common_base_image = (
-"quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a"
+"quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd"
)


@@ -14,11 +15,9 @@ def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_li
kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")


-@dsl.component(
-base_image=common_base_image
-)
+@dsl.component(base_image=common_base_image)
def verify_gpu_availability(gpu_toleration: bool):
-import torch
+import torch  # noqa: PLC0415

cuda_available = torch.cuda.is_available()
device_count = torch.cuda.device_count()
@@ -30,7 +29,7 @@ def verify_gpu_availability(gpu_toleration: bool):
if gpu_toleration:
assert torch.cuda.is_available()
assert torch.cuda.device_count() > 0
-t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
+t = torch.tensor([5, 5, 5], dtype=torch.int64, device="cuda")
else:
assert not torch.cuda.is_available()
assert torch.cuda.device_count() == 0
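
For context, a minimal sketch of how this component and the toleration helper might be wired into a pipeline. The pipeline name and accelerator values are illustrative, and the helper's full signature is assumed to be (task, accelerator_type, accelerator_limit), completing the truncated hunk header above:

from kfp import dsl

@dsl.pipeline(name="verify-gpu-availability")  # hypothetical pipeline name
def gpu_check_pipeline():
    # GPU leg: tolerate the accelerator taint so the pod can schedule on a GPU node.
    with_gpu = verify_gpu_availability(gpu_toleration=True)
    add_gpu_toleration(with_gpu, accelerator_type="nvidia.com/gpu", accelerator_limit=1)

    # CPU leg: no toleration, so the component's asserts expect CUDA to be absent.
    verify_gpu_availability(gpu_toleration=False)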
==== next file (path not shown) ====
@@ -42,18 +42,18 @@ deploymentSpec:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
\ print(\"------------------------------\")\n print(\"GPU availability\"\
)\n print(\"------------------------------\")\n print(f\"cuda available:\
\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
)\n print(\"GPU availability\")\n print(\"------------------------------\"\
)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd
exec-verify-gpu-availability-2:
container:
args:
@@ -80,18 +80,18 @@
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\
\n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\
\ print(\"------------------------------\")\n print(\"GPU availability\"\
)\n print(\"------------------------------\")\n print(f\"cuda available:\
\ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \
\ if gpu_toleration:\n assert torch.cuda.is_available()\n \
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\
\ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\
\ availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:7d1b065f100666fe46f64a2e8aae888cb41a38b5482bb9b9343b14db05c2a14a
\ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\
\ # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n \
\ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\
)\n print(\"GPU availability\")\n print(\"------------------------------\"\
)\n print(f\"cuda available: {cuda_available}\")\n print(f\"device\
\ count: {device_count}\")\n if gpu_toleration:\n assert torch.cuda.is_available()\n\
\ assert torch.cuda.device_count() > 0\n t = torch.tensor([5,\
\ 5, 5], dtype=torch.int64, device=\"cuda\")\n else:\n assert\
\ not torch.cuda.is_available()\n assert torch.cuda.device_count()\
\ == 0\n t = torch.tensor([5, 5, 5], dtype=torch.int64)\n print(f\"\
tensor: {t}\")\n print(\"GPU availability test: PASS\")\n\n"
image: quay.io/modh/runtime-images@sha256:e1f7ad986f694236a818796af290a99b4e7f73d44cd39ca45860087644d136dd
resources:
accelerator:
count: '1'
==== next file (path not shown) ====
@@ -10,11 +10,26 @@ ${CODEFLARE-SDK-RELEASE-TAG-3.9} adjustments-release-0.21.1
${CODEFLARE-SDK_DIR} codeflare-sdk
${CODEFLARE-SDK_REPO_URL} %{CODEFLARE-SDK_REPO_URL=https://github.com/project-codeflare/codeflare-sdk.git}
${DISTRIBUTED_WORKLOADS_RELEASE_ASSETS} https://github.com/opendatahub-io/distributed-workloads/releases/latest/download
-${RAY_IMAGE_3.11}    quay.io/modh/ray@sha256:db667df1bc437a7b0965e8031e905d3ab04b86390d764d120e05ea5a5c18d1b4
-${RAY_IMAGE_3.9}    quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
+# Corresponds to quay.io/modh/ray:2.35.0-py311-cu121
+${RAY_CUDA_IMAGE_3.11}    quay.io/modh/ray@sha256:db667df1bc437a7b0965e8031e905d3ab04b86390d764d120e05ea5a5c18d1b4
+# Corresponds to quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26
+${RAY_TORCH_CUDA_IMAGE_3.11}    quay.io/rhoai/ray@sha256:5077f9bb230dfa88f34089fecdfcdaa8abc6964716a8a8325c7f9dcdf11bbbb3
+# Corresponds to quay.io/modh/ray:2.35.0-py311-rocm61
+${RAY_ROCM_IMAGE_3.11}    quay.io/modh/ray@sha256:f8b4f2b1c954187753c1f5254f7bb6a4286cec5a4f1b43def7ef4e009f2d28cb
+# Corresponds to quay.io/modh/ray:2.35.0-py39-cu121
+${RAY_CUDA_IMAGE_3.9}    quay.io/modh/ray@sha256:0d715f92570a2997381b7cafc0e224cfa25323f18b9545acfd23bc2b71576d06
+# Corresponds to quay.io/rhoai/ray:2.35.0-py39-cu121-torch24-fa26
+${RAY_TORCH_CUDA_IMAGE_3.9}    quay.io/rhoai/ray@sha256:158b481b8e9110008d60ac9fb8d156eadd71cb057ac30382e62e3a231ceb39c0
+# Corresponds to quay.io/modh/fms-hf-tuning:v2.1.2
+${FMS_HF_TUNING_IMAGE}    quay.io/modh/fms-hf-tuning@sha256:6f98907f9095db72932caa54094438eae742145f4b66c28d15887d5303ff1186
+# Corresponds to quay.io/modh/training:py311-cuda121-torch241
+${CUDA_TRAINING_IMAGE}    quay.io/modh/training@sha256:b98e373a972ff6f896a9dc054d56920e915675339c02ea7fa123e0f4bbef4d74
+# Corresponds to quay.io/modh/training:py311-rocm61-torch241
+${ROCM_TRAINING_IMAGE}    quay.io/modh/training@sha256:2efb6efba4ec08e63847d701e3062a5f6ddf51c91af5fbcef6378b9e6520a3bb
+# Corresponds to quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111
+${NOTEBOOK_IMAGE_3.11}    quay.io/modh/odh-generic-data-science-notebook@sha256:7c1a4ca213b71d342a2d1366171304e469da06d5f15710fab5dd3ce013aa1b73
+# Corresponds to quay.io/modh/odh-generic-data-science-notebook:v2-2024a-20241108
+${NOTEBOOK_IMAGE_3.9}    quay.io/modh/odh-generic-data-science-notebook@sha256:b1066204611b4bcfa6172c3115650a8e8393089d5606458fa0d8c53633d2ce17
${NOTEBOOK_USER_NAME} ${TEST_USER_3.USERNAME}
${NOTEBOOK_USER_PASSWORD} ${TEST_USER_3.PASSWORD}
${KFTO_CORE_BINARY_NAME} kfto
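
Each digest-pinned variable above keeps a comment naming the human-readable tag it was resolved from. A minimal sketch for spot-checking that a comment still matches its digest, assuming skopeo is installed; the tag/digest pair is the ${RAY_CUDA_IMAGE_3.11} entry from this file:

import subprocess

def resolve_digest(image_ref: str) -> str:
    """Ask the registry for the manifest digest of a tagged image."""
    out = subprocess.run(
        ["skopeo", "inspect", "--format", "{{.Digest}}", f"docker://{image_ref}"],
        capture_output=True, text=True, check=True,
    )
    return out.stdout.strip()

expected = "sha256:db667df1bc437a7b0965e8031e905d3ab04b86390d764d120e05ea5a5c18d1b4"
assert resolve_digest("quay.io/modh/ray:2.35.0-py311-cu121") == expected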
==== next file (path not shown) ====
@@ -16,10 +16,10 @@ Verify that the must-gather image provides RHODS logs and info
... MustGather
... ExcludeOnODH
... ExcludeOnDisconnected
-Get must-gather Logs
-Verify logs for ${APPLICATIONS_NAMESPACE}
+Get Must-Gather Logs
+Verify Logs For ${APPLICATIONS_NAMESPACE}
IF "${PRODUCT}" == "RHODS"
Verify Logs For ${OPERATOR_NAMESPACE}
-Run Keyword If RHODS Is Managed Verify logs for ${MONITORING_NAMESPACE}
+Run Keyword If RHODS Is Managed Verify Logs For ${MONITORING_NAMESPACE}
END
[Teardown] Cleanup must-gather Logs