From bf9fb1167841259477fce8b6a3152b085a7e0332 Mon Sep 17 00:00:00 2001
From: Berto D'Attoma <88311595+bdattoma@users.noreply.github.com>
Date: Wed, 2 Oct 2024 10:23:51 +0200
Subject: [PATCH] Fix check for nvidia-device-plugin-daemonset when deploying
 NVIDIA operator stack (#1871)

* repeat pod checks until timeout instead of waiting 20 min

* remove second timeout var

* pass timeout as param for nvidia-device-plugin-daemonset

* wait for machine status after provisioning

* update wait for machine

* add print before waiting for gpu node

* Update ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh

Co-authored-by: Kobi Hakimi

---------

Co-authored-by: Kobi Hakimi
---
 .../Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh   | 10 ++++++----
 .../tasks/Resources/Provisioning/GPU/provision-gpu.sh |  9 +++++++++
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
index 5edbff0b3..7635bc535 100755
--- a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
@@ -30,10 +30,9 @@ oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpr
 oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpressions[].operator}'=Exists operator gpu-operator-certified.nvidia-gpu-operator
 
 function wait_until_pod_ready_status() {
-  local timeout_seconds=1200
   local pod_label=$1
   local namespace=nvidia-gpu-operator
-  local timeout=240
+  local timeout=${2:-360}
   start_time=$(date +%s)
   while [ $(($(date +%s) - start_time)) -lt $timeout ]; do
     pod_status="$(oc get pod -l app="$pod_label" -n "$namespace" --no-headers=true 2>/dev/null)"
@@ -42,7 +41,10 @@ function wait_until_pod_ready_status() {
       echo "Waiting until GPU Pods or Daemonset of '$pod_label' in namespace '$namespace' are in running state..."
       echo "Pods status: '$pod_status'"
       echo "Daemonset status: '$daemon_status'"
-      oc wait --timeout="${timeout_seconds}s" --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
+      oc wait --timeout=10s --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
+      if [ $? -ne 0 ]; then
+        continue
+      fi
       oc rollout status --watch --timeout=3m daemonset -n "$namespace" -l app="$pod_label" || continue
       break
     fi
@@ -83,7 +85,7 @@ wait_until_pod_ready_status "gpu-operator"
 oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
 oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
 oc apply -f clusterpolicy.json
-wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
+wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600
 wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
 wait_until_pod_ready_status "nvidia-dcgm-exporter"
 wait_until_pod_ready_status "gpu-feature-discovery"
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh b/ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
index 80a16a567..65bd689af 100755
--- a/ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
@@ -8,6 +8,7 @@ GPU_COUNT=${3:-"1"}
 KUSTOMIZE_PATH="$PWD/tasks/Resources/Provisioning/Hive/GPU"
 MACHINESET_PATH="$KUSTOMIZE_PATH/base/source-machineset.yaml"
 PROVIDER_OVERLAY_DIR=$KUSTOMIZE_PATH/overlays/$PROVIDER
+MACHINE_WAIT_TIMEOUT=10m
 # Check if existing machineset GPU already exists
 EXISTING_GPU_MACHINESET="$(oc get machineset -n openshift-machine-api -o jsonpath="{.items[?(@.metadata.annotations['machine\.openshift\.io/GPU']>'0')].metadata.name}")"
 if [[ -n "$EXISTING_GPU_MACHINESET" ]] ; then
@@ -39,3 +40,11 @@ sed -i'' -e "s/INSTANCE_TYPE/$INSTANCE_TYPE/g" $PROVIDER_OVERLAY_DIR/gpu.yaml
 oc apply --kustomize $PROVIDER_OVERLAY_DIR
 # Add GPU label to the new machine-set
 oc patch machinesets -n openshift-machine-api "$NEW_MACHINESET_NAME" -p '{"metadata":{"labels":{"gpu-machineset":"true"}}}' --type=merge
+# wait for the machine to be Ready
+echo "Waiting for GPU Node to be Ready"
+oc wait --timeout=$MACHINE_WAIT_TIMEOUT --for jsonpath='{.status.readyReplicas}'=1 machineset $NEW_MACHINESET_NAME -n openshift-machine-api
+if [ $? -ne 0 ]; then
+    echo "Machine Set $NEW_MACHINESET_NAME does not have its Machines in Running status after $MACHINE_WAIT_TIMEOUT timeout"
+    echo "Please check the cluster"
+    exit 1
+fi
\ No newline at end of file
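
For reference, a minimal standalone sketch of the poll-until-deadline pattern this patch introduces in wait_until_pod_ready_status, assuming only that oc is on PATH and the nvidia-gpu-operator namespace exists. It deliberately simplifies the real function (the daemonset rollout check is omitted), and the final call mirrors the new 600-second budget given to the device-plugin daemonset.

#!/usr/bin/env bash
# Sketch only, not the repo's code: retry short 10s 'oc wait' calls in a loop
# and give up when the overall deadline expires, instead of blocking once for
# the full duration (previously up to 20 minutes).

wait_until_pod_ready_status() {
  local pod_label=$1
  local namespace=nvidia-gpu-operator
  local timeout=${2:-360}   # overall deadline in seconds; optional 2nd arg overrides
  local start_time
  start_time=$(date +%s)
  while [ $(($(date +%s) - start_time)) -lt "$timeout" ]; do
    # Short wait: fails fast if the pods are not ready yet, then we loop.
    if oc wait --timeout=10s --for=condition=ready pod -n "$namespace" -l app="$pod_label" 2>/dev/null; then
      echo "Pods with app=$pod_label are ready"
      return 0
    fi
    echo "Pods with app=$pod_label not ready yet, retrying..."
  done
  echo "Timed out after ${timeout}s waiting for app=$pod_label" >&2
  return 1
}

# The slow device-plugin daemonset gets the longer 600s budget, as in the patch;
# all other components keep the 360s default.
wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600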