From bf9fb1167841259477fce8b6a3152b085a7e0332 Mon Sep 17 00:00:00 2001
From: Berto D'Attoma <88311595+bdattoma@users.noreply.github.com>
Date: Wed, 2 Oct 2024 10:23:51 +0200
Subject: [PATCH] Fix check for nvidia-device-plugin-daemonset when deploying
 NVIDIA operator stack (#1871)

* repeat pod checks until timeout instead of waiting 20 min

* remove second timeout var

* pass timeout as param for nvidia-device-plugin-daemonset

* wait for machine status after provisioning

* update wait for machine

* add print before waiting for gpu node

* Update ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh

Co-authored-by: Kobi Hakimi

---------

Co-authored-by: Kobi Hakimi
---
 .../Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh   | 10 ++++++----
 .../tasks/Resources/Provisioning/GPU/provision-gpu.sh |  9 +++++++++
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
index 5edbff0b3..7635bc535 100755
--- a/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
@@ -30,10 +30,9 @@ oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpr
 oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpressions[].operator}'=Exists operator gpu-operator-certified.nvidia-gpu-operator
 
 function wait_until_pod_ready_status() {
-  local timeout_seconds=1200
   local pod_label=$1
   local namespace=nvidia-gpu-operator
-  local timeout=240
+  local timeout=${2:-360}
   start_time=$(date +%s)
   while [ $(($(date +%s) - start_time)) -lt $timeout ]; do
     pod_status="$(oc get pod -l app="$pod_label" -n "$namespace" --no-headers=true 2>/dev/null)"
@@ -42,7 +41,10 @@ function wait_until_pod_ready_status() {
       echo "Waiting until GPU Pods or Daemonset of '$pod_label' in namespace '$namespace' are in running state..."
       echo "Pods status: '$pod_status'"
       echo "Daemonset status: '$daemon_status'"
-      oc wait --timeout="${timeout_seconds}s" --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
+      oc wait --timeout=10s --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
+      if [ $? -ne 0 ]; then
+        continue
+      fi
       oc rollout status --watch --timeout=3m daemonset -n "$namespace" -l app="$pod_label" || continue
       break
     fi
@@ -83,7 +85,7 @@ wait_until_pod_ready_status "gpu-operator"
 oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
 oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
 oc apply -f clusterpolicy.json
-wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
+wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600
 wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
 wait_until_pod_ready_status "nvidia-dcgm-exporter"
 wait_until_pod_ready_status "gpu-feature-discovery"
diff --git a/ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh b/ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
index 80a16a567..65bd689af 100755
--- a/ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
+++ b/ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
@@ -8,6 +8,7 @@ GPU_COUNT=${3:-"1"}
 KUSTOMIZE_PATH="$PWD/tasks/Resources/Provisioning/Hive/GPU"
 MACHINESET_PATH="$KUSTOMIZE_PATH/base/source-machineset.yaml"
 PROVIDER_OVERLAY_DIR=$KUSTOMIZE_PATH/overlays/$PROVIDER
+MACHINE_WAIT_TIMEOUT=10m
 # Check if existing machineset GPU already exists
 EXISTING_GPU_MACHINESET="$(oc get machineset -n openshift-machine-api -o jsonpath="{.items[?(@.metadata.annotations['machine\.openshift\.io/GPU']>'0')].metadata.name}")"
 if [[ -n "$EXISTING_GPU_MACHINESET" ]] ; then
@@ -39,3 +40,11 @@ sed -i'' -e "s/INSTANCE_TYPE/$INSTANCE_TYPE/g" $PROVIDER_OVERLAY_DIR/gpu.yaml
 oc apply --kustomize $PROVIDER_OVERLAY_DIR
 # Add GPU label to the new machine-set
 oc patch machinesets -n openshift-machine-api "$NEW_MACHINESET_NAME" -p '{"metadata":{"labels":{"gpu-machineset":"true"}}}' --type=merge
+# wait for the machine to be Ready
+echo "Waiting for GPU Node to be Ready"
+oc wait --timeout=$MACHINE_WAIT_TIMEOUT --for jsonpath='{.status.readyReplicas}'=1 machineset $NEW_MACHINESET_NAME -n openshift-machine-api
+if [ $? -ne 0 ]; then
+    echo "Machine Set $NEW_MACHINESET_NAME does not have its Machines in Running status after $MACHINE_WAIT_TIMEOUT timeout"
+    echo "Please check the cluster"
+    exit 1
+fi
\ No newline at end of file
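
For reference, a minimal standalone sketch of the poll-until-deadline pattern this patch introduces in wait_until_pod_ready_status, assuming only that oc is on PATH and the nvidia-gpu-operator namespace exists. It deliberately simplifies the real function (the daemonset rollout check is omitted), and the final call mirrors the new 600-second budget given to the device-plugin daemonset.

#!/usr/bin/env bash
# Sketch only, not the repo's code: retry short 10s 'oc wait' calls in a loop
# and give up when the overall deadline expires, instead of blocking once for
# the full duration (previously up to 20 minutes).

wait_until_pod_ready_status() {
  local pod_label=$1
  local namespace=nvidia-gpu-operator
  local timeout=${2:-360}   # overall deadline in seconds; optional 2nd arg overrides
  local start_time
  start_time=$(date +%s)
  while [ $(($(date +%s) - start_time)) -lt "$timeout" ]; do
    # Short wait: fails fast if the pods are not ready yet, then we loop.
    if oc wait --timeout=10s --for=condition=ready pod -n "$namespace" -l app="$pod_label" 2>/dev/null; then
      echo "Pods with app=$pod_label are ready"
      return 0
    fi
    echo "Pods with app=$pod_label not ready yet, retrying..."
  done
  echo "Timed out after ${timeout}s waiting for app=$pod_label" >&2
  return 1
}

# The slow device-plugin daemonset gets the longer 600s budget, as in the patch;
# all other components keep the 360s default.
wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600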