Fix check for nvidia-device-plugin-daemonset when deploying NVIDIA operator stack #1871

Merged (8 commits) on Oct 2, 2024
ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh (10 changes: 6 additions & 4 deletions)
@@ -30,10 +30,9 @@ oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpr
oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpressions[].operator}'=Exists operator gpu-operator-certified.nvidia-gpu-operator

function wait_until_pod_ready_status() {
- local timeout_seconds=1200
local pod_label=$1
local namespace=nvidia-gpu-operator
- local timeout=240
+ local timeout=${2:-360}
start_time=$(date +%s)
while [ $(($(date +%s) - start_time)) -lt $timeout ]; do
pod_status="$(oc get pod -l app="$pod_label" -n "$namespace" --no-headers=true 2>/dev/null)"
@@ -42,7 +41,10 @@ function wait_until_pod_ready_status() {
echo "Waiting until GPU Pods or Daemonset of '$pod_label' in namespace '$namespace' are in running state..."
echo "Pods status: '$pod_status'"
echo "Daemonset status: '$daemon_status'"
oc wait --timeout="${timeout_seconds}s" --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
oc wait --timeout=10s --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
if [ $? -ne 0 ]; then
continue
fi
oc rollout status --watch --timeout=3m daemonset -n "$namespace" -l app="$pod_label" || continue
break
fi
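For readability, here is a minimal, self-contained sketch of the retry pattern these two hunks introduce: one overall per-call deadline (default 360s, overridable via the new second argument) combined with short 10-second `oc wait` probes that `continue` the loop on failure instead of blocking for the whole budget. The middle of the merged function (the `daemon_status` lookup and its surrounding `if`) is collapsed in the diff view, so the helper below is an illustration under that assumption, not a verbatim copy of the merged code.

#!/usr/bin/env bash
# Illustrative sketch only, not the verbatim merged function.
wait_until_ready_sketch() {
  local pod_label=$1
  local namespace=nvidia-gpu-operator
  local timeout=${2:-360}   # overall per-call budget in seconds, 360 unless overridden
  local start_time
  start_time=$(date +%s)
  while [ $(($(date +%s) - start_time)) -lt "$timeout" ]; do
    # Probe for at most 10s, then retry, so one slow rollout does not eat the whole budget.
    oc wait --timeout=10s --for=condition=ready pod -n "$namespace" -l app="$pod_label" || continue
    # Pods are ready; also require the matching daemonset rollout to complete.
    oc rollout status --watch --timeout=3m daemonset -n "$namespace" -l app="$pod_label" || continue
    return 0
  done
  echo "Timed out after ${timeout}s waiting for '$pod_label' in namespace '$namespace'" >&2
  return 1
}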
@@ -83,7 +85,7 @@ wait_until_pod_ready_status "gpu-operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600
wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
wait_until_pod_ready_status "nvidia-dcgm-exporter"
wait_until_pod_ready_status "gpu-feature-discovery"
ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh (9 changes: 9 additions & 0 deletions)
@@ -8,6 +8,7 @@ GPU_COUNT=${3:-"1"}
KUSTOMIZE_PATH="$PWD/tasks/Resources/Provisioning/Hive/GPU"
MACHINESET_PATH="$KUSTOMIZE_PATH/base/source-machineset.yaml"
PROVIDER_OVERLAY_DIR=$KUSTOMIZE_PATH/overlays/$PROVIDER
+ MACHINE_WAIT_TIMEOUT=10m
# Check if existing machineset GPU already exists
EXISTING_GPU_MACHINESET="$(oc get machineset -n openshift-machine-api -o jsonpath="{.items[?(@.metadata.annotations['machine\.openshift\.io/GPU']>'0')].metadata.name}")"
if [[ -n "$EXISTING_GPU_MACHINESET" ]] ; then
@@ -39,3 +40,11 @@ sed -i'' -e "s/INSTANCE_TYPE/$INSTANCE_TYPE/g" $PROVIDER_OVERLAY_DIR/gpu.yaml
oc apply --kustomize $PROVIDER_OVERLAY_DIR
# Add GPU label to the new machine-set
oc patch machinesets -n openshift-machine-api "$NEW_MACHINESET_NAME" -p '{"metadata":{"labels":{"gpu-machineset":"true"}}}' --type=merge
+ # wait for the machine to be Ready
+ echo "Waiting for GPU Node to be Ready"
+ oc wait --timeout=$MACHINE_WAIT_TIMEOUT --for jsonpath='{.status.readyReplicas}'=1 machineset $NEW_MACHINESET_NAME -n openshift-machine-api
+ if [ $? -ne 0 ]; then
+   echo "Machine Set $NEW_MACHINESET_NAME does not have its Machines in Running status after $MACHINE_WAIT_TIMEOUT timeout"
+   echo "Please check the cluster"
+   exit 1
+ fi
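The new readiness check could also be written with the `oc wait` call directly in the `if` condition, which avoids testing `$?` as a separate step; a sketch of that equivalent form, assuming the same $NEW_MACHINESET_NAME and $MACHINE_WAIT_TIMEOUT variables are in scope:

# Equivalent, more compact form of the machineset readiness check (sketch, same behavior).
echo "Waiting for GPU Node to be Ready"
if ! oc wait --timeout="$MACHINE_WAIT_TIMEOUT" \
    --for jsonpath='{.status.readyReplicas}'=1 \
    machineset "$NEW_MACHINESET_NAME" -n openshift-machine-api; then
  echo "Machine Set $NEW_MACHINESET_NAME does not have its Machines in Running status after $MACHINE_WAIT_TIMEOUT timeout"
  echo "Please check the cluster"
  exit 1
fi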