
Merge branch 'master' into feature/argo_workflow_test
asanzgom authored Oct 2, 2024
2 parents 3cbda0f + bf9fb11 commit 2081966
Showing 2 changed files with 15 additions and 4 deletions.
10 changes: 6 additions & 4 deletions ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
@@ -30,10 +30,9 @@ oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpr
 oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpressions[].operator}'=Exists operator gpu-operator-certified.nvidia-gpu-operator
 
 function wait_until_pod_ready_status() {
-  local timeout_seconds=1200
   local pod_label=$1
   local namespace=nvidia-gpu-operator
-  local timeout=240
+  local timeout=${2:-360}
   start_time=$(date +%s)
   while [ $(($(date +%s) - start_time)) -lt $timeout ]; do
     pod_status="$(oc get pod -l app="$pod_label" -n "$namespace" --no-headers=true 2>/dev/null)"
@@ -42,7 +41,10 @@ function wait_until_pod_ready_status() {
       echo "Waiting until GPU Pods or Daemonset of '$pod_label' in namespace '$namespace' are in running state..."
       echo "Pods status: '$pod_status'"
       echo "Daemonset status: '$daemon_status'"
-      oc wait --timeout="${timeout_seconds}s" --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
+      oc wait --timeout=10s --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
+      if [ $? -ne 0 ]; then
+        continue
+      fi
       oc rollout status --watch --timeout=3m daemonset -n "$namespace" -l app="$pod_label" || continue
       break
     fi
@@ -83,7 +85,7 @@ wait_until_pod_ready_status "gpu-operator"
 oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
 oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
 oc apply -f clusterpolicy.json
-wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
+wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600
 wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
 wait_until_pod_ready_status "nvidia-dcgm-exporter"
 wait_until_pod_ready_status "gpu-feature-discovery"
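With this change, wait_until_pod_ready_status takes an optional second argument that overrides its default polling deadline (${2:-360} expands to the second argument when given, otherwise to 360 seconds). A minimal sketch of the resulting calling convention, assuming the function is sourced from gpu_deploy.sh (illustrative only, not part of the committed script):

# polls for up to the default 360 seconds
wait_until_pod_ready_status "nvidia-dcgm-exporter"

# polls for up to 600 seconds, as the commit now does for the device plugin
wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600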
9 changes: 9 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
@@ -8,6 +8,7 @@ GPU_COUNT=${3:-"1"}
 KUSTOMIZE_PATH="$PWD/tasks/Resources/Provisioning/Hive/GPU"
 MACHINESET_PATH="$KUSTOMIZE_PATH/base/source-machineset.yaml"
 PROVIDER_OVERLAY_DIR=$KUSTOMIZE_PATH/overlays/$PROVIDER
+MACHINE_WAIT_TIMEOUT=10m
 # Check if existing machineset GPU already exists
 EXISTING_GPU_MACHINESET="$(oc get machineset -n openshift-machine-api -o jsonpath="{.items[?(@.metadata.annotations['machine\.openshift\.io/GPU']>'0')].metadata.name}")"
 if [[ -n "$EXISTING_GPU_MACHINESET" ]] ; then
@@ -39,3 +40,11 @@ sed -i'' -e "s/INSTANCE_TYPE/$INSTANCE_TYPE/g" $PROVIDER_OVERLAY_DIR/gpu.yaml
 oc apply --kustomize $PROVIDER_OVERLAY_DIR
 # Add GPU label to the new machine-set
 oc patch machinesets -n openshift-machine-api "$NEW_MACHINESET_NAME" -p '{"metadata":{"labels":{"gpu-machineset":"true"}}}' --type=merge
+# wait for the machine to be Ready
+echo "Waiting for GPU Node to be Ready"
+oc wait --timeout=$MACHINE_WAIT_TIMEOUT --for jsonpath='{.status.readyReplicas}'=1 machineset $NEW_MACHINESET_NAME -n openshift-machine-api
+if [ $? -ne 0 ]; then
+  echo "Machine Set $NEW_MACHINESET_NAME does not have its Machines in Running status after $MACHINE_WAIT_TIMEOUT timeout"
+  echo "Please check the cluster"
+  exit 1
+fi
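For reference, the field the new oc wait polls can also be inspected directly; a hypothetical manual check (the machineset name below is a placeholder, not taken from the commit):

# prints the number of ready replicas reported in the MachineSet status
oc get machineset <new-machineset-name> -n openshift-machine-api -o jsonpath='{.status.readyReplicas}'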
