
Commit

Merge branch 'master' into pytorch_grpc
rpancham authored Dec 10, 2024
2 parents 7e6da6d + f4a85a2 commit bd5e1c5
Showing 213 changed files with 7,506 additions and 3,041 deletions.
38 changes: 10 additions & 28 deletions .github/workflows/code_quality.yaml
@@ -25,11 +25,9 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install robotframework-robocop
run: pipx install robotframework-robocop
- name: Run robocop
run: python -m robocop --verbose --reports sarif . || true
run: robocop --verbose --reports sarif . || true
- name: Upload SARIF file
uses: github/codeql-action/upload-sarif@v2
with:
@@ -39,28 +37,21 @@ jobs:
name: python linters
runs-on: ubuntu-latest
env:
poetry_version: '1.7.1'
python_version: '3.11'
poetry_version: '1.8.3'
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Cache poetry in ~/.local
uses: actions/cache@v4
id: cached-home-local
with:
path: ~/.local
key: "${{ runner.os }}-local-${{ env.poetry_version }}"

- name: Install poetry
if: steps.cached-home-local.outputs.cache-hit != 'true'
run: pip install poetry==${{ env.poetry_version }}
run: pipx install poetry==${{ env.poetry_version }}

- name: Set up Python
id: setup-python
uses: actions/setup-python@v5
with:
python-version: '3.11'
python-version: '${{ env.python_version }}'
cache: 'poetry'

- name: Configure poetry
@@ -86,28 +77,19 @@ jobs:
name: selftests
runs-on: ubuntu-latest
env:
poetry_version: '1.7.1'
python_version: '3.11'
poetry_version: '1.8.3'
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Cache poetry in ~/.local
uses: actions/cache@v4
id: cached-home-local
with:
path: ~/.local
key: "${{ runner.os }}-local-${{ env.poetry_version }}"

- name: Install poetry
if: steps.cached-home-local.outputs.cache-hit != 'true'
run: pip install poetry==${{ env.poetry_version }}
run: pipx install poetry==${{ env.poetry_version }}

- name: Set up Python
id: setup-python
uses: actions/setup-python@v5
with:
python-version: '3.11'
python-version: '${{ env.python_version }}'
cache: 'poetry'

- name: Configure poetry
23 changes: 22 additions & 1 deletion ods_ci/libs/DataSciencePipelinesKfp.py
@@ -282,6 +282,12 @@ def wait_for_run_completion(self, run_id, timeout=160, sleep_duration=5):
response = self.client.wait_for_run_completion(run_id=run_id, timeout=timeout, sleep_duration=sleep_duration)
return response.state

@keyword
def get_run_status(self, run_id):
"""###Gets run status"""
response = self.client.get_run(run_id)
return response.state

@keyword
def check_run_status(self, run_id, timeout=160):
"""Waits for a run to complete"""
@@ -298,6 +304,22 @@ def check_run_status(self, run_id, timeout=160):
count += 1
return run_status # pyright: ignore [reportPossiblyUnboundVariable]

@keyword
def get_last_run_by_pipeline_name(self, pipeline_name: str | None = None, namespace: str | None = None):
"""
Gets the run_id of the last run created for pipeline_name.
:param pipeline_name: name of the pipeline whose runs are inspected
:param namespace: namespace in which to look for runs
:return: run_id of the most recent run, or None if the pipeline has no runs
"""
pipeline_id = self.client.get_pipeline_id(pipeline_name)
pipeline_version_id = self.get_last_pipeline_version(pipeline_id)
all_runs = self.get_all_runs(namespace=namespace, pipeline_version_id=pipeline_version_id)
if len(all_runs) > 0:
return all_runs[-1].run_id
return None

@keyword
def delete_pipeline(self, pipeline_id):
"""Deletes a pipeline"""
@@ -455,7 +477,6 @@ def create_run_from_pipeline_func(
current_path = os.getcwd()
my_source = self.import_souce_code(f"{current_path}/tests/Resources/Files/pipeline-samples/v2/{source_code}")
pipeline_func = getattr(my_source, fn)

# pipeline_params
# there are some special keys to retrieve argument values dynamically
# in pipeline v2, we must match the parameters names
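
The two new keywords above boil down to polling the KFP client for a run's state. A minimal standalone sketch of that pattern, assuming the kfp 2.x SDK (the host URL, helper name, and terminal-state list are illustrative, not taken from the library, which performs the equivalent calls on its authenticated self.client):

import time

import kfp

client = kfp.Client(host="https://my-dsp-route.example.com")  # placeholder host

def wait_for_terminal_state(run_id, timeout=160, sleep_duration=5):
    """Poll get_run() until the run reaches a terminal state or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        state = client.get_run(run_id).state  # same call the get_run_status keyword makes
        if state in ("SUCCEEDED", "FAILED", "CANCELED", "SKIPPED"):  # assumed terminal states
            return state
        time.sleep(sleep_duration)
    raise TimeoutError(f"Run {run_id} did not reach a terminal state within {timeout}s")
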
7 changes: 7 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
@@ -135,6 +135,13 @@ function wait_until_driver_image_is_built() {
}

function create_acceleratorprofile() {
echo "Creating AMD Accelerator Profile"
rhoai_ns=$(oc get namespace redhat-ods-applications --ignore-not-found -oname)
if [ -z "$rhoai_ns" ];
then
echo "redhat-ods-applications namespace not found. Is RHOAI installed? AMD Accelerator Profile creation SKIPPED."
return 0
fi
echo "Creating an Accelerator Profile for Dashboard"
oc apply -f - <<EOF
apiVersion: dashboard.opendatahub.io/v1
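
The guard added above lets the script skip profile creation gracefully when RHOAI is absent instead of failing. The same check expressed in Python, for tooling that wraps these provisioning steps (the wrapper function is hypothetical; the oc invocation mirrors the script):

import subprocess

def rhoai_namespace_present():
    """Return True when the redhat-ods-applications namespace exists (mirrors the --ignore-not-found guard)."""
    out = subprocess.run(
        ["oc", "get", "namespace", "redhat-ods-applications",
         "--ignore-not-found", "-oname"],
        capture_output=True, text=True).stdout.strip()
    return bool(out)

if not rhoai_namespace_present():
    print("redhat-ods-applications namespace not found; skipping accelerator profile creation")
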
18 changes: 13 additions & 5 deletions ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
@@ -30,10 +30,9 @@
oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpressions[].operator}'=Exists operator gpu-operator-certified.nvidia-gpu-operator

function wait_until_pod_ready_status() {
local timeout_seconds=1200
local pod_label=$1
local namespace=nvidia-gpu-operator
local timeout=240
local timeout=${2:-360}
start_time=$(date +%s)
while [ $(($(date +%s) - start_time)) -lt $timeout ]; do
pod_status="$(oc get pod -l app="$pod_label" -n "$namespace" --no-headers=true 2>/dev/null)"
@@ -42,7 +41,10 @@ function wait_until_pod_ready_status() {
echo "Waiting until GPU Pods or Daemonset of '$pod_label' in namespace '$namespace' are in running state..."
echo "Pods status: '$pod_status'"
echo "Daemonset status: '$daemon_status'"
oc wait --timeout="${timeout_seconds}s" --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
oc wait --timeout=10s --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
if [ $? -ne 0 ]; then
continue
fi
oc rollout status --watch --timeout=3m daemonset -n "$namespace" -l app="$pod_label" || continue
break
fi
@@ -57,7 +59,13 @@ function rerun_accelerator_migration() {
# 1. Delete the migration configmap
# 2. Rollout restart dashboard deployment, so the configmap is created again and the migration run again
# Context: https://github.com/opendatahub-io/odh-dashboard/issues/1938

echo "Creating NVIDIA Accelerator Profile via RHOAI Dashboard deployment rollout"
configmap=$(oc get configmap migration-gpu-status --ignore-not-found -n redhat-ods-applications -oname)
if [ -z "$configmap" ];
then
echo "migration-gpu-status not found. Is RHOAI Installed? NVIDIA Accelerator Profile creation SKIPPED."
return 0
fi
echo "Deleting configmap migration-gpu-status"
if ! oc delete configmap migration-gpu-status -n redhat-ods-applications;
then
@@ -83,7 +91,7 @@ wait_until_pod_ready_status "gpu-operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600
wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
wait_until_pod_ready_status "nvidia-dcgm-exporter"
wait_until_pod_ready_status "gpu-feature-discovery"
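
The reworked wait helper above retries short oc wait calls under one overall deadline and only then checks the daemonset rollout. A rough Python sketch of that pattern (the function name and defaults are assumptions; the oc invocations mirror the script):

import subprocess
import time

def wait_until_pod_ready(pod_label, timeout=360, namespace="nvidia-gpu-operator"):
    """Retry short 10s `oc wait` calls until the pods are Ready and the daemonset rollout finishes, or the deadline expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        ready = subprocess.run(
            ["oc", "wait", "--timeout=10s", "--for=condition=ready",
             "pod", "-n", namespace, "-l", f"app={pod_label}"]).returncode == 0
        if not ready:
            continue  # pods not Ready yet; the 10s wait above paces the loop
        rolled_out = subprocess.run(
            ["oc", "rollout", "status", "--watch", "--timeout=3m",
             "daemonset", "-n", namespace, "-l", f"app={pod_label}"]).returncode == 0
        if rolled_out:
            return True
    return False

# e.g. the device plugin gets a longer budget, as in the script:
# wait_until_pod_ready("nvidia-device-plugin-daemonset", timeout=600)
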
9 changes: 9 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
@@ -8,6 +8,7 @@ GPU_COUNT=${3:-"1"}
KUSTOMIZE_PATH="$PWD/tasks/Resources/Provisioning/Hive/GPU"
MACHINESET_PATH="$KUSTOMIZE_PATH/base/source-machineset.yaml"
PROVIDER_OVERLAY_DIR=$KUSTOMIZE_PATH/overlays/$PROVIDER
MACHINE_WAIT_TIMEOUT=10m
# Check if existing machineset GPU already exists
EXISTING_GPU_MACHINESET="$(oc get machineset -n openshift-machine-api -o jsonpath="{.items[?(@.metadata.annotations['machine\.openshift\.io/GPU']>'0')].metadata.name}")"
if [[ -n "$EXISTING_GPU_MACHINESET" ]] ; then
@@ -39,3 +40,11 @@ sed -i'' -e "s/INSTANCE_TYPE/$INSTANCE_TYPE/g" $PROVIDER_OVERLAY_DIR/gpu.yaml
oc apply --kustomize $PROVIDER_OVERLAY_DIR
# Add GPU label to the new machine-set
oc patch machinesets -n openshift-machine-api "$NEW_MACHINESET_NAME" -p '{"metadata":{"labels":{"gpu-machineset":"true"}}}' --type=merge
# wait for the machine to be Ready
echo "Waiting for GPU Node to be Ready"
oc wait --timeout=$MACHINE_WAIT_TIMEOUT --for jsonpath='{.status.readyReplicas}'=1 machineset $NEW_MACHINESET_NAME -n openshift-machine-api
if [ $? -ne 0 ]; then
echo "Machine Set $NEW_MACHINESET_NAME does not have its Machines in Running status after $MACHINE_WAIT_TIMEOUT timeout"
echo "Please check the cluster"
exit 1
fi
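
The new gate above blocks on oc wait --for=jsonpath=... until the MachineSet reports one ready replica. The same call from Python, for scripts that drive provisioning outside bash (the machineset name below is a placeholder):

import subprocess
import sys

result = subprocess.run(
    ["oc", "wait", "--timeout=10m",
     "--for=jsonpath={.status.readyReplicas}=1",
     "machineset", "my-gpu-machineset", "-n", "openshift-machine-api"])
if result.returncode != 0:
    sys.exit("GPU MachineSet did not reach readyReplicas=1 within 10m; check the cluster")
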
@@ -0,0 +1,2 @@
- op: remove
path: /spec/template/spec/providerSpec/value/zone
@@ -1,6 +1,6 @@
- op: replace
path: /spec/template/spec/providerSpec/value/vmSize
value: INSTANCE_TYPE
- op: remove
path: /spec/template/spec/providerSpec/value/zone

- op: replace
path: /spec/replicas
value: 1
@@ -8,3 +8,7 @@ patches:
- path: gpu.yaml
target:
kind: MachineSet
- path: gpu-remove-zone.yaml
target:
kind: MachineSet
name: ".*[^(westus)]" # regions without availability zones. If needed, add | <region name> after westus
72 changes: 0 additions & 72 deletions ods_ci/tasks/Resources/Provisioning/Hive/deprovision.robot

This file was deleted.

8 changes: 2 additions & 6 deletions ods_ci/tasks/Resources/Provisioning/Hive/provision.robot
@@ -1,5 +1,4 @@
*** Settings ***
Resource deprovision.robot
Resource ../../../../tests/Resources/Common.robot
Library Process

@@ -72,9 +71,7 @@ Handle Already Existing Cluster
${result} = Run Process oc -n ${hive_namespace} get cd ${cluster_name} -o json | jq -r '.status.webConsoleURL' --exit-status shell=yes # robocop: disable:line-too-long
END
IF ${result.rc} != 0
Log Cluster '${cluster_name}' has previously failed to be provisioned - Cleaning Hive resources
... console=True
Delete Cluster Configuration
FAIL Cluster '${cluster_name}' has previously failed to be provisioned but some Hive and/or Cloud resources are still present.
ELSE
FAIL Cluster '${cluster_name}' is already in use, please choose a different name.
END
@@ -210,9 +207,8 @@ Wait For Cluster To Be Ready
${provision_status} = Run Process oc -n ${pool_namespace} get cd ${clusterdeployment_name} -o json shell=yes # robocop: disable:line-too-long
${cluster_status} = Run Process oc -n ${hive_namespace} get clusterclaim ${claim_name} -o json shell=yes
Log Cluster '${cluster_name}' deployment had errors, see: ${\n}${provision_status.stdout}${\n}${cluster_status.stdout} level=ERROR # robocop: disable:line-too-long
Log Cluster '${cluster_name}' install completed, but it is not accessible - Cleaning Hive resources now
Log Cluster '${cluster_name}' install completed, but it is not accessible
... console=True
Deprovision Cluster
FAIL Cluster '${cluster_name}' provisioning failed. Please look into the logs for more details.
END
Log Cluster '${cluster_name}' install completed and accessible at: ${web_access.stdout} console=True