Merge branch 'master' into feature/ods-2173
asanzgom committed Jan 11, 2024
2 parents 2c0a14e + 08fc8d0 commit 0810a01
Showing 51 changed files with 777 additions and 644 deletions.
15 changes: 15 additions & 0 deletions .github/workflows/code_quality.yaml
@@ -81,3 +81,18 @@ jobs:
isort $file --check --diff
fi
done
ruff:
name: ruff
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Install ruff
run: |
pip install poetry
poetry install --only dev
- name: Run ruff check
run: poetry run ruff check ods_ci/
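The new job installs ruff through Poetry's dev dependency group. Assuming ruff is declared in that group, the same check CI runs can be reproduced locally with the commands from the workflow:

    pip install poetry
    poetry install --only dev
    poetry run ruff check ods_ci/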
Empty file added ods_ci/__init__.py
2 changes: 1 addition & 1 deletion ods_ci/docs/infrastructure_configuration_variables.md
@@ -9,7 +9,7 @@ The infrastructure configuration variables are used to configure the infrastructure
| `hive_claim_name` | The name of the claim. | `rhods{provider}claim` | `all` |
| `hive_claim_ns` | The namespace of the claim. | `rhods` | `all` |
| `image_set` | The image set name to use for the cluster. | `rhods-openshift` | `all` |
| `aws_domain` | The AWS domain to use for the cluster. | `rhods.ccitredhat.com` | `aws` |
| `base_domain` | The base domain to use for the cluster. | `""` | `aws` |
| `worker_node_instance_type` | The instance type to use for the worker nodes. | `m5.xlarge` | `all` |
| `worker_node_replicas` | The number of worker nodes to create. | `2` | `all` |
| `master_node_instance_type` | The AWS instance type to use for the master nodes. | `m5.xlarge` | `all` |
38 changes: 23 additions & 15 deletions ods_ci/libs/DataSciencePipelinesAPI.py
@@ -25,7 +25,7 @@ def wait_until_openshift_pipelines_operator_is_deployed(self):
while deployment_count != 1 and count < 30:
deployments = []
response, _ = self.run_oc(
f"oc get deployment -n openshift-operators openshift-pipelines-operator -o json"
"oc get deployment -n openshift-operators openshift-pipelines-operator -o json"
)
try:
response = json.loads(response)
@@ -45,12 +45,14 @@ def wait_until_openshift_pipelines_operator_is_deployed(self):
while pipeline_run_crd_count < 1 and count < 60:
# https://github.com/opendatahub-io/odh-dashboard/issues/1673
# It is possible to start the Pipeline Server without pipelineruns.tekton.dev CRD
pipeline_run_crd_count = self.count_pods("oc get crd pipelineruns.tekton.dev", 1)
pipeline_run_crd_count = self.count_pods(
"oc get crd pipelineruns.tekton.dev", 1
)
time.sleep(1)
count += 1
assert pipeline_run_crd_count == 1
return self.count_running_pods(
f"oc get pods -n openshift-operators -l name=openshift-pipelines-operator -o json",
"oc get pods -n openshift-operators -l name=openshift-pipelines-operator -o json",
"openshift-pipelines-operator",
"Running",
1,
@@ -92,7 +94,9 @@ def login_and_wait_dsp_route(
count += 1

assert self.route != "", "Route must not be empty"
print(f"Waiting for Data Science Pipeline route to be ready to avoid firing false alerts: {self.route}")
print(
f"Waiting for Data Science Pipeline route to be ready to avoid firing false alerts: {self.route}"
)
time.sleep(45)
status = -1
count = 0
@@ -195,12 +199,11 @@ def check_run_status(self, run_id, timeout=160):
run_status = run_json["run"]["status"]
except JSONDecodeError:
print(response, status)
pass
print(f"Checking run status: {run_status}")
if run_status == 'Failed':
if run_status == "Failed":
break
# https://github.com/tektoncd/pipeline/blob/main/docs/pipelineruns.md#monitoring-execution-status
if run_status == "Completed" or run_status == "Succeeded":
if run_status in ("Completed", "Succeeded"):
run_finished_ok = True
break
time.sleep(1)
@@ -255,13 +258,13 @@ def do_http_request(self, url):
return response.url

def count_pods(self, oc_command, pod_criteria, timeout=30):
oc_command = f'{oc_command} --no-headers'
oc_command = f"{oc_command} --no-headers"
pod_count = 0
count = 0
while pod_count != pod_criteria and count < timeout:
bash_str, _ = self.run_oc(oc_command)
# | wc -l is returning an empty string
pod_count = sum(1 for line in bash_str.split('\n') if line.strip())
pod_count = sum(1 for line in bash_str.split("\n") if line.strip())
if pod_count >= pod_criteria:
break
time.sleep(1)
@@ -304,17 +307,22 @@ def retrieve_auth_url(self):
def get_default_storage(self):
result, _ = self.run_oc("oc get storageclass -A -o json")
result = json.loads(result)
for storage_class in result['items']:
if 'annotations' in storage_class['metadata']:
if storage_class['metadata']['annotations']['storageclass.kubernetes.io/is-default-class'] == 'true':
for storage_class in result["items"]:
if "annotations" in storage_class["metadata"]:
if (
storage_class["metadata"]["annotations"][
"storageclass.kubernetes.io/is-default-class"
]
== "true"
):
break
return storage_class['metadata']['name']
return storage_class["metadata"]["name"]

def get_openshift_server(self):
return self.run_oc('oc whoami --show-server=true')[0].replace('\n', '')
return self.run_oc("oc whoami --show-server=true")[0].replace("\n", "")

def get_openshift_token(self):
return self.run_oc('oc whoami --show-token=true')[0].replace('\n', '')
return self.run_oc("oc whoami --show-token=true")[0].replace("\n", "")

def run_oc(self, command):
process = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
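For context, run_oc (truncated above) shells out to the oc binary and captures its output. A minimal sketch of the same subprocess pattern, using a hypothetical run_command helper and assuming the command writes UTF-8 text to stdout:

    import subprocess

    def run_command(command: str) -> tuple[str, int]:
        # Split the command string into argv and capture stdout as text.
        process = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
        stdout, _ = process.communicate()
        return stdout.decode("utf-8"), process.returncode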
40 changes: 21 additions & 19 deletions ods_ci/libs/DataSciencePipelinesKfpTekton.py
@@ -3,14 +3,14 @@
import json
import os
import sys

from DataSciencePipelinesAPI import DataSciencePipelinesAPI
from robotlibcore import keyword
from urllib3.exceptions import MaxRetryError, SSLError


class DataSciencePipelinesKfpTekton:

base_image = 'registry.redhat.io/ubi8/python-39@sha256:3523b184212e1f2243e76d8094ab52b01ea3015471471290d011625e1763af61'
base_image = "registry.redhat.io/ubi8/python-39@sha256:3523b184212e1f2243e76d8094ab52b01ea3015471471290d011625e1763af61"

# init should not have a call to external system, otherwise dry-run will fail
def __init__(self):
@@ -30,7 +30,7 @@ def get_client(self, user, pwd, project, route_name):
os.environ["TEKTON_COPY_RESULTS_STEP_IMAGE"] = default_image
os.environ["CONDITION_IMAGE_NAME"] = default_image
# https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes
os.environ["DEFAULT_ACCESSMODES"] = 'ReadWriteOnce'
os.environ["DEFAULT_ACCESSMODES"] = "ReadWriteOnce"
import kfp_tekton

# the following fallback is to simplify the test development
@@ -53,7 +53,7 @@ def get_client(self, user, pwd, project, route_name):
return self.client, self.api

def get_cert(self, api):
cert_json = self.get_secret(api, 'openshift-ingress-operator', 'router-ca')
cert_json = self.get_secret(api, "openshift-ingress-operator", "router-ca")
cert = cert_json["data"]["tls.crt"]
decoded_cert = base64.b64decode(cert).decode("utf-8")

@@ -64,20 +64,18 @@ def get_cert(self, api):
return file_name

def get_secret(self, api, project, name):
secret_json, _ = api.run_oc(
f"oc get secret -n {project} {name} -o json"
)
secret_json, _ = api.run_oc(f"oc get secret -n {project} {name} -o json")
return json.loads(secret_json)

def get_bucket_name(self, api, project):
bucket_name, _ = api.run_oc(
f"oc get dspa -n {project} pipelines-definition -o json"
)
objectStorage = json.loads(bucket_name)['spec']['objectStorage']
if 'minio' in objectStorage:
return objectStorage['minio']['bucket']
objectStorage = json.loads(bucket_name)["spec"]["objectStorage"]
if "minio" in objectStorage:
return objectStorage["minio"]["bucket"]
else:
return objectStorage['externalStorage']['bucket']
return objectStorage["externalStorage"]["bucket"]

def import_souce_code(self, path):
module_name = os.path.basename(path).replace("-", "_")
@@ -94,7 +92,9 @@ def kfp_tekton_create_run_from_pipeline_func(
self, user, pwd, project, route_name, source_code, fn, current_path=None
):
client, api = self.get_client(user, pwd, project, route_name)
mlpipeline_minio_artifact_secret = self.get_secret(api, project, 'mlpipeline-minio-artifact')
mlpipeline_minio_artifact_secret = self.get_secret(
api, project, "mlpipeline-minio-artifact"
)
bucket_name = self.get_bucket_name(api, project)
# the current path is from where you are running the script
# sh ods_ci/run_robot_test.sh
@@ -109,12 +109,15 @@
# create_run_from_pipeline_func will compile the code
# if you need to see the yaml, for debugging purpose, call: TektonCompiler().compile(pipeline, f'{fn}.yaml')
result = client.create_run_from_pipeline_func(
pipeline_func=pipeline, arguments={
'mlpipeline_minio_artifact_secret': mlpipeline_minio_artifact_secret["data"],
'bucket_name': bucket_name,
'openshift_server': self.api.get_openshift_server(),
'openshift_token': self.api.get_openshift_token()
}
pipeline_func=pipeline,
arguments={
"mlpipeline_minio_artifact_secret": mlpipeline_minio_artifact_secret[
"data"
],
"bucket_name": bucket_name,
"openshift_server": self.api.get_openshift_server(),
"openshift_token": self.api.get_openshift_token(),
},
)
# easy to debug and double check failures
print(result)
@@ -128,4 +131,3 @@ def kfp_tekton_wait_for_run_completion(
):
_, api = self.get_client(user, pwd, project, route_name)
return api.check_run_status(run_result.run_id, timeout=timeout)
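get_cert and kfp_tekton_create_run_from_pipeline_func both read Kubernetes secrets via get_secret and base64-decode entries from the secret's "data" map. A hypothetical standalone helper following the same pattern, assuming the JSON comes from oc get secret ... -o json:

    import base64

    def decode_secret_value(secret_json: dict, key: str) -> str:
        # Kubernetes stores secret values base64-encoded under "data".
        return base64.b64decode(secret_json["data"][key]).decode("utf-8")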

13 changes: 7 additions & 6 deletions ods_ci/libs/Helpers.py
@@ -1,9 +1,9 @@
from robot.api import logger
from robot.libraries.BuiltIn import BuiltIn
from robotlibcore import keyword
from semver import VersionInfo

from ods_ci.utils.scripts.ocm.ocm import OpenshiftClusterManager
from robotlibcore import keyword


class Helpers:
@@ -136,7 +136,7 @@ def parse_file_for_tolerations(self, filename):
elif line.startswith("Events:"):
break
else:
if saving == True:
if saving is True:
tolerations.append(line.strip())
print(line)
print(tolerations)
@@ -213,7 +213,7 @@ def _inference_object_comparison(expected, received, threshold):
# if element is model name, don't care about ID
result_ex = model_name.match(expected)
result_rec = model_name.match(received)
if result_ex != None and result_rec != None:
if result_ex is not None and result_rec is not None:
if expected.split("__")[0] != received.split("__")[0]:
failures.append([expected, received])
# else compare values are equal
@@ -245,9 +245,10 @@ def send_random_inference_request(
):
import os
import random
import requests
from pathlib import Path

import requests

for _ in range(no_requests):
data_img = [
random.randrange(value_range[0], value_range[1])
@@ -304,11 +305,11 @@ def process_resource_list(self, filename_in, filename_out=None):
resource_name = line.split()[1]
resource_name = regex.sub(repl="", string=resource_name)
out.append(line.split()[0] + " " * spaces + resource_name + "\n")
if filename_out == None:
if filename_out is None:
filename_out = filename_in.split(".")[0] + "_processed.txt"
with open(filename_out, "w") as outfile:
outfile.write("".join(str(l) for l in out))

@keyword
def escape_forward_slashes(self, string_to_escape):
return string_to_escape.replace('/','\/')
return string_to_escape.replace("/", "\/")
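The import reorder above moves robotlibcore up next to the other third-party packages, per isort grouping. For reference, the semver import the file keeps is used for semantic version comparisons; a minimal illustration with made-up version strings:

    from semver import VersionInfo

    # Semantic comparison, not lexicographic: 1.10.0 is newer than 1.2.3.
    assert VersionInfo.parse("1.10.0") > VersionInfo.parse("1.2.3")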
58 changes: 25 additions & 33 deletions ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh
@@ -1,4 +1,6 @@
#!/bin/bash
set -e

# Make changes to gpu install file

GPU_INSTALL_DIR="$(dirname "$0")"
@@ -11,62 +13,52 @@ sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME

oc apply -f ${GPU_INSTALL_DIR}/gpu_install.yaml

function wait_until_gpu_pods_are_running() {

function wait_until_pod_ready_status() {
local timeout_seconds=1200
local sleep_time=90

echo "Waiting until gpu pods are in running state..."

SECONDS=0
while [ "$SECONDS" -le "$timeout_seconds" ]; do
pod_status=$(oc get pods -n "nvidia-gpu-operator" | grep gpu-operator | awk 'NR == 1 { print $3 }')
if [ "$pod_status" == "Running" ]; then
break
else
((remaining_seconds = timeout_seconds - SECONDS))
echo "GPU installation seems to be still running (timeout in $remaining_seconds seconds)..."
sleep $sleep_time
fi
done

if [ "$pod_status" == "Running" ]; then
printf "GPU operator is up and running\n"
return 0
else
printf "ERROR: Timeout reached while waiting for gpu operator to be in running state\n"
return 1
fi
local pod_label=$1
local namespace=nvidia-gpu-operator

echo "Waiting until GPU pods of '$pod_label' in namespace '$namespace' are in running state..."
oc wait --timeout=${timeout_seconds}s --for=condition=ready pod -n $namespace -l app="$pod_label"
}

function rerun_accelerator_migration() {
# As we are adding the GPUs after installing the RHODS operator, those GPUs are not discovered automatically.
# In order to rerun the migration we need to
# 1. Delete the migration configmap
# 2. Rollout restart dashboard deployment, so the configmap is created again and the migration run again
# Context: https://github.com/opendatahub-io/odh-dashboard/issues/1938
# As we are adding the GPUs after installing the RHODS operator, those GPUs are not discovered automatically.
# In order to rerun the migration we need to
# 1. Delete the migration configmap
# 2. Rollout restart dashboard deployment, so the configmap is created again and the migration run again
# Context: https://github.com/opendatahub-io/odh-dashboard/issues/1938

echo "Deleting configmap migration-gpu-status"
if ! oc delete configmap migration-gpu-status -n redhat-ods-applications;
then
printf "ERROR: When trying to delete the migration-gpu-status configmap\n"
echo "ERROR: When trying to delete the migration-gpu-status configmap"
return 1
fi

echo "Rollout restart rhods-dashboard deployment"
if ! oc rollout restart deployment.apps/rhods-dashboard -n redhat-ods-applications;
then
printf "ERROR: When trying to rollout restart rhods-dashboard deployment\n"
echo "ERROR: When trying to rollout restart rhods-dashboard deployment"
return 1
fi

echo "Waiting for up to 3 minutes until rhods-dashboard deployment is rolled out"
oc rollout status deployment.apps/rhods-dashboard -n redhat-ods-applications --watch --timeout 3m

echo "Verifying that an AcceleratorProfiles resource was created in redhat-ods-applications"
oc describe AcceleratorProfiles -n redhat-ods-applications
}

wait_until_gpu_pods_are_running
wait_until_pod_ready_status "gpu-operator"
oc apply -f ${GPU_INSTALL_DIR}/nfd_deploy.yaml
oc get csv -n nvidia-gpu-operator $CSVNAME -ojsonpath={.metadata.annotations.alm-examples} | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
wait_until_pod_ready_status "nvidia-dcgm-exporter"
wait_until_pod_ready_status "gpu-feature-discovery"
wait_until_pod_ready_status "nvidia-operator-validator"
rerun_accelerator_migration
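With the polling loop replaced by oc wait, readiness checking is delegated to the cluster. For example, the first call above expands to roughly this command (assuming the script's default 1200-second timeout):

    oc wait --timeout=1200s --for=condition=ready pod -n nvidia-gpu-operator -l app=gpu-operator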

