Merge branch 'master' into feature/ods-2173
asanzgom committed Jan 11, 2024
2 parents 2c0a14e + 08fc8d0 commit 0810a01
Showing 51 changed files with 777 additions and 644 deletions.
15 changes: 15 additions & 0 deletions .github/workflows/code_quality.yaml
@@ -81,3 +81,18 @@ jobs:
isort $file --check --diff
fi
done
ruff:
name: ruff
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Install ruff
run: |
pip install poetry
poetry install --only dev
- name: Run ruff check
run: poetry run ruff check ods_ci/
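The new job installs ruff through Poetry's dev dependency group. Assuming ruff is declared in that group, the same check CI runs can be reproduced locally with the commands from the workflow:

    pip install poetry
    poetry install --only dev
    poetry run ruff check ods_ci/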
Empty file added ods_ci/__init__.py
2 changes: 1 addition & 1 deletion ods_ci/docs/infrastructure_configuration_variables.md
@@ -9,7 +9,7 @@ The infrastructure configuration variables are used to configure the infrastructure
| `hive_claim_name` | The name of the claim. | `rhods{provider}claim` | `all` |
| `hive_claim_ns` | The namespace of the claim. | `rhods` | `all` |
| `image_set` | The image set name to use for the cluster. | `rhods-openshift` | `all` |
| `aws_domain` | The AWS domain to use for the cluster. | `rhods.ccitredhat.com` | `aws` |
| `base_domain` | The base domain to use for the cluster. | `""` | `aws` |
| `worker_node_instance_type` | The instance type to use for the worker nodes. | `m5.xlarge` | `all` |
| `worker_node_replicas` | The number of worker nodes to create. | `2` | `all` |
| `master_node_instance_type` | The AWS instance type to use for the master nodes. | `m5.xlarge` | `all` |
38 changes: 23 additions & 15 deletions ods_ci/libs/DataSciencePipelinesAPI.py
@@ -25,7 +25,7 @@ def wait_until_openshift_pipelines_operator_is_deployed(self):
while deployment_count != 1 and count < 30:
deployments = []
response, _ = self.run_oc(
f"oc get deployment -n openshift-operators openshift-pipelines-operator -o json"
"oc get deployment -n openshift-operators openshift-pipelines-operator -o json"
)
try:
response = json.loads(response)
@@ -45,12 +45,14 @@ def wait_until_openshift_pipelines_operator_is_deployed(self):
while pipeline_run_crd_count < 1 and count < 60:
# https://github.com/opendatahub-io/odh-dashboard/issues/1673
# It is possible to start the Pipeline Server without pipelineruns.tekton.dev CRD
pipeline_run_crd_count = self.count_pods("oc get crd pipelineruns.tekton.dev", 1)
pipeline_run_crd_count = self.count_pods(
"oc get crd pipelineruns.tekton.dev", 1
)
time.sleep(1)
count += 1
assert pipeline_run_crd_count == 1
return self.count_running_pods(
f"oc get pods -n openshift-operators -l name=openshift-pipelines-operator -o json",
"oc get pods -n openshift-operators -l name=openshift-pipelines-operator -o json",
"openshift-pipelines-operator",
"Running",
1,
@@ -92,7 +94,9 @@ def login_and_wait_dsp_route(
count += 1

assert self.route != "", "Route must not be empty"
print(f"Waiting for Data Science Pipeline route to be ready to avoid firing false alerts: {self.route}")
print(
f"Waiting for Data Science Pipeline route to be ready to avoid firing false alerts: {self.route}"
)
time.sleep(45)
status = -1
count = 0
@@ -195,12 +199,11 @@ def check_run_status(self, run_id, timeout=160):
run_status = run_json["run"]["status"]
except JSONDecodeError:
print(response, status)
pass
print(f"Checking run status: {run_status}")
if run_status == 'Failed':
if run_status == "Failed":
break
# https://github.com/tektoncd/pipeline/blob/main/docs/pipelineruns.md#monitoring-execution-status
if run_status == "Completed" or run_status == "Succeeded":
if run_status in ("Completed", "Succeeded"):
run_finished_ok = True
break
time.sleep(1)
@@ -255,13 +258,13 @@ def do_http_request(self, url):
return response.url

def count_pods(self, oc_command, pod_criteria, timeout=30):
oc_command = f'{oc_command} --no-headers'
oc_command = f"{oc_command} --no-headers"
pod_count = 0
count = 0
while pod_count != pod_criteria and count < timeout:
bash_str, _ = self.run_oc(oc_command)
# | wc -l is returning an empty string
pod_count = sum(1 for line in bash_str.split('\n') if line.strip())
pod_count = sum(1 for line in bash_str.split("\n") if line.strip())
if pod_count >= pod_criteria:
break
time.sleep(1)
@@ -304,17 +307,22 @@ def retrieve_auth_url(self):
def get_default_storage(self):
result, _ = self.run_oc("oc get storageclass -A -o json")
result = json.loads(result)
for storage_class in result['items']:
if 'annotations' in storage_class['metadata']:
if storage_class['metadata']['annotations']['storageclass.kubernetes.io/is-default-class'] == 'true':
for storage_class in result["items"]:
if "annotations" in storage_class["metadata"]:
if (
storage_class["metadata"]["annotations"][
"storageclass.kubernetes.io/is-default-class"
]
== "true"
):
break
return storage_class['metadata']['name']
return storage_class["metadata"]["name"]

def get_openshift_server(self):
return self.run_oc('oc whoami --show-server=true')[0].replace('\n', '')
return self.run_oc("oc whoami --show-server=true")[0].replace("\n", "")

def get_openshift_token(self):
return self.run_oc('oc whoami --show-token=true')[0].replace('\n', '')
return self.run_oc("oc whoami --show-token=true")[0].replace("\n", "")

def run_oc(self, command):
process = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
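For context, run_oc (truncated above) shells out to the oc binary and captures its output. A minimal sketch of the same subprocess pattern, using a hypothetical run_command helper and assuming the command writes UTF-8 text to stdout:

    import subprocess

    def run_command(command: str) -> tuple[str, int]:
        # Split the command string into argv and capture stdout as text.
        process = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
        stdout, _ = process.communicate()
        return stdout.decode("utf-8"), process.returncode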
40 changes: 21 additions & 19 deletions ods_ci/libs/DataSciencePipelinesKfpTekton.py
@@ -3,14 +3,14 @@
import json
import os
import sys

from DataSciencePipelinesAPI import DataSciencePipelinesAPI
from robotlibcore import keyword
from urllib3.exceptions import MaxRetryError, SSLError


class DataSciencePipelinesKfpTekton:

base_image = 'registry.redhat.io/ubi8/python-39@sha256:3523b184212e1f2243e76d8094ab52b01ea3015471471290d011625e1763af61'
base_image = "registry.redhat.io/ubi8/python-39@sha256:3523b184212e1f2243e76d8094ab52b01ea3015471471290d011625e1763af61"

# init should not have a call to external system, otherwise dry-run will fail
def __init__(self):
@@ -30,7 +30,7 @@ def get_client(self, user, pwd, project, route_name):
os.environ["TEKTON_COPY_RESULTS_STEP_IMAGE"] = default_image
os.environ["CONDITION_IMAGE_NAME"] = default_image
# https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes
os.environ["DEFAULT_ACCESSMODES"] = 'ReadWriteOnce'
os.environ["DEFAULT_ACCESSMODES"] = "ReadWriteOnce"
import kfp_tekton

# the following fallback is to simplify the test development
@@ -53,7 +53,7 @@ def get_client(self, user, pwd, project, route_name):
return self.client, self.api

def get_cert(self, api):
cert_json = self.get_secret(api, 'openshift-ingress-operator', 'router-ca')
cert_json = self.get_secret(api, "openshift-ingress-operator", "router-ca")
cert = cert_json["data"]["tls.crt"]
decoded_cert = base64.b64decode(cert).decode("utf-8")

@@ -64,20 +64,18 @@ def get_cert(self, api):
return file_name

def get_secret(self, api, project, name):
secret_json, _ = api.run_oc(
f"oc get secret -n {project} {name} -o json"
)
secret_json, _ = api.run_oc(f"oc get secret -n {project} {name} -o json")
return json.loads(secret_json)

def get_bucket_name(self, api, project):
bucket_name, _ = api.run_oc(
f"oc get dspa -n {project} pipelines-definition -o json"
)
objectStorage = json.loads(bucket_name)['spec']['objectStorage']
if 'minio' in objectStorage:
return objectStorage['minio']['bucket']
objectStorage = json.loads(bucket_name)["spec"]["objectStorage"]
if "minio" in objectStorage:
return objectStorage["minio"]["bucket"]
else:
return objectStorage['externalStorage']['bucket']
return objectStorage["externalStorage"]["bucket"]

def import_souce_code(self, path):
module_name = os.path.basename(path).replace("-", "_")
@@ -94,7 +92,9 @@ def kfp_tekton_create_run_from_pipeline_func(
self, user, pwd, project, route_name, source_code, fn, current_path=None
):
client, api = self.get_client(user, pwd, project, route_name)
mlpipeline_minio_artifact_secret = self.get_secret(api, project, 'mlpipeline-minio-artifact')
mlpipeline_minio_artifact_secret = self.get_secret(
api, project, "mlpipeline-minio-artifact"
)
bucket_name = self.get_bucket_name(api, project)
# the current path is from where you are running the script
# sh ods_ci/run_robot_test.sh
@@ -109,12 +109,15 @@
# create_run_from_pipeline_func will compile the code
# if you need to see the yaml, for debugging purpose, call: TektonCompiler().compile(pipeline, f'{fn}.yaml')
result = client.create_run_from_pipeline_func(
pipeline_func=pipeline, arguments={
'mlpipeline_minio_artifact_secret': mlpipeline_minio_artifact_secret["data"],
'bucket_name': bucket_name,
'openshift_server': self.api.get_openshift_server(),
'openshift_token': self.api.get_openshift_token()
}
pipeline_func=pipeline,
arguments={
"mlpipeline_minio_artifact_secret": mlpipeline_minio_artifact_secret[
"data"
],
"bucket_name": bucket_name,
"openshift_server": self.api.get_openshift_server(),
"openshift_token": self.api.get_openshift_token(),
},
)
# easy to debug and double check failures
print(result)
@@ -128,4 +131,3 @@ def kfp_tekton_wait_for_run_completion(
):
_, api = self.get_client(user, pwd, project, route_name)
return api.check_run_status(run_result.run_id, timeout=timeout)
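get_cert and kfp_tekton_create_run_from_pipeline_func both read Kubernetes secrets via get_secret and base64-decode entries from the secret's "data" map. A hypothetical standalone helper following the same pattern, assuming the JSON comes from oc get secret ... -o json:

    import base64

    def decode_secret_value(secret_json: dict, key: str) -> str:
        # Kubernetes stores secret values base64-encoded under "data".
        return base64.b64decode(secret_json["data"][key]).decode("utf-8")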

13 changes: 7 additions & 6 deletions ods_ci/libs/Helpers.py
@@ -1,9 +1,9 @@
from robot.api import logger
from robot.libraries.BuiltIn import BuiltIn
from robotlibcore import keyword
from semver import VersionInfo

from ods_ci.utils.scripts.ocm.ocm import OpenshiftClusterManager
from robotlibcore import keyword


class Helpers:
@@ -136,7 +136,7 @@ def parse_file_for_tolerations(self, filename):
elif line.startswith("Events:"):
break
else:
if saving == True:
if saving is True:
tolerations.append(line.strip())
print(line)
print(tolerations)
@@ -213,7 +213,7 @@ def _inference_object_comparison(expected, received, threshold):
# if element is model name, don't care about ID
result_ex = model_name.match(expected)
result_rec = model_name.match(received)
if result_ex != None and result_rec != None:
if result_ex is not None and result_rec is not None:
if expected.split("__")[0] != received.split("__")[0]:
failures.append([expected, received])
# else compare values are equal
@@ -245,9 +245,10 @@ def send_random_inference_request(
):
import os
import random
import requests
from pathlib import Path

import requests

for _ in range(no_requests):
data_img = [
random.randrange(value_range[0], value_range[1])
@@ -304,11 +305,11 @@ def process_resource_list(self, filename_in, filename_out=None):
resource_name = line.split()[1]
resource_name = regex.sub(repl="", string=resource_name)
out.append(line.split()[0] + " " * spaces + resource_name + "\n")
if filename_out == None:
if filename_out is None:
filename_out = filename_in.split(".")[0] + "_processed.txt"
with open(filename_out, "w") as outfile:
outfile.write("".join(str(l) for l in out))

@keyword
def escape_forward_slashes(self, string_to_escape):
return string_to_escape.replace('/','\/')
return string_to_escape.replace("/", "\/")
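The import reorder above moves robotlibcore up next to the other third-party packages, per isort grouping. For reference, the semver import the file keeps is used for semantic version comparisons; a minimal illustration with made-up version strings:

    from semver import VersionInfo

    # Semantic comparison, not lexicographic: 1.10.0 is newer than 1.2.3.
    assert VersionInfo.parse("1.10.0") > VersionInfo.parse("1.2.3")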
58 changes: 25 additions & 33 deletions ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh
@@ -1,4 +1,6 @@
#!/bin/bash
set -e

# Make changes to gpu install file

GPU_INSTALL_DIR="$(dirname "$0")"
@@ -11,62 +13,52 @@ sed -i -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME

oc apply -f ${GPU_INSTALL_DIR}/gpu_install.yaml

function wait_until_gpu_pods_are_running() {

function wait_until_pod_ready_status() {
local timeout_seconds=1200
local sleep_time=90

echo "Waiting until gpu pods are in running state..."

SECONDS=0
while [ "$SECONDS" -le "$timeout_seconds" ]; do
pod_status=$(oc get pods -n "nvidia-gpu-operator" | grep gpu-operator | awk 'NR == 1 { print $3 }')
if [ "$pod_status" == "Running" ]; then
break
else
((remaining_seconds = timeout_seconds - SECONDS))
echo "GPU installation seems to be still running (timeout in $remaining_seconds seconds)..."
sleep $sleep_time
fi
done

if [ "$pod_status" == "Running" ]; then
printf "GPU operator is up and running\n"
return 0
else
printf "ERROR: Timeout reached while waiting for gpu operator to be in running state\n"
return 1
fi
local pod_label=$1
local namespace=nvidia-gpu-operator

echo "Waiting until GPU pods of '$pod_label' in namespace '$namespace' are in running state..."
oc wait --timeout=${timeout_seconds}s --for=condition=ready pod -n $namespace -l app="$pod_label"
}

function rerun_accelerator_migration() {
# As we are adding the GPUs after installing the RHODS operator, those GPUs are not discovered automatically.
# In order to rerun the migration we need to
# 1. Delete the migration configmap
# 2. Rollout restart dashboard deployment, so the configmap is created again and the migration run again
# Context: https://github.com/opendatahub-io/odh-dashboard/issues/1938
# As we are adding the GPUs after installing the RHODS operator, those GPUs are not discovered automatically.
# In order to rerun the migration we need to
# 1. Delete the migration configmap
# 2. Rollout restart dashboard deployment, so the configmap is created again and the migration run again
# Context: https://github.com/opendatahub-io/odh-dashboard/issues/1938

echo "Deleting configmap migration-gpu-status"
if ! oc delete configmap migration-gpu-status -n redhat-ods-applications;
then
printf "ERROR: When trying to delete the migration-gpu-status configmap\n"
echo "ERROR: When trying to delete the migration-gpu-status configmap"
return 1
fi

echo "Rollout restart rhods-dashboard deployment"
if ! oc rollout restart deployment.apps/rhods-dashboard -n redhat-ods-applications;
then
printf "ERROR: When trying to rollout restart rhods-dashboard deployment\n"
echo "ERROR: When trying to rollout restart rhods-dashboard deployment"
return 1
fi

echo "Waiting for up to 3 minutes until rhods-dashboard deployment is rolled out"
oc rollout status deployment.apps/rhods-dashboard -n redhat-ods-applications --watch --timeout 3m

echo "Verifying that an AcceleratorProfiles resource was created in redhat-ods-applications"
oc describe AcceleratorProfiles -n redhat-ods-applications
}

wait_until_gpu_pods_are_running
wait_until_pod_ready_status "gpu-operator"
oc apply -f ${GPU_INSTALL_DIR}/nfd_deploy.yaml
oc get csv -n nvidia-gpu-operator $CSVNAME -ojsonpath={.metadata.annotations.alm-examples} | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
wait_until_pod_ready_status "nvidia-dcgm-exporter"
wait_until_pod_ready_status "gpu-feature-discovery"
wait_until_pod_ready_status "nvidia-operator-validator"
rerun_accelerator_migration
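With the polling loop replaced by oc wait, readiness checking is delegated to the cluster. For example, the first call above expands to roughly this command (assuming the script's default 1200-second timeout):

    oc wait --timeout=1200s --for=condition=ready pod -n nvidia-gpu-operator -l app=gpu-operator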

