Wait for the ceph command to execute successfully in the rolling nodes terminate and rolling nodes restart tests #10915

Open · wants to merge 3 commits into master · showing changes from 2 commits
ocs_ci/ocs/resources/pod.py (66 additions, 2 deletions)

@@ -3578,20 +3578,21 @@ def restart_pods_in_statuses(
     logger.info("Finish restarting the pods")


-def wait_for_ceph_cmd_execute_successfully(timeout=300):
+def base_wait_for_ceph_cmd_execute_successfully(timeout=300, sleep=20):
     """
     Wait for a Ceph command to execute successfully

     Args:
         timeout (int): The time to wait for a Ceph command to execute successfully
+        sleep (int): Time to sleep between the iterations

     Returns:
         bool: True, if the Ceph command executed successfully. False, otherwise

     """
     try:
         for res in TimeoutSampler(
-            timeout=timeout, sleep=10, func=check_ceph_cmd_execute_successfully
+            timeout=timeout, sleep=sleep, func=check_ceph_cmd_execute_successfully
         ):
             if res:
                 return True
@@ -3955,3 +3956,66 @@ def get_prometheus_pods(
     pods_with_label_match = get_pods_having_label(prometheus_label, namespace)
     prometheus_pod_objs = [Pod(**prometheus) for prometheus in pods_with_label_match]
     return prometheus_pod_objs


def get_ceph_tool_pods(ceph_tool_label=constants.TOOL_APP_LABEL, namespace=None):
    """
    Fetches info about the ceph tools pods in the cluster

    Args:
        ceph_tool_label (str): label associated with ceph tool pods
            (default: constants.TOOL_APP_LABEL)
        namespace (str): Namespace in which ceph cluster lives
            (default: config.ENV_DATA["cluster_namespace"])

    Returns:
        list: List of the ceph tool pod objects
    """
    namespace = namespace or config.ENV_DATA["cluster_namespace"]
    ceph_tools = get_pods_having_label(ceph_tool_label, namespace)
    ceph_tool_pods = [Pod(**ceph_tool) for ceph_tool in ceph_tools]
    return ceph_tool_pods

Review thread on get_ceph_tool_pods:

Contributor: (first comment did not load)

Author: Yes, I will change the name prefix. The problem with the function you mentioned is that it performs extra checks and does not simply fetch all the Ceph tools pods.

Contributor: Why not fix or adjust the existing function? Having two functions for a similar purpose isn't ideal.

Author: We can do that. The problem is that we must be more careful, as this function is widely used in our tests.

Contributor: Yes, changing it would require extensive validation. That doesn't mean we should keep a duplicate function that does a similar operation.

Author: Okay, I will make the changes that are needed.
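If the reviewer's suggestion is taken, the existing lookup could grow an opt-in flag rather than the codebase keeping two near-identical helpers. A hypothetical sketch only; the skip_extra_checks flag is an illustration and not part of this PR or of the existing API:

    def get_ceph_tool_pods(
        ceph_tool_label=constants.TOOL_APP_LABEL,
        namespace=None,
        skip_extra_checks=True,  # hypothetical flag, not the actual API
    ):
        """Return all Ceph tools pod objects carrying the given label."""
        namespace = namespace or config.ENV_DATA["cluster_namespace"]
        pods = [Pod(**p) for p in get_pods_having_label(ceph_tool_label, namespace)]
        if skip_extra_checks:
            # Plain lookup: every pod with the tools label, no further filtering
            return pods
        # The extra filtering done by the existing, widely used helper would
        # live here instead of in a separate duplicate function.
        return pods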


def wait_for_ceph_cmd_execute_successfully(
    timeout=300, sleep=20, num_of_retries=1, restart_tool_pod_before_retry=True
):
    """
    Wait for the Ceph command to execute successfully in the given timeout and number of retries.
    For example, if the timeout is 300 and 'num_of_retries' is 2, we will wait 600 seconds
    for the ceph command to execute successfully.

    Args:
        timeout (int): The time to wait for a Ceph command to execute successfully
        sleep (int): Time to sleep between the iterations
        num_of_retries (int): The number of retries to wait for the Ceph command to execute successfully.
        restart_tool_pod_before_retry (bool): If True, restart the rook-ceph-tools pod before the next retry.
            False, otherwise.

    Returns:
        bool: True, if the Ceph command executed successfully. False, otherwise

    """
    logger.info("Wait for the ceph command to execute successfully")

    for num_of_retry in range(num_of_retries):
        logger.info(f"num of retries = {num_of_retry}")
        res = base_wait_for_ceph_cmd_execute_successfully(timeout=timeout, sleep=sleep)
        if res:
            return True
        if num_of_retry < 1:
            # Continue to the next iteration if we didn't reach the first retry
            continue

        if restart_tool_pod_before_retry:
            try:
                logger.info("Trying to restart the rook-ceph-tools pod...")
                ceph_tool_pods = get_ceph_tool_pods()
                delete_pods(ceph_tool_pods, wait=True)
            except CommandFailed as ex:
                logger.warning(ex)

    logger.warning(
        f"The ceph command failed to execute successfully after {num_of_retries} retries"
    )
    return False
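To make the retry arithmetic concrete: the calls added in the tests below wait up to 420 seconds per attempt, for two attempts, so up to 840 seconds in total before giving up. A usage sketch mirroring those calls:

    # Two attempts of up to 420 seconds each (840 seconds total), polling
    # every 20 seconds; returns False if ceph never responds successfully.
    if not wait_for_ceph_cmd_execute_successfully(timeout=420, num_of_retries=2):
        logger.warning("Ceph is still unresponsive after all retries")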
tests/functional/z_cluster/nodes/test_nodes_restart.py (1 addition, 0 deletions)

@@ -112,6 +112,7 @@ def test_rolling_nodes_restart(
         ocp_nodes = get_node_objs()
         for node in ocp_nodes:
             nodes.restart_nodes(nodes=[node], wait=False)
+        pod.wait_for_ceph_cmd_execute_successfully(timeout=420, num_of_retries=2)
         self.sanity_helpers.health_check(cluster_check=False, tries=60)

         retry(CommandFailed, tries=8, delay=40, backoff=1)(
(third changed file; file name not captured)

@@ -29,6 +29,7 @@
 )
 from ocs_ci.ocs.resources.pod import (
     check_pods_after_node_replacement,
+    wait_for_ceph_cmd_execute_successfully,
 )
 from ocs_ci.helpers.sanity_helpers import SanityManagedService, Sanity
 from ocs_ci.ocs.cluster import (
@@ -154,6 +155,8 @@ def rolling_terminate_and_recovery_of_ocs_worker_nodes(self, nodes):
         label_nodes([new_ocs_node])

         log.info(f"The new ocs node is: {new_ocs_node.name}")
+        log.info("Wait for the Ceph health command to execute successfully")
+        wait_for_ceph_cmd_execute_successfully(timeout=420, num_of_retries=2)
         log.info("Waiting for all the pods to be running")
         assert check_pods_after_node_replacement(), "Not all the pods are running"