diff --git a/ocs_ci/ocs/resources/pod.py b/ocs_ci/ocs/resources/pod.py
index a2c863542e6..64afd998d8a 100644
--- a/ocs_ci/ocs/resources/pod.py
+++ b/ocs_ci/ocs/resources/pod.py
@@ -3578,12 +3578,13 @@ def restart_pods_in_statuses(
     logger.info("Finish restarting the pods")
 
 
-def wait_for_ceph_cmd_execute_successfully(timeout=300):
+def base_wait_for_ceph_cmd_execute_successfully(timeout=300, sleep=20):
     """
     Wait for a Ceph command to execute successfully
 
     Args:
         timeout (int): The time to wait for a Ceph command to execute successfully
+        sleep (int): Time to sleep between the iterations
 
     Returns:
         bool: True, if the Ceph command executed successfully. False, otherwise
@@ -3591,7 +3592,7 @@ def wait_for_ceph_cmd_execute_successfully(timeout=300):
     """
     try:
         for res in TimeoutSampler(
-            timeout=timeout, sleep=10, func=check_ceph_cmd_execute_successfully
+            timeout=timeout, sleep=sleep, func=check_ceph_cmd_execute_successfully
         ):
             if res:
                 return True
@@ -3955,3 +3956,67 @@ def get_prometheus_pods(
     pods_with_label_match = get_pods_having_label(prometheus_label, namespace)
     prometheus_pod_objs = [Pod(**prometheus) for prometheus in pods_with_label_match]
     return prometheus_pod_objs
+
+
+def get_all_ceph_tool_pods(ceph_tool_label=constants.TOOL_APP_LABEL, namespace=None):
+    """
+    Get all the ceph tool pods in the cluster
+
+    Args:
+        ceph_tool_label (str): Label associated with the ceph tool pods
+            (default: constants.TOOL_APP_LABEL)
+        namespace (str): Namespace in which the ceph cluster lives
+            (default: config.ENV_DATA["cluster_namespace"])
+
+    Returns:
+        list: List of the ceph tool pod objects
+
+    """
+    namespace = namespace or config.ENV_DATA["cluster_namespace"]
+    ceph_tools = get_pods_having_label(ceph_tool_label, namespace)
+    ceph_tool_pods = [Pod(**ceph_tool) for ceph_tool in ceph_tools]
+    return ceph_tool_pods
+
+
+def wait_for_ceph_cmd_execute_successfully(
+    timeout=300, sleep=20, num_of_retries=1, restart_tool_pod_before_retry=True
+):
+    """
+    Wait for a Ceph command to execute successfully within the given timeout and number of retries.
+    For example, if the timeout is 300 and 'num_of_retries' is 2, we will wait up to 600 seconds
+    for the ceph command to execute successfully.
+
+    Args:
+        timeout (int): The time to wait for a Ceph command to execute successfully
+        sleep (int): Time to sleep between the iterations
+        num_of_retries (int): The number of times to retry waiting for the Ceph command to execute successfully
+        restart_tool_pod_before_retry (bool): If True, restart the rook-ceph-tools pods before the
+            next retry. If False, retry without restarting them.
+
+    Returns:
+        bool: True, if the Ceph command executed successfully. False, otherwise
+
+    """
+    logger.info("Wait for the ceph command to execute successfully")
+
+    for num_of_retry in range(num_of_retries):
+        logger.info(f"num of retries = {num_of_retry}")
+        res = base_wait_for_ceph_cmd_execute_successfully(timeout=timeout, sleep=sleep)
+        if res:
+            return True
+        if num_of_retry == num_of_retries - 1:
+            # The last attempt failed and no retries are left, so skip the pod restart
+            continue
+
+        if restart_tool_pod_before_retry:
+            try:
+                logger.info("Trying to restart the rook-ceph-tool pods...")
+                ceph_tool_pods = get_all_ceph_tool_pods()
+                delete_pods(ceph_tool_pods, wait=True)
+            except CommandFailed as ex:
+                logger.warning(ex)
+
+    logger.warning(
+        f"The ceph command failed to execute successfully after {num_of_retries} retries"
+    )
+    return False
diff --git a/tests/functional/z_cluster/nodes/test_nodes_restart.py b/tests/functional/z_cluster/nodes/test_nodes_restart.py
index 5eb6f0b24b3..85164c6716f 100644
--- a/tests/functional/z_cluster/nodes/test_nodes_restart.py
+++ b/tests/functional/z_cluster/nodes/test_nodes_restart.py
@@ -112,6 +112,7 @@ def test_rolling_nodes_restart(
         ocp_nodes = get_node_objs()
         for node in ocp_nodes:
             nodes.restart_nodes(nodes=[node], wait=False)
+            pod.wait_for_ceph_cmd_execute_successfully(timeout=420, num_of_retries=2)
             self.sanity_helpers.health_check(cluster_check=False, tries=60)
 
         retry(CommandFailed, tries=8, delay=40, backoff=1)(
diff --git a/tests/functional/z_cluster/nodes/test_rolling_terminate_and_recovery.py b/tests/functional/z_cluster/nodes/test_rolling_terminate_and_recovery.py
index c3bd13389a1..f3f49c678ee 100644
--- a/tests/functional/z_cluster/nodes/test_rolling_terminate_and_recovery.py
+++ b/tests/functional/z_cluster/nodes/test_rolling_terminate_and_recovery.py
@@ -29,6 +29,7 @@
 )
 from ocs_ci.ocs.resources.pod import (
     check_pods_after_node_replacement,
+    wait_for_ceph_cmd_execute_successfully,
 )
 from ocs_ci.helpers.sanity_helpers import SanityManagedService, Sanity
 from ocs_ci.ocs.cluster import (
@@ -154,6 +155,8 @@
         label_nodes([new_ocs_node])
         log.info(f"The new ocs node is: {new_ocs_node.name}")
 
+        log.info("Wait for the Ceph health command to execute successfully")
+        wait_for_ceph_cmd_execute_successfully(timeout=420, num_of_retries=2)
         log.info("Waiting for all the pods to be running")
         assert check_pods_after_node_replacement(), "Not all the pods are running"
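
For reviewers, a minimal usage sketch of the new retry wrapper, assuming an ocs-ci test session where `ocs_ci` is importable and the cluster config is already loaded (the `RuntimeError` handling is illustrative, not part of this PR):

```python
# Illustrative sketch only -- assumes a live ocs-ci test environment.
from ocs_ci.ocs.resources.pod import wait_for_ceph_cmd_execute_successfully

# With timeout=420 and num_of_retries=2, the helper waits up to
# 2 * 420 = 840 seconds overall. If the first attempt fails, the
# rook-ceph-tools pods are deleted and recreated before the next try
# (restart_tool_pod_before_retry=True by default), so a wedged
# toolbox pod cannot mask an otherwise healthy cluster.
if not wait_for_ceph_cmd_execute_successfully(timeout=420, num_of_retries=2):
    raise RuntimeError("Ceph commands still failing after all retries")
```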