Wait for the ceph command to execute successfully in the rolling nodes terminate and rolling nodes restart tests #10915

Open · wants to merge 3 commits into master · showing changes from 2 commits
ocs_ci/ocs/resources/pod.py (66 additions, 2 deletions)

@@ -3578,20 +3578,21 @@ def restart_pods_in_statuses(
     logger.info("Finish restarting the pods")


-def wait_for_ceph_cmd_execute_successfully(timeout=300):
+def base_wait_for_ceph_cmd_execute_successfully(timeout=300, sleep=20):
     """
     Wait for a Ceph command to execute successfully

     Args:
         timeout (int): The time to wait for a Ceph command to execute successfully
+        sleep (int): Time to sleep between the iterations

     Returns:
         bool: True, if the Ceph command executed successfully. False, otherwise

     """
     try:
         for res in TimeoutSampler(
-            timeout=timeout, sleep=10, func=check_ceph_cmd_execute_successfully
+            timeout=timeout, sleep=sleep, func=check_ceph_cmd_execute_successfully
         ):
             if res:
                 return True
@@ -3955,3 +3956,66 @@ def get_prometheus_pods(
     pods_with_label_match = get_pods_having_label(prometheus_label, namespace)
     prometheus_pod_objs = [Pod(**prometheus) for prometheus in pods_with_label_match]
     return prometheus_pod_objs


def get_ceph_tool_pods(ceph_tool_label=constants.TOOL_APP_LABEL, namespace=None):
    """
    Fetches info about the ceph tools pods in the cluster

    Args:
        ceph_tool_label (str): label associated with ceph tool pods
            (default: constants.TOOL_APP_LABEL)
        namespace (str): Namespace in which ceph cluster lives
            (default: config.ENV_DATA["cluster_namespace"])

    Returns:
        list: List of the ceph tool pod objects
    """
    namespace = namespace or config.ENV_DATA["cluster_namespace"]
    ceph_tools = get_pods_having_label(ceph_tool_label, namespace)
    ceph_tool_pods = [Pod(**ceph_tool) for ceph_tool in ceph_tools]
    return ceph_tool_pods

Review thread on get_ceph_tool_pods:

Contributor: (first comment did not load)

Author: Yes, I will change the name prefix. The problem with the function you mentioned is that it performs extra checks and does not simply fetch all the Ceph tools pods.

Contributor: Why not fix or adjust the existing function? Having two functions for a similar purpose isn't ideal.

Author: We can do that. The problem is that we must be more careful, as this function is widely used in our tests.

Contributor: Yes, changing it would require extensive validation. That doesn't mean we should keep a duplicate function that does a similar operation.

Author: Okay, I will make the changes that are needed.
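If the reviewer's suggestion is taken, the existing lookup could grow an opt-in flag rather than the codebase keeping two near-identical helpers. A hypothetical sketch only; the skip_extra_checks flag is an illustration and not part of this PR or of the existing API:

    def get_ceph_tool_pods(
        ceph_tool_label=constants.TOOL_APP_LABEL,
        namespace=None,
        skip_extra_checks=True,  # hypothetical flag, not the actual API
    ):
        """Return all Ceph tools pod objects carrying the given label."""
        namespace = namespace or config.ENV_DATA["cluster_namespace"]
        pods = [Pod(**p) for p in get_pods_having_label(ceph_tool_label, namespace)]
        if skip_extra_checks:
            # Plain lookup: every pod with the tools label, no further filtering
            return pods
        # The extra filtering done by the existing, widely used helper would
        # live here instead of in a separate duplicate function.
        return pods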


def wait_for_ceph_cmd_execute_successfully(
    timeout=300, sleep=20, num_of_retries=1, restart_tool_pod_before_retry=True
):
    """
    Wait for the Ceph command to execute successfully in the given timeout and number of retries.
    For example, if the timeout is 300 and 'num_of_retries' is 2, we will wait 600 seconds
    for the ceph command to execute successfully.

    Args:
        timeout (int): The time to wait for a Ceph command to execute successfully
        sleep (int): Time to sleep between the iterations
        num_of_retries (int): The number of retries to wait for the Ceph command to execute successfully.
        restart_tool_pod_before_retry (bool): If True, restart the rook-ceph-tools pod before the next retry.
            False, otherwise.

    Returns:
        bool: True, if the Ceph command executed successfully. False, otherwise

    """
    logger.info("Wait for the ceph command to execute successfully")

    for num_of_retry in range(num_of_retries):
        logger.info(f"num of retries = {num_of_retry}")
        res = base_wait_for_ceph_cmd_execute_successfully(timeout=timeout, sleep=sleep)
        if res:
            return True
        if num_of_retry < 1:
            # Continue to the next iteration if we didn't reach the first retry
            continue

        if restart_tool_pod_before_retry:
            try:
                logger.info("Trying to restart the rook-ceph-tools pod...")
                ceph_tool_pods = get_ceph_tool_pods()
                delete_pods(ceph_tool_pods, wait=True)
            except CommandFailed as ex:
                logger.warning(ex)

    logger.warning(
        f"The ceph command failed to execute successfully after {num_of_retries} retries"
    )
    return False
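To make the retry arithmetic concrete: the calls added in the tests below wait up to 420 seconds per attempt, for two attempts, so up to 840 seconds in total before giving up. A usage sketch mirroring those calls:

    # Two attempts of up to 420 seconds each (840 seconds total), polling
    # every 20 seconds; returns False if ceph never responds successfully.
    if not wait_for_ceph_cmd_execute_successfully(timeout=420, num_of_retries=2):
        logger.warning("Ceph is still unresponsive after all retries")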
tests/functional/z_cluster/nodes/test_nodes_restart.py (1 addition, 0 deletions)

@@ -112,6 +112,7 @@ def test_rolling_nodes_restart(
         ocp_nodes = get_node_objs()
         for node in ocp_nodes:
             nodes.restart_nodes(nodes=[node], wait=False)
+        pod.wait_for_ceph_cmd_execute_successfully(timeout=420, num_of_retries=2)
         self.sanity_helpers.health_check(cluster_check=False, tries=60)

         retry(CommandFailed, tries=8, delay=40, backoff=1)(
(third changed file; file name not captured)

@@ -29,6 +29,7 @@
 )
 from ocs_ci.ocs.resources.pod import (
     check_pods_after_node_replacement,
+    wait_for_ceph_cmd_execute_successfully,
 )
 from ocs_ci.helpers.sanity_helpers import SanityManagedService, Sanity
 from ocs_ci.ocs.cluster import (
@@ -154,6 +155,8 @@ def rolling_terminate_and_recovery_of_ocs_worker_nodes(self, nodes):
         label_nodes([new_ocs_node])

         log.info(f"The new ocs node is: {new_ocs_node.name}")
+        log.info("Wait for the Ceph health command to execute successfully")
+        wait_for_ceph_cmd_execute_successfully(timeout=420, num_of_retries=2)
         log.info("Waiting for all the pods to be running")
         assert check_pods_after_node_replacement(), "Not all the pods are running"