Skip to content

Commit

Permalink
Adding ceph health check bypass in sat bootsys ncn-power
Browse files Browse the repository at this point in the history
IM:CRAYSAT-1787
Reviewer:Ryan
Add a Ceph health check bypass prompt so the user can decide whether to wait for
the health check or skip it once Ceph has been unfrozen.
The health check may take some time and the next steps may not explicitly require
a healthy Ceph; by the time Ceph is needed, it should be healthy and ready to use.
  • Loading branch information
Shivaprasad Ashok Metimath committed Jul 15, 2024
1 parent 0e03fd5 commit 6539c8e
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 17 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
ncn power stage
- If containers fail to stop, automate the procedure of trying to stop them again
in the `platform-services` stage.
- Added a Ceph health check bypass prompt that takes input from the user and acts accordingly.
Ceph is still unfrozen; only the wait period is skipped if the user chooses to skip it.

### Fixed
- Updated `sat bootsys` to increase the default management NCN shutdown timeout
Expand Down
36 changes: 28 additions & 8 deletions sat/cli/bootsys/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,20 @@ def do_ceph_freeze():
raise FatalPlatformError(str(err))


def prompt_user_for_health_check():
    """Prompt the user to decide if they want to skip the Ceph health check.

    The question is repeated until the user answers 'yes' or 'no'. Answers are
    compared case-insensitively and surrounding whitespace is ignored.

    If standard input is closed or exhausted (``input`` raises ``EOFError``),
    no answer can ever be read, so the health check is not skipped.

    Returns:
        bool: True if the user wants to skip the health check, False otherwise.
    """
    question = "Do you want to skip the Ceph health check after unfreezing? (yes/no): "
    while True:
        try:
            response = input(question).strip().lower()
        except EOFError:
            # Re-prompting cannot succeed once stdin is gone; fall back to the
            # safe choice of running the health check rather than crashing.
            print("No response could be read. The Ceph health check will not be skipped.")
            return False

        if response in ('yes', 'no'):
            return response == 'yes'

        print("Invalid response. Please enter 'yes' or 'no'.")


def do_ceph_unfreeze(ncn_groups):
"""Start inactive Ceph services, unfreeze Ceph and wait for it to be healthy.
Expand All @@ -551,14 +565,20 @@ def do_ceph_unfreeze(ncn_groups):
except RuntimeError as err:
raise FatalPlatformError(str(err))

with BeginEndLogger('wait for ceph health'):
ceph_timeout = get_config_value('bootsys.ceph_timeout')
LOGGER.info(f'Waiting up to {ceph_timeout} seconds for Ceph to become healthy after unfreeze')
ceph_waiter = CephHealthWaiter(ceph_timeout, storage_hosts, retries=1)
if not ceph_waiter.wait_for_completion():
raise FatalPlatformError(f'Ceph is not healthy. Please correct Ceph health and try again.')
else:
LOGGER.info('Ceph is healthy.')
# Prompt the user to decide if they want to skip the Ceph health check
skip_health_check = prompt_user_for_health_check()

if skip_health_check:
LOGGER.info("Skipping Ceph health check as per user's request.")
else:
with BeginEndLogger('wait for ceph health'):
ceph_timeout = get_config_value('bootsys.ceph_timeout')
LOGGER.info(f'Waiting up to {ceph_timeout} seconds for Ceph to become healthy after unfreeze')
ceph_waiter = CephHealthWaiter(ceph_timeout, storage_hosts, retries=1)
if not ceph_waiter.wait_for_completion():
raise FatalPlatformError(f'Ceph is not healthy. Please correct Ceph health and try again.')
else:
LOGGER.info('Ceph is healthy.')


def do_etcd_snapshot(ncn_groups):
Expand Down
36 changes: 27 additions & 9 deletions tests/cli/bootsys/test_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,9 +861,9 @@ class TestDoCephUnfreeze(unittest.TestCase):
"""Tests for the do_ceph_unfreeze function."""

def setUp(self):
"""Set up mocks."""
"""Set up mocks and test data."""
self.ncn_groups = {
'storage': ['ncn-s001', 'ncn-s002']
'storage': ['ncn-s001', 'ncn-s002']
}
self.ceph_services = [
'ceph-osd.target', 'ceph-radosgw.target', 'ceph-mon.target', 'ceph-mgr.target', 'ceph-mds.target'
Expand All @@ -873,25 +873,43 @@ def setUp(self):
self.ceph_waiter_cls = mock.patch('sat.cli.bootsys.platform.CephHealthWaiter').start()
self.ceph_waiter = self.ceph_waiter_cls.return_value

def test_do_ceph_unfreeze_success(self):
"""Test do_ceph_unfreeze in the successful case."""
def tearDown(self):
    """Stop every patch that was started via ``mock.patch(...).start()``."""
    # A single stopall() undoes all active patches at once.
    mock.patch.stopall()

@mock.patch('builtins.input', side_effect=['yes'])  # Simulate user input for skipping health check
def test_do_ceph_unfreeze_skip_health_check(self, mock_input):
    """Test do_ceph_unfreeze when user skips Ceph health check.

    Ceph must still be unfrozen, but the health wait must not happen: the
    timeout is never read and no CephHealthWaiter is created or waited on.
    """
    do_ceph_unfreeze(self.ncn_groups)

    # The user is asked exactly once, and the unfreeze itself is unconditional.
    mock_input.assert_called_once()
    self.toggle_ceph_freeze_flags.assert_called_once_with(freeze=False)

    # Everything belonging to the health wait must have been bypassed.
    self.get_config_value.assert_not_called()
    self.ceph_waiter_cls.assert_not_called()
    self.ceph_waiter.wait_for_completion.assert_not_called()

def test_do_ceph_unfreeze_unhealthy(self):
"""do_ceph_unfreeze should unfreeze Ceph and wait, raising an error if a healthy state is never reached."""
self.ceph_waiter.wait_for_completion.return_value = False
@mock.patch('builtins.input', side_effect=['no'])  # User answers 'no': the health check runs
def test_do_ceph_unfreeze_do_health_check(self, mock_input):
    """An unhealthy Ceph is a fatal error when the health check is not skipped."""
    storage_hosts = self.ncn_groups['storage']
    self.ceph_waiter.wait_for_completion.return_value = False  # Ceph never becomes healthy

    with self.assertRaisesRegex(FatalPlatformError,
                                'Ceph is not healthy. Please correct Ceph health and try again.'):
        do_ceph_unfreeze(self.ncn_groups)

    # Ceph was unfrozen and then waited on exactly once with the configured timeout.
    self.toggle_ceph_freeze_flags.assert_called_once_with(freeze=False)
    self.get_config_value.assert_called_once_with('bootsys.ceph_timeout')
    self.ceph_waiter_cls.assert_called_once_with(self.get_config_value.return_value,
                                                 storage_hosts, retries=1)
    self.ceph_waiter.wait_for_completion.assert_called_once_with()

@mock.patch('builtins.input', side_effect=['no'])  # User answers 'no': the health check runs
def test_do_ceph_unfreeze_success(self, mock_input):
    """do_ceph_unfreeze completes normally when the health check runs and Ceph is healthy."""
    storage_hosts = self.ncn_groups['storage']
    self.ceph_waiter.wait_for_completion.return_value = True  # Ceph reports healthy

    do_ceph_unfreeze(self.ncn_groups)

    # Ceph was unfrozen and then waited on exactly once with the configured timeout.
    self.toggle_ceph_freeze_flags.assert_called_once_with(freeze=False)
    self.get_config_value.assert_called_once_with('bootsys.ceph_timeout')
    self.ceph_waiter_cls.assert_called_once_with(self.get_config_value.return_value,
                                                 storage_hosts, retries=1)
    self.ceph_waiter.wait_for_completion.assert_called_once_with()


Expand Down

0 comments on commit 6539c8e

Please sign in to comment.