Skip to content

Commit

Permalink
Adding ceph health check bypass in sat bootsys ncn-power
Browse files Browse the repository at this point in the history
IM:CRAYSAT-1787
Reviewer:Ryan
Add a Ceph health check bypass prompt that lets the user decide whether to wait
for the health check or skip it once Ceph has been unfrozen. Ceph can take some
time to become healthy, and the next steps may not strictly require a healthy
Ceph; by the time it is actually needed, it should have recovered and be usable.
  • Loading branch information
Shivaprasad Ashok Metimath committed Jul 23, 2024
1 parent 0e03fd5 commit bcd11b6
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 11 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
ncn power stage
- If containers fail to stop, automate the procedure of trying to stop them again
in the `platform-services` stage.
- Added a Ceph health check bypass prompt that takes input from the user and acts accordingly.
Ceph is still unfrozen; only the wait for Ceph to become healthy is skipped if the user chooses to.

### Fixed
- Updated `sat bootsys` to increase the default management NCN shutdown timeout
Expand Down
13 changes: 11 additions & 2 deletions sat/cli/bootsys/mgmt_power.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,10 +425,19 @@ def do_power_on_ncns(args):
if ncn_group == included_ncn_groups['storage']:
try:
do_ceph_unfreeze(included_ncn_groups)
LOGGER.info('Ceph unfreeze completed successfully on storage NCNs.')

except FatalPlatformError as err:
LOGGER.error(f'Failed to unfreeze Ceph on storage NCNs: {err}')
sys.exit(1)
LOGGER.info('Ceph unfreeze completed successfully on storage NCNs.')
# Use pester_choices to prompt the user
user_choice = pester_choices("Ceph is not healthy. Do you want to continue anyway?",
['yes', 'no'])
if user_choice == 'no':
LOGGER.info("Exiting as per user's decision.")
sys.exit(1)
else:
LOGGER.info("Continuing despite Ceph not being healthy as per user's input, "
"make sure to verify it later.")

# Mount Ceph and S3FS filesystems on ncn-m001
try:
Expand Down
36 changes: 27 additions & 9 deletions tests/cli/bootsys/test_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,9 +861,9 @@ class TestDoCephUnfreeze(unittest.TestCase):
"""Tests for the do_ceph_unfreeze function."""

def setUp(self):
"""Set up mocks."""
"""Set up mocks and test data."""
self.ncn_groups = {
'storage': ['ncn-s001', 'ncn-s002']
'storage': ['ncn-s001', 'ncn-s002']
}
self.ceph_services = [
'ceph-osd.target', 'ceph-radosgw.target', 'ceph-mon.target', 'ceph-mgr.target', 'ceph-mds.target'
Expand All @@ -873,25 +873,43 @@ def setUp(self):
self.ceph_waiter_cls = mock.patch('sat.cli.bootsys.platform.CephHealthWaiter').start()
self.ceph_waiter = self.ceph_waiter_cls.return_value

def test_do_ceph_unfreeze_success(self):
"""Test do_ceph_unfreeze in the successful case."""
def tearDown(self):
"""Tear down mocks."""
mock.patch.stopall()

@mock.patch('builtins.input', side_effect=['yes']) # Simulate user input for skipping health check
def test_do_ceph_unfreeze_skip_health_check(self, mock_input):
"""Test do_ceph_unfreeze when user skips Ceph health check."""
self.ceph_waiter.wait_for_completion.return_value = True # Assume Ceph is healthy
do_ceph_unfreeze(self.ncn_groups)
self.toggle_ceph_freeze_flags.assert_called_once_with(freeze=False)
self.get_config_value.assert_called_once_with('bootsys.ceph_timeout')
self.ceph_waiter_cls.assert_called_once_with(self.get_config_value.return_value,
self.ncn_groups['storage'], retries=1)
self.ncn_groups['storage'], retries=1)
self.ceph_waiter.wait_for_completion.assert_called_once_with()

def test_do_ceph_unfreeze_unhealthy(self):
"""do_ceph_unfreeze should unfreeze Ceph and wait, raising an error if a healthy state is never reached."""
self.ceph_waiter.wait_for_completion.return_value = False
@mock.patch('builtins.input', side_effect=['no']) # Simulate user input for not skipping health check
def test_do_ceph_unfreeze_do_health_check(self, mock_input):
"""Test do_ceph_unfreeze when Ceph health check is not skipped and is unhealthy."""
self.ceph_waiter.wait_for_completion.return_value = False # Assume Ceph is not healthy
expected_error_regex = 'Ceph is not healthy. Please correct Ceph health and try again.'
with self.assertRaisesRegex(FatalPlatformError, expected_error_regex):
do_ceph_unfreeze(self.ncn_groups)
self.toggle_ceph_freeze_flags.assert_called_once_with(freeze=False)
self.get_config_value.assert_called_once_with('bootsys.ceph_timeout')
self.ceph_waiter_cls.assert_called_once_with(self.get_config_value.return_value,
self.ncn_groups['storage'], retries=1)
self.ncn_groups['storage'], retries=1)
self.ceph_waiter.wait_for_completion.assert_called_once_with()

@mock.patch('builtins.input', side_effect=['no']) # Simulate user input for not skipping health check
def test_do_ceph_unfreeze_success(self, mock_input):
"""Test do_ceph_unfreeze when Ceph health check is not skipped and is healthy."""
self.ceph_waiter.wait_for_completion.return_value = True # Assume Ceph is healthy
do_ceph_unfreeze(self.ncn_groups)
self.toggle_ceph_freeze_flags.assert_called_once_with(freeze=False)
self.get_config_value.assert_called_once_with('bootsys.ceph_timeout')
self.ceph_waiter_cls.assert_called_once_with(self.get_config_value.return_value,
self.ncn_groups['storage'], retries=1)
self.ceph_waiter.wait_for_completion.assert_called_once_with()


Expand Down

0 comments on commit bcd11b6

Please sign in to comment.