From 53b4981d29122633a452a817c4cc4f95090683db Mon Sep 17 00:00:00 2001 From: Shivaprasad Ashok Metimath Date: Mon, 15 Jul 2024 10:21:24 +0000 Subject: [PATCH] Adding ceph health check bypass in sat bootsys ncn-power IM:CRAYSAT-1787 Reviewer:Ryan Add a Ceph health check bypass prompt that lets the user decide whether to wait for the health check or skip it after Ceph has been unfrozen. Ceph may take some time to become healthy and the next steps may not explicitly require it, so it would likely be ready for use by the time it is needed. --- CHANGELOG.md | 2 ++ sat/cli/bootsys/mgmt_power.py | 13 +++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b543d597..fd1ec282 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,6 +52,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ignorable. - Automate the procedure of setting next boot device to disk before the management nodes are powered off as part of the full-system shutdown. +- Add a Ceph health check bypass prompt that takes input from the user and acts accordingly. + Ceph is still unfrozen; only the wait period is skipped if the user chooses to skip it. ### Fixed - Updated `sat bootsys` to increase the default management NCN shutdown timeout diff --git a/sat/cli/bootsys/mgmt_power.py b/sat/cli/bootsys/mgmt_power.py index c8810e53..9d1fc02c 100644 --- a/sat/cli/bootsys/mgmt_power.py +++ b/sat/cli/bootsys/mgmt_power.py @@ -477,10 +477,19 @@ def do_power_on_ncns(args): if ncn_group == included_ncn_groups['storage']: try: do_ceph_unfreeze(included_ncn_groups) + LOGGER.info('Ceph unfreeze completed successfully on storage NCNs.') + except FatalPlatformError as err: LOGGER.error(f'Failed to unfreeze Ceph on storage NCNs: {err}') - sys.exit(1) - LOGGER.info('Ceph unfreeze completed successfully on storage NCNs.') + # Use pester_choices to prompt the user + user_choice = pester_choices('Ceph is not healthy. 
Do you want to continue anyway?', + ['yes', 'no']) + if user_choice == 'no': + LOGGER.info('Exiting as per user\'s decision.') + sys.exit(1) + else: + LOGGER.info('Continuing despite Ceph not being healthy as per user\'s input, ' + 'make sure to verify it later.') # Mount Ceph and S3FS filesystems on ncn-m001 try: