def set_next_boot_device_to_disk(ssh_client, ncns):
    """Set the next boot device to disk on each of the given NCNs via SSH.

    For each node, this runs ``efibootmgr`` to list the boot entries, selects
    the first entry whose description contains "UEFI OS", and then runs
    ``efibootmgr -n <entry>`` to make that entry the next boot device.

    This is a best-effort operation. Any failure on a node (unable to connect,
    command could not be executed, non-zero exit code, no matching boot entry)
    is logged as a warning and processing moves on to the next node. No
    exception from paramiko or the socket layer is allowed to abort the caller.

    Args:
        ssh_client (paramiko.SSHClient): The SSH client used to connect to
            each node in turn. It is closed after each node is processed.
        ncns (list): A list of node names (NCNs) to configure.

    Returns:
        None
    """
    list_command = 'efibootmgr'
    # NOTE(review): entries described as "UEFI OS" are presumed to be the disk
    # boot entries on these nodes -- confirm against the documented procedure.
    disk_entry_regex = re.compile(r'^Boot([0-9A-Fa-f]{4})\*?\s.*UEFI OS', re.MULTILINE)

    for ncn in ncns:
        try:
            ssh_client.connect(ncn)
        except (SSHException, socket.error) as err:
            LOGGER.warning(f'Unable to connect to node {ncn}: {err}')
            continue

        try:
            _, stdout, _ = ssh_client.exec_command(list_command)
            # Drain stdout before asking for the exit status; waiting for the
            # exit status first can hang if the output exceeds the channel window.
            raw_boot_order = stdout.read()
            exit_code = stdout.channel.recv_exit_status()
            if exit_code != 0:
                LOGGER.warning(f'Unable to determine boot order of {ncn}, {list_command} exited with exit code '
                               f'{exit_code}')
                continue

            # Boot entry labels are free-form text; never fail on odd bytes.
            boot_order = raw_boot_order.decode(errors='replace')
            match = disk_entry_regex.search(boot_order)
            if not match:
                LOGGER.warning(f'No disk boot entries found for {ncn}')
                continue

            next_boot = match.group(1)

            # Set the next boot device
            next_boot_disk = f'efibootmgr -n {next_boot}'
            _, stdout, _ = ssh_client.exec_command(next_boot_disk)
            exit_code = stdout.channel.recv_exit_status()
            if exit_code != 0:
                LOGGER.warning(f'Failed to set next boot device for {ncn}, {next_boot_disk} exited with exit code '
                               f'{exit_code}')
                continue

            LOGGER.info(f'Successfully set next boot device to disk (Boot{next_boot}) for {ncn}')

        except (SSHException, socket.error) as err:
            # exec_command can fail after a successful connect (e.g. the channel
            # is rejected or the connection drops); keep going with other nodes.
            LOGGER.warning(f'Unable to set next boot device for {ncn}: {err}')

        finally:
            ssh_client.close()
@@ -283,6 +334,7 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo # Shutdown workers worker_ncns = other_ncns_by_role.get('workers', []) if worker_ncns: + set_next_boot_device_to_disk(ssh_client, worker_ncns) try: with IPMIConsoleLogger(worker_ncns, username, password): LOGGER.info(f'Shutting down worker NCNs: {", ".join(worker_ncns)}') @@ -300,6 +352,8 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo # Shutdown managers (except ncn-m001) manager_ncns = other_ncns_by_role.get('managers', []) if manager_ncns: + set_next_boot_device_to_disk(ssh_client, ['ncn-m001']) + set_next_boot_device_to_disk(ssh_client, manager_ncns) try: with IPMIConsoleLogger(manager_ncns, username, password): LOGGER.info(f'Shutting down manager NCNs: {", ".join(manager_ncns)}') @@ -324,6 +378,8 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo # Freeze Ceph on storage nodes and then shutdown storage_ncns = other_ncns_by_role.get('storage', []) if storage_ncns: + set_next_boot_device_to_disk(ssh_client, storage_ncns) + LOGGER.info(f'Freezing Ceph and shutting down storage NCNs: {", ".join(storage_ncns)}') try: do_ceph_freeze() diff --git a/tests/cli/bootsys/test_mgmt_power.py b/tests/cli/bootsys/test_mgmt_power.py index 22571168..50f01098 100644 --- a/tests/cli/bootsys/test_mgmt_power.py +++ b/tests/cli/bootsys/test_mgmt_power.py @@ -24,9 +24,11 @@ """ Tests for the sat.cli.bootsys.mgmt_power module. 
class TestDoMgmtShutdownPower(unittest.TestCase):
    """Tests for the do_mgmt_shutdown_power function."""

    def setUp(self):
        """Set up mocks."""
        self.username = 'admin'
        self.password = 'password'

        self.mock_args = mock.Mock()
        self.ncn_shutdown_timeout = 1200
        self.ipmi_timeout = 60

        self.mock_get_and_verify_ncn_groups = mock.patch(
            'sat.cli.bootsys.mgmt_power.get_and_verify_ncn_groups').start()
        self.mock_get_ssh_client = mock.patch(
            'sat.cli.bootsys.mgmt_power.get_ssh_client').start()
        self.mock_set_next_boot_device_to_disk = mock.patch(
            'sat.cli.bootsys.mgmt_power.set_next_boot_device_to_disk').start()
        self.mock_start_shutdown = mock.patch(
            'sat.cli.bootsys.mgmt_power.start_shutdown').start()
        self.mock_finish_shutdown = mock.patch(
            'sat.cli.bootsys.mgmt_power.finish_shutdown').start()
        self.mock_do_ceph_freeze = mock.patch(
            'sat.cli.bootsys.mgmt_power.do_ceph_freeze').start()
        self.mock_do_ceph_unmounts = mock.patch(
            'sat.cli.bootsys.mgmt_power.do_ceph_unmounts').start()
        self.mock_IPMIConsoleLogger = mock.patch(
            'sat.cli.bootsys.mgmt_power.IPMIConsoleLogger').start()

        self.mock_ssh_client = mock.Mock()
        self.mock_get_ssh_client.return_value = self.mock_ssh_client

        self.mock_other_ncns_by_role = {
            'workers': ['ncn-w001', 'ncn-w002'],
            'managers': ['ncn-m002', 'ncn-m003'],
            'storage': ['ncn-s001', 'ncn-s002']
        }
        self.mock_get_and_verify_ncn_groups.return_value = self.mock_other_ncns_by_role

    def tearDown(self):
        """Stop all patches started in setUp."""
        mock.patch.stopall()

    def call_do_mgmt_shutdown_power(self):
        """Call do_mgmt_shutdown_power with the arguments prepared in setUp."""
        do_mgmt_shutdown_power(self.username, self.password, self.mock_args.excluded_ncns,
                               self.ncn_shutdown_timeout, self.ipmi_timeout)

    def assert_shutdown_calls(self, ncns):
        """Assert the boot device was set and shutdown was started and finished for the given NCNs."""
        self.mock_set_next_boot_device_to_disk.assert_any_call(self.mock_ssh_client, ncns)
        self.mock_start_shutdown.assert_any_call(ncns, self.mock_ssh_client)
        self.mock_finish_shutdown.assert_any_call(ncns, self.username, self.password,
                                                  self.ncn_shutdown_timeout, self.ipmi_timeout)

    def test_failed_ncn_verification(self):
        """Test do_mgmt_shutdown_power when NCN verification fails."""
        bad_ncn_msg = 'Failed to identify members of the following NCN subrole(s): ...'
        self.mock_get_and_verify_ncn_groups.side_effect = FatalBootsysError(bad_ncn_msg)
        with self.assertRaises(SystemExit):
            with self.assertLogs(level=logging.ERROR) as cm:
                self.call_do_mgmt_shutdown_power()

        expected_err = f'Not proceeding with NCN power off: {bad_ncn_msg}'
        self.assertEqual(cm.records[0].message, expected_err)

    def test_do_mgmt_shutdown_power_success(self):
        """Test do_mgmt_shutdown_power when all steps are successful."""
        with self.assertLogs(level=logging.INFO) as cm:
            self.call_do_mgmt_shutdown_power()

        # Assert calls for worker NCNs
        self.assert_shutdown_calls(['ncn-w001', 'ncn-w002'])

        # Assert calls for manager NCNs; ncn-m001 only has its boot device set here
        self.mock_set_next_boot_device_to_disk.assert_any_call(self.mock_ssh_client, ['ncn-m001'])
        self.assert_shutdown_calls(['ncn-m002', 'ncn-m003'])

        # Assert calls for storage NCNs
        self.mock_do_ceph_freeze.assert_called_once()
        self.assert_shutdown_calls(['ncn-s001', 'ncn-s002'])

        # Assert call for Ceph unmount on ncn-m001
        self.mock_do_ceph_unmounts.assert_called_once_with(self.mock_ssh_client, 'ncn-m001')

        expected_messages = [
            'Shutting down worker NCNs: ncn-w001, ncn-w002',
            'Waiting up to 1200 seconds for worker NCNs to shut down...',
            'Shutting down manager NCNs: ncn-m002, ncn-m003',
            'Waiting up to 1200 seconds for manager NCNs to shut down...',
            'Freezing Ceph and shutting down storage NCNs: ncn-s001, ncn-s002',
            'Ceph freeze completed successfully on storage NCNs.',
            'Waiting up to 1200 seconds for storage NCNs to shut down...',
            'Shutdown and power off of storage NCNs: ncn-s001, ncn-s002',
            'Shutdown and power off of all management NCNs complete.',
        ]
        # Only the leading records are pinned, so a readable list diff is shown on failure.
        actual_messages = [record.message for record in cm.records]
        self.assertEqual(actual_messages[:len(expected_messages)], expected_messages)

    def test_do_mgmt_shutdown_power_with_fatal_error(self):
        """Test do_mgmt_shutdown_power when a fatal error occurs."""
        self.mock_do_ceph_freeze.side_effect = FatalPlatformError('Ceph freeze failed')
        with self.assertLogs(level=logging.ERROR) as cm:
            with self.assertRaises(SystemExit):
                self.call_do_mgmt_shutdown_power()

        self.mock_get_and_verify_ncn_groups.assert_called_once_with(
            self.mock_args.excluded_ncns.union({'ncn-m001'}))
        self.assertEqual(cm.records[-1].message, 'Failed to freeze Ceph on storage NCNs: Ceph freeze failed')


class TestSetNextBootDeviceToDisk(unittest.TestCase):
    """Tests for the set_next_boot_device_to_disk function."""

    @staticmethod
    def get_mock_ssh_client(exit_codes, output=b''):
        """Get a mock SSH client whose commands exit with the given codes.

        Args:
            exit_codes (list of int): the exit codes returned by successive
                calls to recv_exit_status, in order.
            output (bytes): the data returned when reading stdout.

        Returns:
            A MagicMock standing in for a paramiko.SSHClient.
        """
        mock_stdout = MagicMock()
        mock_stdout.channel.recv_exit_status.side_effect = exit_codes
        mock_stdout.read.return_value = output

        mock_ssh_client = MagicMock()
        mock_ssh_client.exec_command.return_value = (None, mock_stdout, MagicMock())
        return mock_ssh_client

    def test_set_next_boot_device_to_disk_success(self):
        """Test that the function sets the boot device to disk successfully"""
        mock_ssh_client = self.get_mock_ssh_client([0, 0, 0, 0], b"Boot0001* UEFI OS\nBoot0002* Other OS")

        set_next_boot_device_to_disk(mock_ssh_client, ['ncn-w001', 'ncn-s001'])

        self.assertEqual(mock_ssh_client.connect.call_args_list,
                         [call('ncn-w001'), call('ncn-s001')])
        # The boot order must be queried before the next boot device is set on each node
        self.assertEqual(mock_ssh_client.exec_command.call_args_list,
                         [call('efibootmgr'), call('efibootmgr -n 0001'),
                          call('efibootmgr'), call('efibootmgr -n 0001')])
        self.assertEqual(mock_ssh_client.close.call_count, 2)

    def test_set_next_boot_device_to_disk_ssh_fail(self):
        """Test that the function handles SSH connection failures"""
        mock_ssh_client = MagicMock()
        mock_ssh_client.connect.side_effect = SSHException('ssh failed')

        with patch('sat.cli.bootsys.mgmt_power.LOGGER') as mock_logger:
            set_next_boot_device_to_disk(mock_ssh_client, ['ncn-w001'])

        mock_logger.warning.assert_called_once_with('Unable to connect to node ncn-w001: ssh failed')
        mock_ssh_client.exec_command.assert_not_called()

    def test_set_next_boot_device_to_disk_command_fail(self):
        """Test that the function handles a failure to list the boot order"""
        mock_ssh_client = self.get_mock_ssh_client([1])

        with patch('sat.cli.bootsys.mgmt_power.LOGGER') as mock_logger:
            set_next_boot_device_to_disk(mock_ssh_client, ['ncn-w001'])

        mock_logger.warning.assert_called_once_with(
            'Unable to determine boot order of ncn-w001, efibootmgr exited with exit code 1'
        )
        mock_ssh_client.exec_command.assert_called_once_with('efibootmgr')
        mock_ssh_client.close.assert_called_once_with()

    def test_set_next_boot_device_to_disk_no_disk_entry(self):
        """Test that the function warns when no disk boot entry is listed"""
        mock_ssh_client = self.get_mock_ssh_client([0], b"Boot0000* PXE Device\n")

        with patch('sat.cli.bootsys.mgmt_power.LOGGER') as mock_logger:
            set_next_boot_device_to_disk(mock_ssh_client, ['ncn-w001'])

        mock_logger.warning.assert_called_once_with('No disk boot entries found for ncn-w001')
        mock_ssh_client.exec_command.assert_called_once_with('efibootmgr')
        mock_ssh_client.close.assert_called_once_with()

    def test_set_next_boot_device_to_disk_set_command_fail(self):
        """Test that the function handles a failure to set the next boot device"""
        mock_ssh_client = self.get_mock_ssh_client([0, 1], b"Boot0001* UEFI OS\n")

        with patch('sat.cli.bootsys.mgmt_power.LOGGER') as mock_logger:
            set_next_boot_device_to_disk(mock_ssh_client, ['ncn-w001'])

        mock_logger.warning.assert_called_once_with(
            'Failed to set next boot device for ncn-w001, efibootmgr -n 0001 exited with exit code 1'
        )
        mock_logger.info.assert_not_called()
        mock_ssh_client.close.assert_called_once_with()