Skip to content

Commit

Permalink
CRAYSAT-1817: Automate procedure of setting next boot device to disk
Browse files Browse the repository at this point in the history
IM:CRAYSAT-1817
Reviewer: Ryan
  • Loading branch information
annapoorna-s-alt committed Jul 23, 2024
1 parent 26008ce commit 72a5aef
Show file tree
Hide file tree
Showing 3 changed files with 222 additions and 1 deletion.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
in the `platform-services` stage.
- Adding PG_NOT_DEEP_SCRUBBED in allowable checks excluded during ceph health check as it is
ignorable.
- Automate the procedure of setting the next boot device to disk before the management nodes are
powered off as part of the full-system shutdown.

### Fixed
- Updated `sat bootsys` to increase the default management NCN shutdown timeout
Expand Down
56 changes: 56 additions & 0 deletions sat/cli/bootsys/mgmt_power.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import socket
import subprocess
import sys
import re

import inflect
from paramiko.ssh_exception import SSHException
Expand Down Expand Up @@ -260,6 +261,56 @@ def finish_shutdown(hosts, username, password, ncn_shutdown_timeout, ipmi_timeou
sys.exit(0)


def set_next_boot_device_to_disk(ssh_client, ncns):
    """Set the next boot device to disk on each of the given NCNs via SSH.

    For each node, this connects with the given SSH client, runs
    ``efibootmgr`` to list the boot entries, selects the first entry whose
    line contains "UEFI OS", and runs ``efibootmgr -n <entry>`` to make that
    entry the next boot device.

    This is best-effort: a failure on one node is logged as a warning and the
    remaining nodes are still processed. No exception is raised for SSH or
    socket errors.

    Args:
        ssh_client (paramiko.SSHClient): the SSH client used to connect to
            each node. It is closed after every node to which a connection
            was successfully made.
        ncns (list): the hostnames of the NCNs to configure.

    Returns:
        None
    """
    command = 'efibootmgr'
    # Matches lines such as "Boot0001* UEFI OS". The captured four hex digits
    # are the entry number that is passed to 'efibootmgr -n'. Compiled once
    # because the pattern does not change between nodes.
    disk_entry_regex = re.compile(r'^Boot([0-9A-Fa-f]{4})\*?\s.*UEFI OS', re.MULTILINE)

    for ncn in ncns:
        try:
            ssh_client.connect(ncn)
        except (SSHException, socket.error) as err:
            LOGGER.warning(f'Unable to connect to node {ncn}: {err}')
            continue

        try:
            _, stdout, _ = ssh_client.exec_command(command)
            exit_code = stdout.channel.recv_exit_status()
            if exit_code != 0:
                LOGGER.warning(f'Unable to determine boot order of {ncn}, {command} exited with exit code {exit_code}')
                continue

            # Boot entry descriptions originate from firmware, so tolerate
            # bytes that are not valid UTF-8 instead of raising.
            boot_order = stdout.read().decode(errors='replace')
            match = disk_entry_regex.search(boot_order)
            if not match:
                LOGGER.warning(f'No disk boot entries found for {ncn}')
                continue

            next_boot = match.group(1)

            # Set the next boot device
            next_boot_disk = f'efibootmgr -n {next_boot}'
            _, stdout, _ = ssh_client.exec_command(next_boot_disk)
            exit_code = stdout.channel.recv_exit_status()
            if exit_code != 0:
                LOGGER.warning(f'Failed to set next boot device for {ncn}, {next_boot_disk} exited with exit code '
                               f'{exit_code}')
                continue

            LOGGER.info(f'Successfully set next boot device to disk (Boot{next_boot}) for {ncn}')

        except (SSHException, socket.error) as err:
            # Running a remote command can fail after a successful connect
            # (e.g. the session drops). Treat that like every other per-node
            # failure here: warn and carry on with the next node.
            LOGGER.warning(f'Failed to run {command} on {ncn}: {err}')

        finally:
            # Runs on every path above, including the 'continue' statements.
            ssh_client.close()


def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeout, ipmi_timeout):
"""Power off NCNs.
Expand All @@ -283,6 +334,7 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo
# Shutdown workers
worker_ncns = other_ncns_by_role.get('workers', [])
if worker_ncns:
set_next_boot_device_to_disk(ssh_client, worker_ncns)
try:
with IPMIConsoleLogger(worker_ncns, username, password):
LOGGER.info(f'Shutting down worker NCNs: {", ".join(worker_ncns)}')
Expand All @@ -300,6 +352,8 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo
# Shutdown managers (except ncn-m001)
manager_ncns = other_ncns_by_role.get('managers', [])
if manager_ncns:
set_next_boot_device_to_disk(ssh_client, ['ncn-m001'])
set_next_boot_device_to_disk(ssh_client, manager_ncns)
try:
with IPMIConsoleLogger(manager_ncns, username, password):
LOGGER.info(f'Shutting down manager NCNs: {", ".join(manager_ncns)}')
Expand All @@ -324,6 +378,8 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo
# Freeze Ceph on storage nodes and then shutdown
storage_ncns = other_ncns_by_role.get('storage', [])
if storage_ncns:
set_next_boot_device_to_disk(ssh_client, storage_ncns)

LOGGER.info(f'Freezing Ceph and shutting down storage NCNs: {", ".join(storage_ncns)}')
try:
do_ceph_freeze()
Expand Down
165 changes: 164 additions & 1 deletion tests/cli/bootsys/test_mgmt_power.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,24 @@
"""
Tests for the sat.cli.bootsys.mgmt_power module.
"""
import logging
from argparse import Namespace
import unittest
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock, patch, call
from unittest import mock

from paramiko.ssh_exception import SSHException, NoValidConnectionsError

from sat.cli.bootsys.mgmt_power import (
do_power_off_ncns,
SSHAvailableWaiter,
IPMIPowerStateWaiter,
do_mgmt_shutdown_power,
FatalBootsysError,
set_next_boot_device_to_disk
)
from sat.waiting import WaitingFailure
from sat.cli.bootsys.platform import do_ceph_freeze, FatalPlatformError


class TestSSHAvailableWaiter(unittest.TestCase):
Expand Down Expand Up @@ -177,3 +183,160 @@ def test_mgmt_ncns_skip_prompt_power_off(self):
do_power_off_ncns(self.args)
self.mock_prompt_continue.assert_not_called()
self.mock_do_mgmt_shutdown_power.assert_called_once()


class TestDoMgmtShutdownPower(unittest.TestCase):
    """Tests for do_mgmt_shutdown_power and set_next_boot_device_to_disk."""

    def setUp(self):
        """Start patches for the helpers used by do_mgmt_shutdown_power and define canned inputs."""
        def start_patch(target):
            # Patches started this way are stopped by mock.patch.stopall() in tearDown.
            return mock.patch(target).start()

        self.username = 'admin'
        self.password = 'password'

        self.mock_args = mock.Mock()
        self.ncn_shutdown_timeout = 1200
        self.ipmi_timeout = 60

        self.mock_get_and_verify_ncn_groups = start_patch(
            'sat.cli.bootsys.mgmt_power.get_and_verify_ncn_groups')
        self.mock_get_ssh_client = start_patch(
            'sat.cli.bootsys.mgmt_power.get_ssh_client')
        self.mock_set_next_boot_device_to_disk = start_patch(
            'sat.cli.bootsys.mgmt_power.set_next_boot_device_to_disk')
        self.mock_start_shutdown = start_patch(
            'sat.cli.bootsys.mgmt_power.start_shutdown')
        self.mock_finish_shutdown = start_patch(
            'sat.cli.bootsys.mgmt_power.finish_shutdown')
        self.mock_do_ceph_freeze = start_patch(
            'sat.cli.bootsys.mgmt_power.do_ceph_freeze')
        self.mock_do_ceph_unmounts = start_patch(
            'sat.cli.bootsys.mgmt_power.do_ceph_unmounts')
        self.mock_IPMIConsoleLogger = start_patch(
            'sat.cli.bootsys.mgmt_power.IPMIConsoleLogger')

        # The patched get_ssh_client hands back this shared mock client.
        self.mock_ssh_client = mock.Mock()
        self.mock_get_ssh_client.return_value = self.mock_ssh_client

        # The patched get_and_verify_ncn_groups reports two NCNs per role.
        self.mock_other_ncns_by_role = {
            'workers': ['ncn-w001', 'ncn-w002'],
            'managers': ['ncn-m002', 'ncn-m003'],
            'storage': ['ncn-s001', 'ncn-s002']
        }
        self.mock_get_and_verify_ncn_groups.return_value = self.mock_other_ncns_by_role

    def tearDown(self):
        """Stop every patch started in setUp."""
        mock.patch.stopall()

    def test_failed_ncn_verification(self):
        """Test do_mgmt_shutdown_power when NCN verification fails."""
        bad_ncn_msg = 'Failed to identify members of the following NCN subrole(s): ...'
        self.mock_get_and_verify_ncn_groups.side_effect = FatalBootsysError(bad_ncn_msg)

        with self.assertRaises(SystemExit), self.assertLogs(level=logging.ERROR) as cm:
            do_mgmt_shutdown_power(self.username, self.password, self.mock_args.excluded_ncns,
                                   self.ncn_shutdown_timeout, self.ipmi_timeout)

        first_logged = cm.records[0].message
        self.assertEqual(first_logged, f'Not proceeding with NCN power off: {bad_ncn_msg}')

    def test_do_mgmt_shutdown_power_success(self):
        """Test do_mgmt_shutdown_power when all steps are successful."""
        with self.assertLogs(level=logging.INFO) as cm:
            do_mgmt_shutdown_power(self.username, self.password, self.mock_args.excluded_ncns,
                                   self.ncn_shutdown_timeout, self.ipmi_timeout)

        # Each role's NCNs get the boot device set, then are shut down and waited on.
        worker_ncns = ['ncn-w001', 'ncn-w002']
        manager_ncns = ['ncn-m002', 'ncn-m003']
        storage_ncns = ['ncn-s001', 'ncn-s002']
        for ncn_group in (worker_ncns, manager_ncns, storage_ncns):
            self.mock_set_next_boot_device_to_disk.assert_any_call(self.mock_ssh_client, ncn_group)
            self.mock_start_shutdown.assert_any_call(ncn_group, self.mock_ssh_client)
            self.mock_finish_shutdown.assert_any_call(ncn_group, self.username, self.password,
                                                      self.ncn_shutdown_timeout, self.ipmi_timeout)

        # Ceph is frozen exactly once, and unmounted on ncn-m001.
        self.mock_do_ceph_freeze.assert_called_once()
        self.mock_do_ceph_unmounts.assert_called_once_with(self.mock_ssh_client, 'ncn-m001')

        # The log records must appear in this order, starting at the first record.
        expected_messages = [
            'Shutting down worker NCNs: ncn-w001, ncn-w002',
            'Waiting up to 1200 seconds for worker NCNs to shut down...',
            'Shutting down manager NCNs: ncn-m002, ncn-m003',
            'Waiting up to 1200 seconds for manager NCNs to shut down...',
            'Freezing Ceph and shutting down storage NCNs: ncn-s001, ncn-s002',
            'Ceph freeze completed successfully on storage NCNs.',
            'Waiting up to 1200 seconds for storage NCNs to shut down...',
            'Shutdown and power off of storage NCNs: ncn-s001, ncn-s002',
            'Shutdown and power off of all management NCNs complete.',
        ]
        for index, expected in enumerate(expected_messages):
            self.assertEqual(cm.records[index].message, expected)

    def test_do_mgmt_shutdown_power_with_fatal_error(self):
        """Test do_mgmt_shutdown_power when a fatal error occurs."""
        self.mock_do_ceph_freeze.side_effect = FatalPlatformError('Ceph freeze failed')

        with self.assertLogs(level=logging.ERROR) as cm, self.assertRaises(SystemExit):
            do_mgmt_shutdown_power(self.username, self.password, self.mock_args.excluded_ncns,
                                   self.ncn_shutdown_timeout, self.ipmi_timeout)

        expected_excluded = self.mock_args.excluded_ncns.union({'ncn-m001'})
        self.mock_get_and_verify_ncn_groups.assert_called_once_with(expected_excluded)
        last_logged = cm.records[-1].message
        self.assertEqual(last_logged, 'Failed to freeze Ceph on storage NCNs: Ceph freeze failed')

    def test_set_next_boot_device_to_disk_success(self):
        """Test that the function sets the boot device to disk successfully"""
        # Every remote command exits 0 and returns the same boot entry listing.
        remote_stdout = MagicMock()
        remote_stdout.channel.recv_exit_status.return_value = 0
        remote_stdout.read.return_value = b"Boot0001* UEFI OS\nBoot0002* Other OS"

        ssh = MagicMock()
        ssh.exec_command.return_value = (None, remote_stdout, MagicMock())

        set_next_boot_device_to_disk(ssh, ['ncn-w001', 'ncn-s001'])

        ssh.connect.assert_has_calls([call('ncn-w001'), call('ncn-s001')], any_order=True)

        # Each of the two nodes gets one listing command and one set-next-boot command.
        commands_per_node = [call('efibootmgr'), call('efibootmgr -n 0001')]
        ssh.exec_command.assert_has_calls(commands_per_node * 2, any_order=True)

    def test_set_next_boot_device_to_disk_ssh_fail(self):
        """Test that the function handles SSH connection failures"""
        ssh = MagicMock()
        ssh.connect.side_effect = SSHException('ssh failed')

        with patch('sat.cli.bootsys.mgmt_power.LOGGER') as mock_logger:
            set_next_boot_device_to_disk(ssh, ['ncn-w001'])

        mock_logger.warning.assert_called_with('Unable to connect to node ncn-w001: ssh failed')

    def test_set_next_boot_device_to_disk_command_fail(self):
        """Test that the function handles command execution failures"""
        # The first exit status read is 1, i.e. the listing command fails.
        remote_stdout = MagicMock()
        remote_stdout.channel.recv_exit_status.side_effect = [1, 0]
        remote_stdout.read.return_value = b""

        ssh = MagicMock()
        ssh.exec_command.return_value = (None, remote_stdout, MagicMock())

        with patch('sat.cli.bootsys.mgmt_power.LOGGER') as mock_logger:
            set_next_boot_device_to_disk(ssh, ['ncn-w001'])

        mock_logger.warning.assert_called_with(
            'Unable to determine boot order of ncn-w001, efibootmgr exited with exit code 1'
        )

0 comments on commit 72a5aef

Please sign in to comment.