Skip to content

Commit

Permalink
Merge pull request #240 from Cray-HPE/CRAYSAT-1817-automate-next-boot…
Browse files Browse the repository at this point in the history
…-device

CRAYSAT-1817: Automate procedure of setting next boot device to disk
  • Loading branch information
annapoorna-s-alt authored Jul 25, 2024
2 parents 26008ce + 022369f commit ec8154e
Show file tree
Hide file tree
Showing 3 changed files with 322 additions and 1 deletion.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
in the `platform-services` stage.
- Added `PG_NOT_DEEP_SCRUBBED` to the list of checks that are allowed to be excluded during the
Ceph health check, because it is ignorable.
- Automated the procedure of setting the next boot device to disk before the management nodes
are powered off as part of the full-system shutdown.

### Fixed
- Updated `sat bootsys` to increase the default management NCN shutdown timeout
Expand Down
56 changes: 56 additions & 0 deletions sat/cli/bootsys/mgmt_power.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import socket
import subprocess
import sys
import re

import inflect
from paramiko.ssh_exception import SSHException
Expand Down Expand Up @@ -260,6 +261,56 @@ def finish_shutdown(hosts, username, password, ncn_shutdown_timeout, ipmi_timeou
sys.exit(0)


def set_next_boot_device_to_disk(ssh_client, ncns):
    """Set the next boot device to disk on each of the given NCNs via SSH.

    For every node, ``efibootmgr`` is run to list the boot entries. The first
    line of the form ``BootXXXX[*] ... UEFI OS`` is treated as the disk boot
    entry, and ``efibootmgr -n XXXX`` is run to make it the next boot device.

    This is a best-effort operation: a failure on one node is logged as a
    warning and processing continues with the next node. Nothing is raised
    for SSH or socket errors.

    Args:
        ssh_client (paramiko.SSHClient): the SSH client used to connect to
            each node in turn. It is closed again after every node to which
            a connection was successfully established.
        ncns (list): the hostnames of the NCNs to configure.

    Returns:
        None
    """
    list_command = 'efibootmgr'
    # Compiled once since the same pattern is applied to every node's output.
    disk_entry_regex = re.compile(r'^Boot([0-9A-Fa-f]{4})\*?\s.*UEFI OS', re.MULTILINE)

    for ncn in ncns:
        try:
            ssh_client.connect(ncn)
        except (SSHException, socket.error) as err:
            LOGGER.warning(f'Unable to connect to node {ncn}: {err}')
            continue

        try:
            _, stdout, _ = ssh_client.exec_command(list_command)
            # Drain stdout before waiting for the exit status. Waiting first can
            # hang if the remote output exceeds the channel's window size.
            # Undecodable bytes are replaced rather than raising, since only the
            # ASCII "BootXXXX" prefix and "UEFI OS" marker matter here.
            boot_order = stdout.read().decode(errors='replace')
            exit_code = stdout.channel.recv_exit_status()
            if exit_code != 0:
                LOGGER.warning(f'Unable to determine boot order of {ncn}, {list_command} exited with exit code '
                               f'{exit_code}')
                continue

            match = disk_entry_regex.search(boot_order)
            if not match:
                LOGGER.warning(f'No disk boot entries found for {ncn}')
                continue

            next_boot = match.group(1)

            # Set the next boot device
            next_boot_disk = f'efibootmgr -n {next_boot}'
            _, stdout, _ = ssh_client.exec_command(next_boot_disk)
            exit_code = stdout.channel.recv_exit_status()
            if exit_code != 0:
                LOGGER.warning(f'Failed to set next boot device for {ncn}, {next_boot_disk} exited with exit code '
                               f'{exit_code}')
                continue

            LOGGER.info(f'Successfully set next boot device to disk (Boot{next_boot}) for {ncn}')

        except (SSHException, socket.error) as err:
            # A failure while executing a command on one node must not abort
            # the shutdown of the remaining nodes.
            LOGGER.warning(f'Unable to set next boot device for {ncn}: {err}')

        finally:
            # Runs on every path above, including each `continue`.
            ssh_client.close()


def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeout, ipmi_timeout):
"""Power off NCNs.
Expand All @@ -283,6 +334,7 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo
# Shutdown workers
worker_ncns = other_ncns_by_role.get('workers', [])
if worker_ncns:
set_next_boot_device_to_disk(ssh_client, worker_ncns)
try:
with IPMIConsoleLogger(worker_ncns, username, password):
LOGGER.info(f'Shutting down worker NCNs: {", ".join(worker_ncns)}')
Expand All @@ -300,6 +352,8 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo
# Shutdown managers (except ncn-m001)
manager_ncns = other_ncns_by_role.get('managers', [])
if manager_ncns:
set_next_boot_device_to_disk(ssh_client, ['ncn-m001'])
set_next_boot_device_to_disk(ssh_client, manager_ncns)
try:
with IPMIConsoleLogger(manager_ncns, username, password):
LOGGER.info(f'Shutting down manager NCNs: {", ".join(manager_ncns)}')
Expand All @@ -324,6 +378,8 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo
# Freeze Ceph on storage nodes and then shutdown
storage_ncns = other_ncns_by_role.get('storage', [])
if storage_ncns:
set_next_boot_device_to_disk(ssh_client, storage_ncns)

LOGGER.info(f'Freezing Ceph and shutting down storage NCNs: {", ".join(storage_ncns)}')
try:
do_ceph_freeze()
Expand Down
265 changes: 264 additions & 1 deletion tests/cli/bootsys/test_mgmt_power.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,24 @@
"""
Tests for the sat.cli.bootsys.mgmt_power module.
"""
import logging
from argparse import Namespace
import unittest
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock, patch, call
from unittest import mock

from paramiko.ssh_exception import SSHException, NoValidConnectionsError

from sat.cli.bootsys.mgmt_power import (
do_power_off_ncns,
SSHAvailableWaiter,
IPMIPowerStateWaiter,
do_mgmt_shutdown_power,
FatalBootsysError,
set_next_boot_device_to_disk
)
from sat.waiting import WaitingFailure
from sat.cli.bootsys.platform import do_ceph_freeze, FatalPlatformError


class TestSSHAvailableWaiter(unittest.TestCase):
Expand Down Expand Up @@ -177,3 +183,260 @@ def test_mgmt_ncns_skip_prompt_power_off(self):
do_power_off_ncns(self.args)
self.mock_prompt_continue.assert_not_called()
self.mock_do_mgmt_shutdown_power.assert_called_once()


class TestDoMgmtShutdownPower(unittest.TestCase):
    """Tests for the do_mgmt_shutdown_power and set_next_boot_device_to_disk functions."""

    def setUp(self):
        """Patch the collaborators of do_mgmt_shutdown_power and set up mock NCN groups."""
        # Registered before any patch is started so that every started patch
        # is undone even if setUp itself fails partway through.
        self.addCleanup(mock.patch.stopall)

        self.username = 'admin'
        self.password = 'password'

        self.ncn_shutdown_timeout = 1200
        self.ipmi_timeout = 60

        self.mock_get_and_verify_ncn_groups = mock.patch(
            'sat.cli.bootsys.mgmt_power.get_and_verify_ncn_groups').start()
        self.mock_filtered_host_keys = mock.patch(
            'sat.cli.bootsys.mgmt_power.FilteredHostKeys').start()
        self.mock_get_ssh_client = mock.patch(
            'sat.cli.bootsys.mgmt_power.get_ssh_client').start()
        # This replaces the name inside the mgmt_power module only. The tests
        # below that call set_next_boot_device_to_disk directly use the real
        # function imported at the top of this file.
        self.mock_set_next_boot_device_to_disk = mock.patch(
            'sat.cli.bootsys.mgmt_power.set_next_boot_device_to_disk').start()
        self.mock_start_shutdown = mock.patch(
            'sat.cli.bootsys.mgmt_power.start_shutdown').start()
        self.mock_finish_shutdown = mock.patch(
            'sat.cli.bootsys.mgmt_power.finish_shutdown').start()
        self.mock_do_ceph_freeze = mock.patch(
            'sat.cli.bootsys.mgmt_power.do_ceph_freeze').start()
        self.mock_do_ceph_unmounts = mock.patch(
            'sat.cli.bootsys.mgmt_power.do_ceph_unmounts').start()
        self.mock_IPMIConsoleLogger = mock.patch(
            'sat.cli.bootsys.mgmt_power.IPMIConsoleLogger').start()

        self.mock_ssh_client = mock.Mock()
        self.mock_get_ssh_client.return_value = self.mock_ssh_client

        self.mock_other_ncns_by_role = {
            'workers': ['ncn-w001', 'ncn-w002'],
            'managers': ['ncn-m002', 'ncn-m003'],
            'storage': ['ncn-s001', 'ncn-s002']
        }
        self.mock_get_and_verify_ncn_groups.return_value = self.mock_other_ncns_by_role

    def test_failed_ncn_verification(self):
        """Test do_mgmt_shutdown_power when NCN verification fails."""
        bad_ncn_msg = 'Failed to identify members of the following NCN subrole(s): ...'
        self.mock_get_and_verify_ncn_groups.side_effect = FatalBootsysError(bad_ncn_msg)
        # assertLogs is the outer context so that it still checks that something
        # was logged; it skips that check when an exception passes through it.
        with self.assertLogs(level=logging.ERROR) as cm:
            with self.assertRaises(SystemExit):
                do_mgmt_shutdown_power(self.username, self.password, set(),
                                       self.ncn_shutdown_timeout, self.ipmi_timeout)

        expected_err = f'Not proceeding with NCN power off: {bad_ncn_msg}'
        self.assertEqual(cm.records[0].message, expected_err)

    def test_do_mgmt_shutdown_power_success(self):
        """Test do_mgmt_shutdown_power when all steps are successful."""
        with self.assertLogs(level=logging.INFO) as cm:
            do_mgmt_shutdown_power(self.username, self.password, set(),
                                   self.ncn_shutdown_timeout, self.ipmi_timeout)

        # Assert calls for worker NCNs
        self.mock_set_next_boot_device_to_disk.assert_any_call(self.mock_ssh_client, ['ncn-w001', 'ncn-w002'])
        self.mock_start_shutdown.assert_any_call(['ncn-w001', 'ncn-w002'], self.mock_ssh_client)
        self.mock_finish_shutdown.assert_any_call(['ncn-w001', 'ncn-w002'], self.username, self.password,
                                                  self.ncn_shutdown_timeout, self.ipmi_timeout)

        # Assert calls for manager NCNs, including the next-boot-device call for ncn-m001
        self.mock_set_next_boot_device_to_disk.assert_any_call(self.mock_ssh_client, ['ncn-m001'])
        self.mock_set_next_boot_device_to_disk.assert_any_call(self.mock_ssh_client, ['ncn-m002', 'ncn-m003'])
        self.mock_start_shutdown.assert_any_call(['ncn-m002', 'ncn-m003'], self.mock_ssh_client)
        self.mock_finish_shutdown.assert_any_call(['ncn-m002', 'ncn-m003'], self.username, self.password,
                                                  self.ncn_shutdown_timeout, self.ipmi_timeout)

        # Assert calls for storage NCNs
        self.mock_set_next_boot_device_to_disk.assert_any_call(self.mock_ssh_client, ['ncn-s001', 'ncn-s002'])
        self.mock_do_ceph_freeze.assert_called_once()
        self.mock_start_shutdown.assert_any_call(['ncn-s001', 'ncn-s002'], self.mock_ssh_client)
        self.mock_finish_shutdown.assert_any_call(['ncn-s001', 'ncn-s002'], self.username, self.password,
                                                  self.ncn_shutdown_timeout, self.ipmi_timeout)

        # Assert call for Ceph unmount on ncn-m001
        self.mock_do_ceph_unmounts.assert_called_once_with(self.mock_ssh_client, 'ncn-m001')

        expected_messages = [
            'Shutting down worker NCNs: ncn-w001, ncn-w002',
            'Waiting up to 1200 seconds for worker NCNs to shut down...',
            'Shutting down manager NCNs: ncn-m002, ncn-m003',
            'Waiting up to 1200 seconds for manager NCNs to shut down...',
            'Freezing Ceph and shutting down storage NCNs: ncn-s001, ncn-s002',
            'Ceph freeze completed successfully on storage NCNs.',
            'Waiting up to 1200 seconds for storage NCNs to shut down...',
            'Shutdown and power off of storage NCNs: ncn-s001, ncn-s002',
            'Shutdown and power off of all management NCNs complete.'
        ]
        self.assertEqual(expected_messages, [record.message for record in cm.records])

    def test_do_mgmt_shutdown_power_with_fatal_error(self):
        """Test do_mgmt_shutdown_power when a fatal error occurs."""
        self.mock_do_ceph_freeze.side_effect = FatalPlatformError('Ceph freeze failed')
        with self.assertLogs(level=logging.ERROR) as cm:
            with self.assertRaises(SystemExit):
                do_mgmt_shutdown_power(self.username, self.password, set(),
                                       self.ncn_shutdown_timeout, self.ipmi_timeout)

        self.mock_get_and_verify_ncn_groups.assert_called_once_with({'ncn-m001'})
        self.assertEqual(cm.records[-1].message, 'Failed to freeze Ceph on storage NCNs: Ceph freeze failed')

    def test_set_next_boot_device_to_disk_success(self):
        """Test that the function sets the boot device to disk successfully"""
        mock_stdout = MagicMock()
        mock_stderr = MagicMock()
        mock_stdout.channel.recv_exit_status.return_value = 0
        mock_stdout.read.return_value = b"Boot0001* UEFI OS\nBoot0002* Other OS"

        self.mock_ssh_client.exec_command.return_value = (None, mock_stdout, mock_stderr)

        with self.assertLogs(level=logging.INFO) as cm:
            ncns = ['ncn-w001', 'ncn-s001']
            set_next_boot_device_to_disk(self.mock_ssh_client, ncns)

        # Order-sensitive: each node must be listed before its next boot device is set
        expected_connect_calls = [call('ncn-w001'), call('ncn-s001')]
        self.mock_ssh_client.connect.assert_has_calls(expected_connect_calls)

        expected_exec_calls = [
            call('efibootmgr'),
            call('efibootmgr -n 0001'),
            call('efibootmgr'),
            call('efibootmgr -n 0001')
        ]
        self.mock_ssh_client.exec_command.assert_has_calls(expected_exec_calls)

        # The connection is closed once per successfully connected node
        self.assertEqual(self.mock_ssh_client.close.call_count, 2)

        self.assertEqual(cm.records[0].message, 'Successfully set next boot device to disk (Boot0001) for ncn-w001')
        self.assertEqual(cm.records[1].message, 'Successfully set next boot device to disk (Boot0001) for ncn-s001')

    def test_set_next_boot_device_to_disk_ssh_fail(self):
        """Test that the function handles SSH connection failures and continues"""
        mock_ssh_client = MagicMock()
        mock_stdout = MagicMock()
        mock_stderr = MagicMock()
        mock_stdout.channel.recv_exit_status.return_value = 0
        mock_stdout.read.return_value = b"Boot0001* UEFI OS\nBoot0002* Other OS"

        # The connection to the second node fails; the first and third succeed
        mock_ssh_client.connect.side_effect = [None, SSHException('ssh failed'), None]
        mock_ssh_client.exec_command.return_value = (None, mock_stdout, mock_stderr)

        with patch('sat.cli.bootsys.mgmt_power.LOGGER') as mock_logger:
            # Test handling of SSH failure and continuation for all NCNs
            ncns = ['ncn-w001', 'ncn-w002', 'ncn-w003']
            set_next_boot_device_to_disk(mock_ssh_client, ncns)

        # A connection is attempted to every node, including the one after the failure
        expected_connect_calls = [call('ncn-w001'), call('ncn-w002'), call('ncn-w003')]
        mock_ssh_client.connect.assert_has_calls(expected_connect_calls)

        # Commands are only executed on the two nodes that could be reached
        expected_exec_calls = [
            call('efibootmgr'),
            call('efibootmgr -n 0001'),
            call('efibootmgr'),
            call('efibootmgr -n 0001')
        ]
        mock_ssh_client.exec_command.assert_has_calls(expected_exec_calls)
        self.assertEqual(mock_ssh_client.exec_command.call_count, 4)
        self.assertEqual(mock_ssh_client.close.call_count, 2)

        # Verify the log messages for successful and failed connections
        mock_logger.info.assert_any_call('Successfully set next boot device to disk (Boot0001) for ncn-w001')
        mock_logger.warning.assert_called_once_with('Unable to connect to node ncn-w002: ssh failed')
        mock_logger.info.assert_any_call('Successfully set next boot device to disk (Boot0001) for ncn-w003')

    def test_set_next_boot_device_to_disk_list_command_fails(self):
        """Test that the function handles command execution failures and continues"""
        # Mock successful connection for all NCNs
        self.mock_ssh_client.connect.side_effect = [None, None, None]

        # Mock command execution failure for the first NCN and success for the rest
        mock_stdout_1 = MagicMock()
        mock_stdout_2 = MagicMock()
        mock_stdout_3 = MagicMock()
        mock_stderr = MagicMock()
        mock_stdout_1.channel.recv_exit_status.return_value = 1
        mock_stdout_1.read.return_value = b""
        mock_stdout_2.channel.recv_exit_status.return_value = 0
        mock_stdout_2.read.return_value = b"Boot0001* UEFI OS\nBoot0002* Other OS"
        mock_stdout_3.channel.recv_exit_status.return_value = 0
        mock_stdout_3.read.return_value = b"BootNext: 0001"

        self.mock_ssh_client.exec_command.side_effect = [
            (None, mock_stdout_1, mock_stderr),  # efibootmgr on ncn-w001 (fails)
            (None, mock_stdout_2, mock_stderr),  # efibootmgr on ncn-w002
            (None, mock_stdout_3, mock_stderr),  # efibootmgr -n 0001 on ncn-w002
            (None, mock_stdout_2, mock_stderr),  # efibootmgr on ncn-w003
            (None, mock_stdout_3, mock_stderr),  # efibootmgr -n 0001 on ncn-w003
        ]

        with patch('sat.cli.bootsys.mgmt_power.LOGGER') as mock_logger:
            ncns = ['ncn-w001', 'ncn-w002', 'ncn-w003']
            set_next_boot_device_to_disk(self.mock_ssh_client, ncns)

        expected_connect_calls = [call('ncn-w001'), call('ncn-w002'), call('ncn-w003')]
        self.mock_ssh_client.connect.assert_has_calls(expected_connect_calls)

        # No "set" command is issued for ncn-w001 because its listing failed
        expected_exec_calls = [
            call('efibootmgr'),
            call('efibootmgr'),
            call('efibootmgr -n 0001'),
            call('efibootmgr'),
            call('efibootmgr -n 0001')
        ]
        self.mock_ssh_client.exec_command.assert_has_calls(expected_exec_calls)

        mock_logger.warning.assert_called_once_with(
            'Unable to determine boot order of ncn-w001, efibootmgr exited with exit code 1')
        mock_logger.info.assert_any_call('Successfully set next boot device to disk (Boot0001) for ncn-w002')
        mock_logger.info.assert_any_call('Successfully set next boot device to disk (Boot0001) for ncn-w003')

    def test_set_next_boot_device_to_disk_set_command_fails(self):
        """Test that the function handles a failure when setting the next boot device for a node"""
        mock_ssh_client = MagicMock()
        mock_stdout_list = MagicMock()
        mock_stdout_set = MagicMock()
        mock_stderr = MagicMock()
        # Simulate successful listing of boot entries
        mock_stdout_list.channel.recv_exit_status.return_value = 0
        mock_stdout_list.read.side_effect = [
            b"Boot0001* UEFI OS\nBoot0002* Other OS",  # Output for ncn-w001
            b"Boot0003* UEFI OS\nBoot0004* Other OS",  # Output for ncn-w002
            b"Boot0005* UEFI OS\nBoot0006* Other OS"   # Output for ncn-w003
        ]
        # Simulate failure when setting the next boot device for ncn-w002
        mock_stdout_set.channel.recv_exit_status.side_effect = [0, 1, 0]
        mock_stdout_set.read.return_value = b"BootNext: 0001"
        mock_ssh_client.exec_command.side_effect = [
            (None, mock_stdout_list, mock_stderr),  # Call to efibootmgr for ncn-w001
            (None, mock_stdout_set, mock_stderr),   # Call to efibootmgr -n 0001 for ncn-w001
            (None, mock_stdout_list, mock_stderr),  # Call to efibootmgr for ncn-w002
            (None, mock_stdout_set, mock_stderr),   # Call to efibootmgr -n 0003 for ncn-w002
            (None, mock_stdout_list, mock_stderr),  # Call to efibootmgr for ncn-w003
            (None, mock_stdout_set, mock_stderr)    # Call to efibootmgr -n 0005 for ncn-w003
        ]

        with patch('sat.cli.bootsys.mgmt_power.LOGGER') as mock_logger:
            ncns = ['ncn-w001', 'ncn-w002', 'ncn-w003']
            set_next_boot_device_to_disk(mock_ssh_client, ncns)

        expected_exec_calls = [
            call('efibootmgr'),
            call('efibootmgr -n 0001'),
            call('efibootmgr'),
            call('efibootmgr -n 0003'),
            call('efibootmgr'),
            call('efibootmgr -n 0005')
        ]
        mock_ssh_client.exec_command.assert_has_calls(expected_exec_calls)
        mock_logger.info.assert_any_call('Successfully set next boot device to disk (Boot0001) for ncn-w001')
        mock_logger.warning.assert_called_once_with(
            'Failed to set next boot device for ncn-w002, efibootmgr -n 0003 exited with exit code 1'
        )
        mock_logger.info.assert_any_call('Successfully set next boot device to disk (Boot0005) for ncn-w003')

    def test_set_next_boot_device_to_disk_no_disk_entry(self):
        """Test that a node with no 'UEFI OS' boot entry is skipped with a warning"""
        mock_ssh_client = MagicMock()
        mock_stdout = MagicMock()
        mock_stderr = MagicMock()
        mock_stdout.channel.recv_exit_status.return_value = 0
        mock_stdout.read.return_value = b"Boot0002* Other OS\nBoot0003* PXE Device"
        mock_ssh_client.exec_command.return_value = (None, mock_stdout, mock_stderr)

        with patch('sat.cli.bootsys.mgmt_power.LOGGER') as mock_logger:
            set_next_boot_device_to_disk(mock_ssh_client, ['ncn-w001'])

        # Only the listing command runs; there is no entry to pass to "efibootmgr -n"
        mock_ssh_client.exec_command.assert_called_once_with('efibootmgr')
        mock_logger.warning.assert_called_once_with('No disk boot entries found for ncn-w001')
        mock_logger.info.assert_not_called()
        mock_ssh_client.close.assert_called_once_with()

0 comments on commit ec8154e

Please sign in to comment.