Skip to content

Commit

Permalink
CRAYSAT-1817: Automate procedure of setting next boot device to disk
Browse files Browse the repository at this point in the history
IM:CRAYSAT-1817
Reviewer: Ryan
  • Loading branch information
annapoorna-s-alt committed Jul 16, 2024
1 parent 26008ce commit 9ffb1e6
Showing 1 changed file with 92 additions and 0 deletions.
92 changes: 92 additions & 0 deletions sat/cli/bootsys/mgmt_power.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,90 @@ def finish_shutdown(hosts, username, password, ncn_shutdown_timeout, ipmi_timeou
sys.exit(0)


def get_disk_entries(ssh_client, ncn):
    """Return the disk boot entries reported by ``efibootmgr`` on a node.

    Runs ``efibootmgr`` over the given SSH connection and collects every
    ``BootXXXX`` entry whose description contains "CRAY UEFI OS 0"
    (compared case-insensitively).

    Args:
        ssh_client (paramiko.SSHClient): The SSH client connected to the node.
        ncn (str): The node associated with the ssh_client. Only used in
            log messages.

    Returns:
        dict: A dictionary of disk entries with descriptions as keys and
            boot numbers (e.g. "0001") as values, in the order efibootmgr
            listed them. Empty if no entry matched.

    Raises:
        SystemExit: If efibootmgr exits with a non-zero status; the error
            is logged before exiting.
    """
    _, stdout, stderr = ssh_client.exec_command('efibootmgr')

    # Drain both streams before asking for the exit status, so the remote
    # command is never left blocked on unread output.
    boot_order_output = stdout.read().decode()
    error = stderr.read().decode().strip()

    # Success is decided by the exit status rather than by "stderr is empty":
    # a warning on stderr does not mean the listing failed, and a failure
    # does not necessarily write anything to stderr.
    exit_status = stdout.channel.recv_exit_status()
    if exit_status != 0:
        LOGGER.error('Failed to get current boot order for %s: %s',
                     ncn, error or f'exit status {exit_status}')
        raise SystemExit(1)

    disk_entries = {}
    for line in boot_order_output.splitlines():
        if not line.startswith('Boot'):
            continue
        parts = line.split()
        # The first token looks like "Boot0001*"; characters 4-7 are the
        # boot number. Header lines such as "BootOrder: ..." also start with
        # "Boot" but are dropped by the description filter below.
        boot_num = parts[0][4:8]
        boot_desc = ' '.join(parts[1:])
        if 'CRAY UEFI OS 0' in boot_desc.upper():
            disk_entries[boot_desc] = boot_num
    return disk_entries


def set_next_boot_device_to_disk(ssh_client, ncns):
    """Set the next boot device to disk on each of the given nodes via SSH.

    For every node this connects over SSH, looks up the disk boot entries
    with get_disk_entries(), and runs ``efibootmgr -n`` with the boot number
    of the first entry. Processing is best-effort per node: a connection
    failure, a node without disk boot entries, or a failing ``efibootmgr -n``
    is logged and the loop moves on to the next node.

    Args:
        ssh_client (paramiko.SSHClient): SSH client used to connect to each
            node in turn. It is closed after each node is processed.
        ncns (list): A list of node names (NCNs) to configure.

    Raises:
        SystemExit: Propagated from get_disk_entries() when the boot order
            cannot be read from a node. SystemExit is not a subclass of
            Exception, so the handler below does not swallow it.
    """
    for ncn in ncns:
        try:
            ssh_client.connect(ncn)
        except (SSHException, socket.error) as err:
            LOGGER.warning('Unable to connect to node "%s": %s', ncn, err)
            continue

        try:
            disk_entries = get_disk_entries(ssh_client, ncn)
            if not disk_entries:
                LOGGER.error('Value Error: No disk boot entries found for %s', ncn)
                continue

            # Consider the first disk entry
            boot_num = next(iter(disk_entries.values()))
            LOGGER.info('New Boot Order for %s: %s', ncn, boot_num)

            # Set the next boot device
            _, stdout, stderr = ssh_client.exec_command(f'efibootmgr -n {boot_num}')
            # Drain both streams before collecting the exit status so the
            # remote command is never left blocked on unread output.
            stdout.read()
            error = stderr.read().decode().strip()
            # Judge success by exit status; stderr output alone may only be
            # a warning.
            exit_status = stdout.channel.recv_exit_status()
            if exit_status != 0:
                LOGGER.error('Failed to set next boot device for %s: %s',
                             ncn, error or f'exit status {exit_status}')
                continue

            LOGGER.info('Successfully set next boot device to disk for %s', ncn)
        except Exception as err:
            # Per-node boundary: log and continue with the remaining nodes
            # rather than aborting the whole operation.
            LOGGER.error('Failed to set next boot device for %s: %s', ncn, err)
        finally:
            # Runs on every path above, including the ``continue`` branches.
            ssh_client.close()


def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeout, ipmi_timeout):
"""Power off NCNs.
Expand All @@ -283,6 +367,8 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo
# Shutdown workers
worker_ncns = other_ncns_by_role.get('workers', [])
if worker_ncns:
# Set boot device to Disk
set_next_boot_device_to_disk(ssh_client, worker_ncns)
try:
with IPMIConsoleLogger(worker_ncns, username, password):
LOGGER.info(f'Shutting down worker NCNs: {", ".join(worker_ncns)}')
Expand All @@ -300,6 +386,8 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo
# Shutdown managers (except ncn-m001)
manager_ncns = other_ncns_by_role.get('managers', [])
if manager_ncns:
# Set boot device to Disk
set_next_boot_device_to_disk(ssh_client, manager_ncns)
try:
with IPMIConsoleLogger(manager_ncns, username, password):
LOGGER.info(f'Shutting down manager NCNs: {", ".join(manager_ncns)}')
Expand All @@ -324,6 +412,9 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo
# Freeze Ceph on storage nodes and then shutdown
storage_ncns = other_ncns_by_role.get('storage', [])
if storage_ncns:
# Set boot device to Disk
set_next_boot_device_to_disk(ssh_client, storage_ncns)

LOGGER.info(f'Freezing Ceph and shutting down storage NCNs: {", ".join(storage_ncns)}')
try:
do_ceph_freeze()
Expand All @@ -332,6 +423,7 @@ def do_mgmt_shutdown_power(username, password, excluded_ncns, ncn_shutdown_timeo
ssh_client.close()
raise SystemExit(1)
LOGGER.info('Ceph freeze completed successfully on storage NCNs.')

try:
with IPMIConsoleLogger(storage_ncns, username, password):
start_shutdown(storage_ncns, ssh_client)
Expand Down

0 comments on commit 9ffb1e6

Please sign in to comment.