Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport release/3.33] CRAYSAT-1948: Fix traceback in sat swap blade #308

Merged
merged 1 commit into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [3.33.8] - 2024-12-18

### Fixed
- Fixed a traceback that occurs in `sat swap blade` when the node classes
reported by HSM includes both "Mountain" and "Hill" for a single blade.

## [3.33.7] - 2024-12-18

### Fixed
Expand Down
78 changes: 51 additions & 27 deletions sat/cli/swap/blade.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"""

import abc
import enum
from functools import wraps
import json
import logging
Expand Down Expand Up @@ -55,6 +56,11 @@
inf = inflect.engine()


class BladeClass(enum.Enum):
LIQUID = 'liquid-cooled'
AIR = 'air-cooled'


def get_available_file(prefix, extension):
"""Get a file object which doesn't overwrite existing files

Expand Down Expand Up @@ -173,28 +179,44 @@ def blade_node_bmcs(self):

@cached_property
def blade_class(self):
"""str: the class of the blade ("mountain" or "river")
"""BladeClass: the class of the blade (BladeClass.LIQUID or BladeClass.AIR)

"Mountain" and "Hill" blades are liquid-cooled, while "River" blades are
air-cooled.

Raises:
BladeSwapError: if there is a problem determining the slot class
"""
try:
node_classes = set()
for node in self.blade_nodes:
err_prefix = f'Could not determine blade class for {self.xname}'
node_classes = set()
for node in self.blade_nodes:
try:
node_classes.add(node['Class'])

# These error cases shouldn't happen, but be defensive anyway.
if len(node_classes) > 1:
reason = f'multiple node classes on blade {self.xname}: {", ".join(node_classes)}'
raise BladeSwapError(f'Could not determine slot class: {reason}')
elif not node_classes:
reason = f'no nodes on blade {self.xname}'
raise BladeSwapError(f'Could not determine slot class: {reason}')
except KeyError as err:
raise BladeSwapError(f'{err_prefix}: node {node["ID"]} is '
f'missing {err} field in HSM')

if len(node_classes) > 1:
if set(node_class.lower() for node_class in node_classes) == {'mountain', 'hill'}:
# This mismatch has been observed, and it doesn't change the procedure,
# so we'll just log it and continue.
LOGGER.warning('Multiple node classes detected on blade %s: %s',
self.xname, ', '.join(node_classes))
return BladeClass.LIQUID
else:
return node_classes.pop().lower()

except KeyError as err:
raise BladeSwapError(f'Node {node["ID"]} is missing {err} field in HSM')
reason = f'incompatible node classes: {", ".join(node_classes)}'
raise BladeSwapError(f'{err_prefix}: {reason}')
elif not node_classes:
raise BladeSwapError(f'{err_prefix}: no nodes on blade')

# In this case, there is just one node class on the blade
hsm_class = node_classes.pop()
if hsm_class.lower() in ('mountain', 'hill'):
return BladeClass.LIQUID
elif hsm_class.lower() == 'river':
return BladeClass.AIR
else:
raise BladeSwapError(f'{err_prefix}: unsupported blade class "{hsm_class}"')

@abc.abstractmethod
def procedure(self):
Expand All @@ -215,9 +237,11 @@ def run(self):
Raises:
SystemExit: if a BladeSwapError is raised during execution of the blade swap procedure
"""
if self.blade_class not in {'mountain', 'hill', 'river'}:
LOGGER.error('Unsupported blade class "%s" for blade %s; aborting.',
self.blade_class.title(), self.xname)
try:
# This property can raise a BladeSwapError
_ = self.blade_class
except BladeSwapError as err:
LOGGER.error(str(err))
raise SystemExit(1)

try:
Expand Down Expand Up @@ -314,7 +338,7 @@ def power_off_slot(self):
Raises:
BladeSwapError: if there is a problem powering off the slot with PCS
"""
if self.blade_class == 'river':
if self.blade_class == BladeClass.AIR:
# Power off nodes on the blade individually on River blades
xnames_on = self.pcs_client.get_xnames_power_state(
[node['ID'] for node in self.blade_nodes]
Expand Down Expand Up @@ -416,7 +440,7 @@ def prompt_clear_node_controller_settings(self):
# CRAYSAT-1373: Do this automatically with SCSD once CASMHMS-5447 is
# completed.

if self.blade_class in ('mountain', 'hill'):
if self.blade_class == BladeClass.LIQUID:
commands = []
for node_bmc in self.blade_node_bmcs:
commands.append(
Expand Down Expand Up @@ -456,7 +480,7 @@ def delete_ethernet_interfaces(self):
# well. Mountain blades should *only* have the node ethernet interfaces
# deleted.

if self.blade_class == 'river':
if self.blade_class == BladeClass.AIR:
interface_ids_to_delete |= set(
iface['ID']
for node in self.blade_node_bmcs
Expand Down Expand Up @@ -494,7 +518,7 @@ def procedure(self):

self.suspend_hms_discovery_cron_job()

if self.blade_class in ('mountain', 'hill'):
if self.blade_class == BladeClass.LIQUID:
self.mountain_procedure()
else:
self.river_procedure()
Expand Down Expand Up @@ -571,7 +595,7 @@ def power_on_slot(self):
BladeSwapError: if the slot cannot be powered on
"""
params = {'recursive': True}
if self.blade_class == 'river':
if self.blade_class == BladeClass.AIR:
params['force'] = True
self.pcs_client.set_xnames_power_state([self.xname], 'on', **params)

Expand Down Expand Up @@ -599,7 +623,7 @@ def begin_slot_discovery(self):
try:
chassis_bmc = self.hsm_client.query_components(chassis_xname, type='ChassisBMC')[0]
except (APIError, IndexError):
if self.blade_class in ('mountain', 'hill'):
if self.blade_class == BladeClass.LIQUID:
LOGGER.warning('Could not locate ChassisBMC for chassis %s; waiting '
'for hms-discovery to discover slot',
chassis_xname)
Expand Down Expand Up @@ -736,14 +760,14 @@ def procedure(self):
if self.src_mapping and self.dst_mapping:
self.map_ip_mac_addresses()

if self.blade_class in ('mountain', 'hill'):
if self.blade_class == BladeClass.LIQUID:
self.enable_slot()
self.power_on_slot()

self.resume_hms_discovery_cron_job()
self.begin_slot_discovery()

if self.blade_class in ('mountain', 'hill'):
if self.blade_class == BladeClass.LIQUID:
self.wait_for_chassisbmc_endpoints()

self.wait_for_nodebmc_endpoints()
Expand Down
56 changes: 45 additions & 11 deletions tests/cli/swap/test_blade.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from sat.apiclient.pcs import PCSClient
from sat.cli.swap.blade import (
blade_swap_stage,
BladeClass,
BladeSwapError,
BladeSwapProcedure,
RedfishEndpointDiscoveryWaiter,
Expand Down Expand Up @@ -182,7 +183,8 @@ def setUp(self):

self.mock_cron = patch('sat.cli.swap.blade.HMSDiscoveryCronJob').start()

self.mock_blade_class_patcher = patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', 'mountain')
self.mock_blade_class_patcher = patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class',
BladeClass.LIQUID)
self.mock_blade_class_patcher.start()

def tearDown(self):
Expand Down Expand Up @@ -245,14 +247,46 @@ def setUp(self):

def test_detecting_mountain_blade(self):
"""Test detecting a Mountain blade"""
self.assertEqual(self.swap_out.blade_class, 'mountain')
self.assertEqual(BladeClass.LIQUID, self.swap_out.blade_class)

def test_detecting_hill_blade(self):
"""Test detecting a Hill blade"""
for node in self.nodes:
node['Class'] = 'Hill'

self.assertEqual(BladeClass.LIQUID, self.swap_out.blade_class)

def test_detecting_mixed_hill_mountain_blade(self):
"""Test detecting a blade with both Hill and Mountain nodes"""
self.nodes[0]['Class'] = 'Hill'

with self.assertLogs(level='WARNING') as logs_cm:
self.assertEqual(BladeClass.LIQUID, self.swap_out.blade_class)

self.assertEqual(len(logs_cm.output), 1)
self.assertIn('Multiple node classes detected on blade', logs_cm.records[0].message)

def test_detecting_river_blade(self):
"""Test detecting a River blade"""
for node in self.nodes:
node['Class'] = 'River'

self.assertEqual(self.swap_out.blade_class, 'river')
self.assertEqual(BladeClass.AIR, self.swap_out.blade_class)

def test_detecting_unrecognized_blade(self):
"""Test detecting an unrecognized blade"""
for node in self.nodes:
node['Class'] = 'Unrecognized'

with self.assertRaisesRegex(BladeSwapError, 'unsupported blade class "Unrecognized"'):
_ = self.swap_out.blade_class

def test_incompatible_blade_class(self):
"""Test that BladeSwapError is raised if the blade class is incompatible"""
self.nodes[0]['Class'] = 'River'

with self.assertRaisesRegex(BladeSwapError, 'incompatible node classes'):
_ = self.swap_out.blade_class


class TestDisablingRedfishEndpoints(BaseBladeSwapProcedureTest):
Expand Down Expand Up @@ -367,7 +401,7 @@ class TestPowerOffSlot(BaseBladeSwapProcedureTest):
"""Tests for the powering off slot stage"""

def test_slot_power_off_command_sent_mountain_blades(self):
"""Test that the slot power off command is sent properly for Mountain blades"""
"""Test that the slot power off command is sent properly for liquid-cooled blades"""
self.swap_out.power_off_slot()
self.mock_pcs_client.set_xnames_power_state.assert_called_once_with(
[self.blade_xname],
Expand All @@ -377,9 +411,9 @@ def test_slot_power_off_command_sent_mountain_blades(self):
)

def test_slot_power_off_command_sent_river_blades(self):
"""Test that the slot power off command is sent properly for River blades"""
"""Test that the slot power off command is sent properly for air-cooled blades"""
self.mock_blade_class_patcher.stop()
patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', 'river').start()
patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', BladeClass.AIR).start()

node_xnames = [n['ID'] for n in self.nodes]
self.mock_pcs_client.get_xnames_power_state.return_value = {'on': node_xnames}
Expand Down Expand Up @@ -590,7 +624,7 @@ def test_begin_slot_discovery_no_warning_if_river_blade(self):
self.mock_hsm_client.query_components.return_value = []

self.mock_blade_class_patcher.stop()
patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', 'river').start()
patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', BladeClass.AIR).start()

with self.assertLogs(level='WARNING') as cm:
self.swap_in.begin_slot_discovery()
Expand Down Expand Up @@ -815,14 +849,14 @@ def setUp(self):
self.mock_swapinprocedure_obj = MagicMock()

def test_swap_in_mountain(self):
"""Test swapping in a mountain blade discovers and waits for ChassisBMCs"""
self.mock_swapinprocedure_obj.blade_class = 'mountain'
"""Test swapping in a liquid-cooled blade discovers and waits for ChassisBMCs"""
self.mock_swapinprocedure_obj.blade_class = BladeClass.LIQUID
SwapInProcedure.procedure(self.mock_swapinprocedure_obj)
self.mock_swapinprocedure_obj.wait_for_chassisbmc_endpoints.assert_called()

def test_swap_in_river(self):
"""Test swapping in a river blade does not wait for ChassisBMCs"""
self.mock_swapinprocedure_obj.blade_class = 'river'
"""Test swapping in an air-cooled blade does not wait for ChassisBMCs"""
self.mock_swapinprocedure_obj.blade_class = BladeClass.AIR
SwapInProcedure.procedure(self.mock_swapinprocedure_obj)
self.mock_swapinprocedure_obj.wait_for_chassisbmc_endpoints.assert_not_called()

Expand Down
Loading