Skip to content

Commit

Permalink
CRAYSAT-1948: Fix traceback in sat swap blade
Browse files Browse the repository at this point in the history
Fix a traceback that occurs in `sat swap blade` when the node classes
reported by HSM includes both "Mountain" and "Hill" for a single blade.

This probably indicates some sort of configuration problem in HSM, but
the behavior of `sat swap blade` is the same for both Mountain and Hill,
so we can log a warning and proceed in this case.

Test Description:
Existing and new unit tests validate the new logic in `blade_class`
property.

Tested the `sat swap blade --action disable` command on baldar, which
was exhibiting the issue where a single blade had some nodes reporting
as "Hill" and some as "Mountain" in HSM. The command worked as expected.
  • Loading branch information
haasken-hpe committed Dec 18, 2024
1 parent 382ffe0 commit 5cae513
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 38 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [3.33.8] - 2024-12-18

### Fixed
- Fixed a traceback that occurs in `sat swap blade` when the node classes
reported by HSM includes both "Mountain" and "Hill" for a single blade.

## [3.33.7] - 2024-12-18

### Fixed
Expand Down
78 changes: 51 additions & 27 deletions sat/cli/swap/blade.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
"""

import abc
import enum
from functools import wraps
import json
import logging
Expand Down Expand Up @@ -55,6 +56,11 @@
inf = inflect.engine()


class BladeClass(enum.Enum):
LIQUID = 'liquid-cooled'
AIR = 'air-cooled'


def get_available_file(prefix, extension):
"""Get a file object which doesn't overwrite existing files
Expand Down Expand Up @@ -173,28 +179,44 @@ def blade_node_bmcs(self):

@cached_property
def blade_class(self):
"""str: the class of the blade ("mountain" or "river")
"""BladeClass: the class of the blade (BladeClass.LIQUID or BladeClass.AIR)
"Mountain" and "Hill" blades are liquid-cooled, while "River" blades are
air-cooled.
Raises:
BladeSwapError: if there is a problem determining the slot class
"""
try:
node_classes = set()
for node in self.blade_nodes:
err_prefix = f'Could not determine blade class for {self.xname}'
node_classes = set()
for node in self.blade_nodes:
try:
node_classes.add(node['Class'])

# These error cases shouldn't happen, but be defensive anyway.
if len(node_classes) > 1:
reason = f'multiple node classes on blade {self.xname}: {", ".join(node_classes)}'
raise BladeSwapError(f'Could not determine slot class: {reason}')
elif not node_classes:
reason = f'no nodes on blade {self.xname}'
raise BladeSwapError(f'Could not determine slot class: {reason}')
except KeyError as err:
raise BladeSwapError(f'{err_prefix}: node {node["ID"]} is '
f'missing {err} field in HSM')

if len(node_classes) > 1:
if set(node_class.lower() for node_class in node_classes) == {'mountain', 'hill'}:
# This mismatch has been observed, and it doesn't change the procedure,
# so we'll just log it and continue.
LOGGER.warning('Multiple node classes detected on blade %s: %s',
self.xname, ', '.join(node_classes))
return BladeClass.LIQUID
else:
return node_classes.pop().lower()

except KeyError as err:
raise BladeSwapError(f'Node {node["ID"]} is missing {err} field in HSM')
reason = f'incompatible node classes: {", ".join(node_classes)}'
raise BladeSwapError(f'{err_prefix}: {reason}')
elif not node_classes:
raise BladeSwapError(f'{err_prefix}: no nodes on blade')

# In this case, there is just one node class on the blade
hsm_class = node_classes.pop()
if hsm_class.lower() in ('mountain', 'hill'):
return BladeClass.LIQUID
elif hsm_class.lower() == 'river':
return BladeClass.AIR
else:
raise BladeSwapError(f'{err_prefix}: unsupported blade class "{hsm_class}"')

@abc.abstractmethod
def procedure(self):
Expand All @@ -215,9 +237,11 @@ def run(self):
Raises:
SystemExit: if a BladeSwapError is raised during execution of the blade swap procedure
"""
if self.blade_class not in {'mountain', 'hill', 'river'}:
LOGGER.error('Unsupported blade class "%s" for blade %s; aborting.',
self.blade_class.title(), self.xname)
try:
# This property can raise a BladeSwapError
_ = self.blade_class
except BladeSwapError as err:
LOGGER.error(str(err))
raise SystemExit(1)

try:
Expand Down Expand Up @@ -314,7 +338,7 @@ def power_off_slot(self):
Raises:
BladeSwapError: if there is a problem powering off the slot with PCS
"""
if self.blade_class == 'river':
if self.blade_class == BladeClass.AIR:
# Power off nodes on the blade individually on River blades
xnames_on = self.pcs_client.get_xnames_power_state(
[node['ID'] for node in self.blade_nodes]
Expand Down Expand Up @@ -416,7 +440,7 @@ def prompt_clear_node_controller_settings(self):
# CRAYSAT-1373: Do this automatically with SCSD once CASMHMS-5447 is
# completed.

if self.blade_class in ('mountain', 'hill'):
if self.blade_class == BladeClass.LIQUID:
commands = []
for node_bmc in self.blade_node_bmcs:
commands.append(
Expand Down Expand Up @@ -456,7 +480,7 @@ def delete_ethernet_interfaces(self):
# well. Mountain blades should *only* have the node ethernet interfaces
# deleted.

if self.blade_class == 'river':
if self.blade_class == BladeClass.AIR:
interface_ids_to_delete |= set(
iface['ID']
for node in self.blade_node_bmcs
Expand Down Expand Up @@ -494,7 +518,7 @@ def procedure(self):

self.suspend_hms_discovery_cron_job()

if self.blade_class in ('mountain', 'hill'):
if self.blade_class == BladeClass.LIQUID:
self.mountain_procedure()
else:
self.river_procedure()
Expand Down Expand Up @@ -571,7 +595,7 @@ def power_on_slot(self):
BladeSwapError: if the slot cannot be powered on
"""
params = {'recursive': True}
if self.blade_class == 'river':
if self.blade_class == BladeClass.AIR:
params['force'] = True
self.pcs_client.set_xnames_power_state([self.xname], 'on', **params)

Expand Down Expand Up @@ -599,7 +623,7 @@ def begin_slot_discovery(self):
try:
chassis_bmc = self.hsm_client.query_components(chassis_xname, type='ChassisBMC')[0]
except (APIError, IndexError):
if self.blade_class in ('mountain', 'hill'):
if self.blade_class == BladeClass.LIQUID:
LOGGER.warning('Could not locate ChassisBMC for chassis %s; waiting '
'for hms-discovery to discover slot',
chassis_xname)
Expand Down Expand Up @@ -736,14 +760,14 @@ def procedure(self):
if self.src_mapping and self.dst_mapping:
self.map_ip_mac_addresses()

if self.blade_class in ('mountain', 'hill'):
if self.blade_class == BladeClass.LIQUID:
self.enable_slot()
self.power_on_slot()

self.resume_hms_discovery_cron_job()
self.begin_slot_discovery()

if self.blade_class in ('mountain', 'hill'):
if self.blade_class == BladeClass.LIQUID:
self.wait_for_chassisbmc_endpoints()

self.wait_for_nodebmc_endpoints()
Expand Down
56 changes: 45 additions & 11 deletions tests/cli/swap/test_blade.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from sat.apiclient.pcs import PCSClient
from sat.cli.swap.blade import (
blade_swap_stage,
BladeClass,
BladeSwapError,
BladeSwapProcedure,
RedfishEndpointDiscoveryWaiter,
Expand Down Expand Up @@ -182,7 +183,8 @@ def setUp(self):

self.mock_cron = patch('sat.cli.swap.blade.HMSDiscoveryCronJob').start()

self.mock_blade_class_patcher = patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', 'mountain')
self.mock_blade_class_patcher = patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class',
BladeClass.LIQUID)
self.mock_blade_class_patcher.start()

def tearDown(self):
Expand Down Expand Up @@ -245,14 +247,46 @@ def setUp(self):

def test_detecting_mountain_blade(self):
"""Test detecting a Mountain blade"""
self.assertEqual(self.swap_out.blade_class, 'mountain')
self.assertEqual(BladeClass.LIQUID, self.swap_out.blade_class)

def test_detecting_hill_blade(self):
"""Test detecting a Hill blade"""
for node in self.nodes:
node['Class'] = 'Hill'

self.assertEqual(BladeClass.LIQUID, self.swap_out.blade_class)

def test_detecting_mixed_hill_mountain_blade(self):
"""Test detecting a blade with both Hill and Mountain nodes"""
self.nodes[0]['Class'] = 'Hill'

with self.assertLogs(level='WARNING') as logs_cm:
self.assertEqual(BladeClass.LIQUID, self.swap_out.blade_class)

self.assertEqual(len(logs_cm.output), 1)
self.assertIn('Multiple node classes detected on blade', logs_cm.records[0].message)

def test_detecting_river_blade(self):
"""Test detecting a River blade"""
for node in self.nodes:
node['Class'] = 'River'

self.assertEqual(self.swap_out.blade_class, 'river')
self.assertEqual(BladeClass.AIR, self.swap_out.blade_class)

def test_detecting_unrecognized_blade(self):
"""Test detecting an unrecognized blade"""
for node in self.nodes:
node['Class'] = 'Unrecognized'

with self.assertRaisesRegex(BladeSwapError, 'unsupported blade class "Unrecognized"'):
_ = self.swap_out.blade_class

def test_incompatible_blade_class(self):
"""Test that BladeSwapError is raised if the blade class is incompatible"""
self.nodes[0]['Class'] = 'River'

with self.assertRaisesRegex(BladeSwapError, 'incompatible node classes'):
_ = self.swap_out.blade_class


class TestDisablingRedfishEndpoints(BaseBladeSwapProcedureTest):
Expand Down Expand Up @@ -367,7 +401,7 @@ class TestPowerOffSlot(BaseBladeSwapProcedureTest):
"""Tests for the powering off slot stage"""

def test_slot_power_off_command_sent_mountain_blades(self):
"""Test that the slot power off command is sent properly for Mountain blades"""
"""Test that the slot power off command is sent properly for liquid-cooled blades"""
self.swap_out.power_off_slot()
self.mock_pcs_client.set_xnames_power_state.assert_called_once_with(
[self.blade_xname],
Expand All @@ -377,9 +411,9 @@ def test_slot_power_off_command_sent_mountain_blades(self):
)

def test_slot_power_off_command_sent_river_blades(self):
"""Test that the slot power off command is sent properly for River blades"""
"""Test that the slot power off command is sent properly for air-cooled blades"""
self.mock_blade_class_patcher.stop()
patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', 'river').start()
patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', BladeClass.AIR).start()

node_xnames = [n['ID'] for n in self.nodes]
self.mock_pcs_client.get_xnames_power_state.return_value = {'on': node_xnames}
Expand Down Expand Up @@ -590,7 +624,7 @@ def test_begin_slot_discovery_no_warning_if_river_blade(self):
self.mock_hsm_client.query_components.return_value = []

self.mock_blade_class_patcher.stop()
patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', 'river').start()
patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', BladeClass.AIR).start()

with self.assertLogs(level='WARNING') as cm:
self.swap_in.begin_slot_discovery()
Expand Down Expand Up @@ -815,14 +849,14 @@ def setUp(self):
self.mock_swapinprocedure_obj = MagicMock()

def test_swap_in_mountain(self):
"""Test swapping in a mountain blade discovers and waits for ChassisBMCs"""
self.mock_swapinprocedure_obj.blade_class = 'mountain'
"""Test swapping in a liquid-cooled blade discovers and waits for ChassisBMCs"""
self.mock_swapinprocedure_obj.blade_class = BladeClass.LIQUID
SwapInProcedure.procedure(self.mock_swapinprocedure_obj)
self.mock_swapinprocedure_obj.wait_for_chassisbmc_endpoints.assert_called()

def test_swap_in_river(self):
"""Test swapping in a river blade does not wait for ChassisBMCs"""
self.mock_swapinprocedure_obj.blade_class = 'river'
"""Test swapping in an air-cooled blade does not wait for ChassisBMCs"""
self.mock_swapinprocedure_obj.blade_class = BladeClass.AIR
SwapInProcedure.procedure(self.mock_swapinprocedure_obj)
self.mock_swapinprocedure_obj.wait_for_chassisbmc_endpoints.assert_not_called()

Expand Down

0 comments on commit 5cae513

Please sign in to comment.