From 5314486a9c0b37cf2f1a143a5ee61ec5d5e3ad2b Mon Sep 17 00:00:00 2001 From: Ryan Haasken Date: Mon, 16 Dec 2024 17:40:52 -0600 Subject: [PATCH] CRAYSAT-1948: Fix traceback in `sat swap blade` Fix a traceback that occurs in `sat swap blade` when the node classes reported by HSM includes both "Mountain" and "Hill" for a single blade. This probably indicates some sort of configuration problem in HSM, but the behavior of `sat swap blade` is the same for both Mountain and Hill, so we can log a warning and proceed in this case. Test Description: Existing and new unit tests validate the new logic in `blade_class` property. Tested the `sat swap blade --action disable` command on baldar, which was exhibiting the issue where a single blade had some nodes reporting as "Hill" and some as "Mountain" in HSM. The command worked as expected. (cherry picked from commit 5cae51386c0c1398c217c3be2fb92d7604a9246d) --- CHANGELOG.md | 6 +++ sat/cli/swap/blade.py | 78 +++++++++++++++++++++++------------- tests/cli/swap/test_blade.py | 56 +++++++++++++++++++++----- 3 files changed, 102 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9475f38..bcb80a8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [3.33.8] - 2024-12-18 + +### Fixed +- Fixed a traceback that occurs in `sat swap blade` when the node classes + reported by HSM includes both "Mountain" and "Hill" for a single blade. + ## [3.33.7] - 2024-12-18 ### Fixed diff --git a/sat/cli/swap/blade.py b/sat/cli/swap/blade.py index 8fc4eaf5..9d55d7c7 100644 --- a/sat/cli/swap/blade.py +++ b/sat/cli/swap/blade.py @@ -28,6 +28,7 @@ """ import abc +import enum from functools import wraps import json import logging @@ -55,6 +56,11 @@ inf = inflect.engine() +class BladeClass(enum.Enum): + LIQUID = 'liquid-cooled' + AIR = 'air-cooled' + + def get_available_file(prefix, extension): """Get a file object which doesn't overwrite existing files @@ -173,28 +179,44 @@ def blade_node_bmcs(self): @cached_property def blade_class(self): - """str: the class of the blade ("mountain" or "river") + """BladeClass: the class of the blade (BladeClass.LIQUID or BladeClass.AIR) + + "Mountain" and "Hill" blades are liquid-cooled, while "River" blades are + air-cooled. Raises: BladeSwapError: if there is a problem determining the slot class """ - try: - node_classes = set() - for node in self.blade_nodes: + err_prefix = f'Could not determine blade class for {self.xname}' + node_classes = set() + for node in self.blade_nodes: + try: node_classes.add(node['Class']) - - # These error cases shouldn't happen, but be defensive anyway. - if len(node_classes) > 1: - reason = f'multiple node classes on blade {self.xname}: {", ".join(node_classes)}' - raise BladeSwapError(f'Could not determine slot class: {reason}') - elif not node_classes: - reason = f'no nodes on blade {self.xname}' - raise BladeSwapError(f'Could not determine slot class: {reason}') + except KeyError as err: + raise BladeSwapError(f'{err_prefix}: node {node["ID"]} is ' + f'missing {err} field in HSM') + + if len(node_classes) > 1: + if set(node_class.lower() for node_class in node_classes) == {'mountain', 'hill'}: + # This mismatch has been observed, and it doesn't change the procedure, + # so we'll just log it and continue. + LOGGER.warning('Multiple node classes detected on blade %s: %s', + self.xname, ', '.join(node_classes)) + return BladeClass.LIQUID else: - return node_classes.pop().lower() - - except KeyError as err: - raise BladeSwapError(f'Node {node["ID"]} is missing {err} field in HSM') + reason = f'incompatible node classes: {", ".join(node_classes)}' + raise BladeSwapError(f'{err_prefix}: {reason}') + elif not node_classes: + raise BladeSwapError(f'{err_prefix}: no nodes on blade') + + # In this case, there is just one node class on the blade + hsm_class = node_classes.pop() + if hsm_class.lower() in ('mountain', 'hill'): + return BladeClass.LIQUID + elif hsm_class.lower() == 'river': + return BladeClass.AIR + else: + raise BladeSwapError(f'{err_prefix}: unsupported blade class "{hsm_class}"') @abc.abstractmethod def procedure(self): @@ -215,9 +237,11 @@ def run(self): Raises: SystemExit: if a BladeSwapError is raised during execution of the blade swap procedure """ - if self.blade_class not in {'mountain', 'hill', 'river'}: - LOGGER.error('Unsupported blade class "%s" for blade %s; aborting.', - self.blade_class.title(), self.xname) + try: + # This property can raise a BladeSwapError + _ = self.blade_class + except BladeSwapError as err: + LOGGER.error(str(err)) raise SystemExit(1) try: @@ -314,7 +338,7 @@ def power_off_slot(self): Raises: BladeSwapError: if there is a problem powering off the slot with PCS """ - if self.blade_class == 'river': + if self.blade_class == BladeClass.AIR: # Power off nodes on the blade individually on River blades xnames_on = self.pcs_client.get_xnames_power_state( [node['ID'] for node in self.blade_nodes] @@ -416,7 +440,7 @@ def prompt_clear_node_controller_settings(self): # CRAYSAT-1373: Do this automatically with SCSD once CASMHMS-5447 is # completed. - if self.blade_class in ('mountain', 'hill'): + if self.blade_class == BladeClass.LIQUID: commands = [] for node_bmc in self.blade_node_bmcs: commands.append( @@ -456,7 +480,7 @@ def delete_ethernet_interfaces(self): # well. Mountain blades should *only* have the node ethernet interfaces # deleted. - if self.blade_class == 'river': + if self.blade_class == BladeClass.AIR: interface_ids_to_delete |= set( iface['ID'] for node in self.blade_node_bmcs @@ -494,7 +518,7 @@ def procedure(self): self.suspend_hms_discovery_cron_job() - if self.blade_class in ('mountain', 'hill'): + if self.blade_class == BladeClass.LIQUID: self.mountain_procedure() else: self.river_procedure() @@ -571,7 +595,7 @@ def power_on_slot(self): BladeSwapError: if the slot cannot be powered on """ params = {'recursive': True} - if self.blade_class == 'river': + if self.blade_class == BladeClass.AIR: params['force'] = True self.pcs_client.set_xnames_power_state([self.xname], 'on', **params) @@ -599,7 +623,7 @@ def begin_slot_discovery(self): try: chassis_bmc = self.hsm_client.query_components(chassis_xname, type='ChassisBMC')[0] except (APIError, IndexError): - if self.blade_class in ('mountain', 'hill'): + if self.blade_class == BladeClass.LIQUID: LOGGER.warning('Could not locate ChassisBMC for chassis %s; waiting ' 'for hms-discovery to discover slot', chassis_xname) @@ -736,14 +760,14 @@ def procedure(self): if self.src_mapping and self.dst_mapping: self.map_ip_mac_addresses() - if self.blade_class in ('mountain', 'hill'): + if self.blade_class == BladeClass.LIQUID: self.enable_slot() self.power_on_slot() self.resume_hms_discovery_cron_job() self.begin_slot_discovery() - if self.blade_class in ('mountain', 'hill'): + if self.blade_class == BladeClass.LIQUID: self.wait_for_chassisbmc_endpoints() self.wait_for_nodebmc_endpoints() diff --git a/tests/cli/swap/test_blade.py b/tests/cli/swap/test_blade.py index 5ab43cfa..387229c0 100644 --- a/tests/cli/swap/test_blade.py +++ b/tests/cli/swap/test_blade.py @@ -39,6 +39,7 @@ from sat.apiclient.pcs import PCSClient from sat.cli.swap.blade import ( blade_swap_stage, + BladeClass, BladeSwapError, BladeSwapProcedure, RedfishEndpointDiscoveryWaiter, @@ -182,7 +183,8 @@ def setUp(self): self.mock_cron = patch('sat.cli.swap.blade.HMSDiscoveryCronJob').start() - self.mock_blade_class_patcher = patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', 'mountain') + self.mock_blade_class_patcher = patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', + BladeClass.LIQUID) self.mock_blade_class_patcher.start() def tearDown(self): @@ -245,14 +247,46 @@ def setUp(self): def test_detecting_mountain_blade(self): """Test detecting a Mountain blade""" - self.assertEqual(self.swap_out.blade_class, 'mountain') + self.assertEqual(BladeClass.LIQUID, self.swap_out.blade_class) + + def test_detecting_hill_blade(self): + """Test detecting a Hill blade""" + for node in self.nodes: + node['Class'] = 'Hill' + + self.assertEqual(BladeClass.LIQUID, self.swap_out.blade_class) + + def test_detecting_mixed_hill_mountain_blade(self): + """Test detecting a blade with both Hill and Mountain nodes""" + self.nodes[0]['Class'] = 'Hill' + + with self.assertLogs(level='WARNING') as logs_cm: + self.assertEqual(BladeClass.LIQUID, self.swap_out.blade_class) + + self.assertEqual(len(logs_cm.output), 1) + self.assertIn('Multiple node classes detected on blade', logs_cm.records[0].message) def test_detecting_river_blade(self): """Test detecting a River blade""" for node in self.nodes: node['Class'] = 'River' - self.assertEqual(self.swap_out.blade_class, 'river') + self.assertEqual(BladeClass.AIR, self.swap_out.blade_class) + + def test_detecting_unrecognized_blade(self): + """Test detecting an unrecognized blade""" + for node in self.nodes: + node['Class'] = 'Unrecognized' + + with self.assertRaisesRegex(BladeSwapError, 'unsupported blade class "Unrecognized"'): + _ = self.swap_out.blade_class + + def test_incompatible_blade_class(self): + """Test that BladeSwapError is raised if the blade class is incompatible""" + self.nodes[0]['Class'] = 'River' + + with self.assertRaisesRegex(BladeSwapError, 'incompatible node classes'): + _ = self.swap_out.blade_class class TestDisablingRedfishEndpoints(BaseBladeSwapProcedureTest): @@ -367,7 +401,7 @@ class TestPowerOffSlot(BaseBladeSwapProcedureTest): """Tests for the powering off slot stage""" def test_slot_power_off_command_sent_mountain_blades(self): - """Test that the slot power off command is sent properly for Mountain blades""" + """Test that the slot power off command is sent properly for liquid-cooled blades""" self.swap_out.power_off_slot() self.mock_pcs_client.set_xnames_power_state.assert_called_once_with( [self.blade_xname], @@ -377,9 +411,9 @@ def test_slot_power_off_command_sent_mountain_blades(self): ) def test_slot_power_off_command_sent_river_blades(self): - """Test that the slot power off command is sent properly for River blades""" + """Test that the slot power off command is sent properly for air-cooled blades""" self.mock_blade_class_patcher.stop() - patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', 'river').start() + patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', BladeClass.AIR).start() node_xnames = [n['ID'] for n in self.nodes] self.mock_pcs_client.get_xnames_power_state.return_value = {'on': node_xnames} @@ -590,7 +624,7 @@ def test_begin_slot_discovery_no_warning_if_river_blade(self): self.mock_hsm_client.query_components.return_value = [] self.mock_blade_class_patcher.stop() - patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', 'river').start() + patch('sat.cli.swap.blade.BladeSwapProcedure.blade_class', BladeClass.AIR).start() with self.assertLogs(level='WARNING') as cm: self.swap_in.begin_slot_discovery() @@ -815,14 +849,14 @@ def setUp(self): self.mock_swapinprocedure_obj = MagicMock() def test_swap_in_mountain(self): - """Test swapping in a mountain blade discovers and waits for ChassisBMCs""" - self.mock_swapinprocedure_obj.blade_class = 'mountain' + """Test swapping in a liquid-cooled blade discovers and waits for ChassisBMCs""" + self.mock_swapinprocedure_obj.blade_class = BladeClass.LIQUID SwapInProcedure.procedure(self.mock_swapinprocedure_obj) self.mock_swapinprocedure_obj.wait_for_chassisbmc_endpoints.assert_called() def test_swap_in_river(self): - """Test swapping in a river blade does not wait for ChassisBMCs""" - self.mock_swapinprocedure_obj.blade_class = 'river' + """Test swapping in an air-cooled blade does not wait for ChassisBMCs""" + self.mock_swapinprocedure_obj.blade_class = BladeClass.AIR SwapInProcedure.procedure(self.mock_swapinprocedure_obj) self.mock_swapinprocedure_obj.wait_for_chassisbmc_endpoints.assert_not_called()