From ff6a4f0bd5625ae5f4b62c131541e791313e48f9 Mon Sep 17 00:00:00 2001 From: Jason Sollom Date: Wed, 9 Aug 2023 16:13:41 -0500 Subject: [PATCH] CASMCMS-8754: Make BOS V2 status operator resilient to power errors When CAPMC returns errors, handle them and disable the associated nodes instead of panicking and halting all forward progress. Added some new classes. Handle troublesome CAPMC errors that don't point fingers and identify which nodes experienced the errors. If CAPMC returns an error that cannot be associated with an individual node, then reissue the CAPMC command for each individual component. Under that condition, any component returning an error is guaranteed to be the cause of the error, so it is associated with the error and disabled so that BOS does not attempt to retry it. As a consequence of handling CAPMC-returned errors, the power operators need to be able to disable nodes. When they receive errors from CAPMC, they can declare those nodes disabled. Create a power_operator_base class because error handling is common to all three power operators. This is an abstract base class that each of the power operators inherits from. It collects all of the error handling into one place, so that it is not spread across the three power operators. --- CHANGELOG.md | 8 +- setup.py | 1 - src/bos/operators/base.py | 5 +- src/bos/operators/power_off_forceful.py | 22 +- src/bos/operators/power_off_graceful.py | 22 +- src/bos/operators/power_on.py | 34 +- src/bos/operators/power_operator_base.py | 143 ++++++++ src/bos/operators/status.py | 50 ++- src/bos/operators/utils/clients/capmc.py | 415 ++++++++++++++++++----- 9 files changed, 575 insertions(+), 125 deletions(-) delete mode 120000 setup.py create mode 100644 src/bos/operators/power_operator_base.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b1d353e..53a52142 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [2.0.27] - 10-03-2023 +### Changed +- Added error checking for errors returned by CAPMC. Where possible, nodes are disabled when they can be + associated with an error. This error handling prevents the BOS V2 status operator from entering a + live-lock when it is dealing with nodes that are MISSING or disabled in the Hardware State Manager. + ## [2.0.26] - 09-19-2023 ### Fixed - Fixed HSM query handling to prevent errors from querying with an empty list nodes. @@ -143,7 +149,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [2.0.10] - 2023-05-08 ### Added -- 'include_disabled' option to decide whether disabled nodes should be part of a BOS session +- 'include_disabled' option to decide whether disabled nodes should be part of a BOS session ## [2.0.9] - 2023-01-12 ### Fixed diff --git a/setup.py b/setup.py deleted file mode 120000 index d1870645..00000000 --- a/setup.py +++ /dev/null @@ -1 +0,0 @@ -src/server/setup.py \ No newline at end of file diff --git a/src/bos/operators/base.py b/src/bos/operators/base.py index 6b100e10..69abd96a 100644 --- a/src/bos/operators/base.py +++ b/src/bos/operators/base.py @@ -164,6 +164,9 @@ def _update_database(self, components: List[dict], additional_fields: dict=None) 'id': component['id'], 'error': component['error'] # New error, or clearing out old error } + # Allow operators to disable components. 
+ if not component['enabled']: + patch['enabled'] = component['enabled'] if self.name: last_action_data = { 'action': self.name, @@ -178,7 +181,7 @@ def _update_database(self, components: List[dict], additional_fields: dict=None) if additional_fields: patch.update(additional_fields) - + # When updating a component's desired state, operators # are expected to provide session data as a hacky way to prove # that they are operators. If they do not provide it, then the diff --git a/src/bos/operators/power_off_forceful.py b/src/bos/operators/power_off_forceful.py index 263f3ca9..587e8211 100644 --- a/src/bos/operators/power_off_forceful.py +++ b/src/bos/operators/power_off_forceful.py @@ -25,15 +25,16 @@ import logging from bos.common.values import Action, Status -import bos.operators.utils.clients.capmc as capmc +from bos.operators.utils.clients.capmc import disable_based_on_error_xname_on_off, power from bos.operators.utils.clients.bos.options import options -from bos.operators.base import BaseOperator, main +from bos.operators.base import main +from bos.operators.power_operator_base import PowerOperatorBase from bos.operators.filters import BOSQuery, HSMState, TimeSinceLastAction LOGGER = logging.getLogger('bos.operators.power_off_forceful') -class ForcefulPowerOffOperator(BaseOperator): +class ForcefulPowerOffOperator(PowerOperatorBase): """ The Forceful Power-Off Operator tells capmc to power-off nodes if: - Enabled in the BOS database and the status is power_off_gracefully of power_off_forcefully @@ -56,10 +57,17 @@ def filters(self): HSMState(enabled=True), ] - def _act(self, components): - component_ids = [component['id'] for component in components] - capmc.power(component_ids, state='off', force=True) - return components + def _my_power(self, component_ids): + """ + Power off components forcefully. 
+ + Returns: + errors (dict): A class containing an error code, error message, and + a dictionary containing the nodes (keys) suffering from errors (values) + :rtype: CapmcXnameOnOffReturnedError + """ + + return power(component_ids, state='off', force=True) if __name__ == '__main__': diff --git a/src/bos/operators/power_off_graceful.py b/src/bos/operators/power_off_graceful.py index a476a037..7e827a57 100644 --- a/src/bos/operators/power_off_graceful.py +++ b/src/bos/operators/power_off_graceful.py @@ -25,14 +25,15 @@ import logging from bos.common.values import Action, Status -import bos.operators.utils.clients.capmc as capmc -from bos.operators.base import BaseOperator, main +from bos.operators.utils.clients.capmc import disable_based_on_error_xname_on_off, power +from bos.operators.base import main +from bos.operators.power_operator_base import PowerOperatorBase from bos.operators.filters import BOSQuery, HSMState LOGGER = logging.getLogger('bos.operators.power_off_graceful') -class GracefulPowerOffOperator(BaseOperator): +class GracefulPowerOffOperator(PowerOperatorBase): """ - Enabled in the BOS database and the status is power_off_pending - Enabled in HSM @@ -52,10 +53,17 @@ def filters(self): HSMState(enabled=True), ] - def _act(self, components): - component_ids = [component['id'] for component in components] - capmc.power(component_ids, state='off', force=False) - return components + def _my_power(self, component_ids): + """ + Power off components gracefully, not forcefully. 
+ + Returns: + errors (dict): A class containing an error code, error message, and + a dictionary containing the nodes (keys) suffering from errors (values) + :rtype: CapmcXnameOnOffReturnedError + """ + + return power(component_ids, state='off', force=False) if __name__ == '__main__': diff --git a/src/bos/operators/power_on.py b/src/bos/operators/power_on.py index b8a7e389..5c542099 100644 --- a/src/bos/operators/power_on.py +++ b/src/bos/operators/power_on.py @@ -28,16 +28,17 @@ from bos.common.values import Action, Status import bos.operators.utils.clients.bss as bss -import bos.operators.utils.clients.capmc as capmc +from bos.operators.utils.clients.capmc import disable_based_on_error_xname_on_off, power from bos.operators.utils.clients.cfs import set_cfs -from bos.operators.base import BaseOperator, main +from bos.operators.base import main +from bos.operators.power_operator_base import PowerOperatorBase from bos.operators.filters import BOSQuery, HSMState from bos.server.dbs.boot_artifacts import record_boot_artifacts LOGGER = logging.getLogger('bos.operators.power_on') -class PowerOnOperator(BaseOperator): +class PowerOnOperator(PowerOperatorBase): """ The Power-On Operator tells capmc to power-on nodes if: - Enabled in the BOS database and the status is power_on_pending @@ -59,6 +60,10 @@ def filters(self): ] def _act(self, components): + """ + Set up BSS and CFS prior to powering on the components. + Power on the components. 
+ """ self._preset_last_action(components) try: self._set_bss(components) @@ -68,12 +73,7 @@ def _act(self, components): set_cfs(components, enabled=False, clear_state=True) except Exception as e: raise Exception("An error was encountered while setting CFS information: {}".format(e)) from e - component_ids = [component['id'] for component in components] - try: - capmc.power(component_ids, state='on') - except Exception as e: - raise Exception("An error was encountered while calling CAPMC to power on: {}".format(e)) from e - return components + return self._power_components(components) def _set_bss(self, components, retries=5): """ @@ -81,8 +81,8 @@ def _set_bss(self, components, retries=5): Receive a BSS_REFERRAL_TOKEN from BSS. Map the token to the boot artifacts. Update each node's desired state with the token. - - Because the connection to the BSS tokens database can be lost due to + + Because the connection to the BSS tokens database can be lost due to infrequent use, retry up to retries number of times. """ parameters = defaultdict(set) @@ -129,6 +129,18 @@ def _set_bss(self, components, retries=5): "session": sessions[node]}) self.bos_client.components.update_components(bss_tokens) + def _my_power(self, component_ids): + """ + Power on components. 
+ + Returns: + errors (dict): A class containing an error code, error message, and + a dictionary containing the nodes (keys) suffering from errors (values) + :rtype: CapmcXnameOnOffReturnedError + """ + + return power(component_ids, state='on') + if __name__ == '__main__': main(PowerOnOperator) diff --git a/src/bos/operators/power_operator_base.py b/src/bos/operators/power_operator_base.py new file mode 100644 index 00000000..40af66b3 --- /dev/null +++ b/src/bos/operators/power_operator_base.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python +# +# MIT License +# +# (C) Copyright 2021-2022 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# +import logging +from typing import List, Type + +from bos.operators.utils.clients.capmc import disable_based_on_error_xname_on_off, power +from bos.operators.base import BaseOperator, main +from bos.operators.filters.base import BaseFilter + +LOGGER = logging.getLogger('bos.operators.power_base') + + +class PowerOperatorBase(BaseOperator): + """ + An abstract class for all BOS power operators. + + Override these methods and properties. + NAME - This field determines how the operator/action is logged in the components database. + FILTERS - A list of all filters that are used to determine which components should be acted on. + Includes the initial query for BOS components as the query also includes filtering. + _my_power - This method indicates the power action to be taken by the operator. + Any other method may also be overridden, but functionality such as error handling may be lost. + + """ + + retry_attempt_field = "power_base_operator" + + @property + def name(self) -> str: + return 'Invalid Action Type' + + # Filters + @property + def filters(self) -> List[Type[BaseFilter]]: + return [] + + def _act(self, components) -> List[dict]: + """ + Calls the _power_components method. + Override this method to perform additional actions specific to the + power operator. + Be sure to finish with the return statement below. + """ + return self._power_components(components) + + + def _power_components(self, components: List[dict]) -> List[dict]: + """ + Apply the _my_power operation to a list of components. + Handle any errors. This includes setting the error per component and + disabling the component. + + If we know which nodes experienced errors, then set their errors and + disable them. If we do not know which nodes experienced errors, then + attempt to power them on or off individually. Any errors encountered + will be specific to the individual node. 
+ + Inputs: + :param List[dict] components: A list of the components to operate on + + :return components: the + :rtype: A list of dictionaries where the individual elements are Components + """ + component_ids = [component['id'] for component in components] + errors = self._my_power(component_ids) + if errors.error_code != 0: + if errors.nodes_in_error: + # Update any nodes with errors they encountered + for component in errors.nodes_in_error: + index = self._find_component_in_components(component, components) + if index is not None: + error = errors.nodes_in_error[component].error_message + components[index]['error'] = error + components[index]['enabled'] = disable_based_on_error_xname_on_off(error) + break + else: + # Errors could not be associated with a specific node. + # Ask CAPMC to act on them one at a time to identify + # nodes associated with errors. + for component in component_ids: + LOGGER.debug(f"Acting on component {component}") + errors = self._my_power([component]) + if errors.error_code != 0: + index = self._find_component_in_components(component, components) + if index is not None: + components[index]['error'] = errors.error_message + components[index]['enabled'] = False + + return components + + def _find_component_in_components(self, component_id, components) -> int: + """ + In a list of components, find the component that matches + the component ID. Return its index in the list. + + :param str component_id: The component ID + :param List[dict] components: A list of components + + Returns: + An index indicating the matched components location in the list + It returns None if there is no match. + :rtype: int + """ + for component in components: + if component_id == component['id']: + return components.index(component) + return None + + def _my_power(self, component_ids: List[str]): + """ + Overide this function with the power call specific to the operator. 
+ + Returns: + errors (dict): A class containing o on error code, error message, and + a dictionary containing the nodes (keys) suffering from errors (values) + :rtype: CapmcXnameOnOffReturnedError + """ + return power(component_ids) + +if __name__ == '__main__': + main(PowerOperatorBase) diff --git a/src/bos/operators/status.py b/src/bos/operators/status.py index 0edf2ca3..c15a1e1f 100644 --- a/src/bos/operators/status.py +++ b/src/bos/operators/status.py @@ -30,7 +30,9 @@ DesiredConfigurationIsNone, DesiredConfigurationSetInCFS, LastActionIs, TimeSinceLastAction from bos.operators.utils.clients.bos.options import options from bos.operators.utils.clients.capmc import status as get_power_states +from bos.operators.utils.clients.capmc import disable_based_on_error_xname_status from bos.operators.utils.clients.cfs import get_components as get_cfs_components +from bos.server.models.v2_component import V2Component # noqa: E501 LOGGER = logging.getLogger('bos.operators.status') @@ -70,7 +72,7 @@ def _run(self) -> None: """ A single pass of detecting and acting on components """ components = self.bos_client.components.get_components(enabled=True) component_ids = [component['id'] for component in components] - power_states, _failed_nodes = get_power_states(component_ids) + power_states, xname_status_failures = get_power_states(component_ids) cfs_states = self._get_cfs_components() updated_components = [] if components: @@ -78,8 +80,13 @@ def _run(self) -> None: self.boot_wait_time_elapsed = TimeSinceLastAction(seconds=options.max_boot_wait_time)._match self.power_on_wait_time_elapsed = TimeSinceLastAction(seconds=options.max_power_on_wait_time)._match for component in components: + error_string = None + node_error = xname_status_failures.nodes_in_error.get(component['id'],{}) + if node_error: + error_string = node_error.error_message updated_component = self._check_status( - component, power_states.get(component['id']), cfs_states.get(component['id'])) + component, 
power_states.get(component['id']), cfs_states.get(component['id']), + error_string) if updated_component: updated_components.append(updated_component) if not updated_components: @@ -90,12 +97,18 @@ def _run(self) -> None: self.bos_client.components.update_components(updated_components) @staticmethod - def _get_cfs_components(): + def _get_cfs_components() -> dict: """ Gets all the components from CFS. We used to get only the components of interest, but that caused an HTTP request that was longer than uwsgi could handle when the number of nodes was very large. Requesting all components means none need to be specified in the request. + + :return: A dictionary containing the CFS components + :rtype: dict + + The function returns a dictionary of CFS components where each key is a component ID and + each value is that component's data. """ cfs_data = get_cfs_components() cfs_states = {} @@ -103,22 +116,36 @@ def _get_cfs_components(): cfs_states[component['id']] = component return cfs_states - def _check_status(self, component, power_state, cfs_component): + def _check_status(self, component: V2Component, power_state: str, cfs_component: str, error: str) -> dict: """ Calculate the component's current status based upon its power state and CFS configuration state. If its status differs from the status in the database, return this information. + + :param V2Component component: A BOS component + :param str power_state: The component's power state + :param str cfs_component: The component's CFS state as seen in the Configuration Framework Service (CFS) + :param str error: The error the node is experiencing; this equals None if there is no error. 
+ + :return updated_component: The component's updated data + :rtype: dict """ + phase = Phase.none + override = Status.on_hold + disable = False + action_failed = False + if power_state and cfs_component: phase, override, disable, error, action_failed = self._calculate_status(component, power_state, cfs_component) else: - # If the component cannot be found in capmc or cfs - phase = Phase.none - override = Status.on_hold - action_failed = False + # If the component cannot be found in CAPMC or CFS if not power_state: - error = 'Component information was not returned by capmc' + if not error: + error = 'Component information was not returned by CAPMC' + disable = disable_based_on_error_xname_status(error) + if disable: + override = None elif not cfs_component: - error = 'Component information was not returned by cfs' + error = 'Component information was not returned by CFS' updated_component = { 'id': component['id'], @@ -163,7 +190,7 @@ def _calculate_status(self, component, power_state, cfs_component): """ Calculate a component's status based on its current state, power state, and CFS state. 
- + Disabling for successful completion should return an empty phase Disabling for a failure should return the phase that failed Override is used for status information that cannot be determined using only @@ -222,6 +249,5 @@ def _calculate_status(self, component, power_state, cfs_component): return phase, override, disable, error, action_failed - if __name__ == '__main__': main(StatusOperator) diff --git a/src/bos/operators/utils/clients/capmc.py b/src/bos/operators/utils/clients/capmc.py index 8fcb72bb..c02f7c83 100644 --- a/src/bos/operators/utils/clients/capmc.py +++ b/src/bos/operators/utils/clients/capmc.py @@ -24,7 +24,10 @@ import logging import requests import json +import re +from abc import ABC, abstractmethod from collections import defaultdict +from typing import List from bos.operators.utils import requests_retry_session, PROTOCOL @@ -34,6 +37,252 @@ LOGGER = logging.getLogger('bos.operators.utils.clients.capmc') +# If a CAPMC response contains an error that is in of the these lists, +# then it will also contain a list of nodes that should be disabled. +# XNAME_COMMON_ERROR_STRINGS, XNAME_STATUS_ERROR_STRINGS, and +# XNAME_ON_OFF_ERROR_STRINGS +XNAME_COMMON_ERROR_STRINGS = ['invalid/duplicate xnames', + 'disabled or not found', + 'xnames role blocked', + 'xnames role blocked/not found'] + + +XNAME_STATUS_ERROR_STRINGS = ['xnames not found'] +XNAME_STATUS_ERROR_STRINGS.extend(XNAME_COMMON_ERROR_STRINGS) + +XNAME_ON_OFF_ERROR_STRINGS = ["invalid xnames", + "Invalid Component IDs", + "components disabled"] +XNAME_ON_OFF_ERROR_STRINGS.extend(XNAME_COMMON_ERROR_STRINGS) +class CapmcReturnedError(ABC): + """ + A base function for parsing the errors returned by CAPMC. + The nodes_in_error is a dictionary with nodes (keys) and errors (values). 
+ + Must override: + * calculate_nodes_in_error + * process_error_string + """ + def __init__(self, response): + self.response = response + self.error_code = 0 + self.error_message = '' + self.nodes_in_error = {} + if 'e' in response: + self.error_code = response['e'] + if 'err_msg' in response: + self.error_message = response['err_msg'] + self.calculate_nodes_in_error() + + @abstractmethod + def calculate_nodes_in_error(self): + pass + + @abstractmethod + def process_error_string(self): + """ + Process the received error string against the appropriate + error string dictionary. + + This function should be overridden replacing XNAME_COMMON_ERROR_STRINGS + with the appropriate error string dictionary for the type of call + being issued to CAPMC. + """ + self._process_error_string(XNAME_COMMON_ERROR_STRINGS) + + def _process_error_string(self, error_string_dict): + """ + This function will populate the nodes_in_error attribute if there are + any nodes found to be in error. Note, some errors cannot be associated + with an individual node. + Inputs: + :param dict error_strings: A dictionary of error strings to compare against + """ + for err_str in error_string_dict: + match = re.match(fr"{err_str}: +\[([\w ?]+)\]", self.error_message) + if match: + nodes_in_error = match.group(1).split() + for node in nodes_in_error: + self.nodes_in_error[node] = CapmcNodeError(self.error_code, err_str) + break + + +class CapmcXnameStatusReturnedError(CapmcReturnedError): + """ + This class processes errors returned when calling xname_get_status. + + ---------------------------------------------------------------------------------------- + Here is CAPMC's error response format to requests to the get_xname_status endpoint when + the CAPMC front-end encounters an error. + They are reproduced here because, otherwise, it is only available in a Jira. 
+ + {"e": 400, + "err_msg": "invalid/duplicate xnames: [x1000c0s0b0n0,x1000c0s0b0n1]"} + + e: 400 + errMsg: "no request" + errMsg: some sort of decoding error + * Retry with valid payload + + e: 400 + errMsg: "invalid filter string: abcd" + * Retry with valid filter string + + e: 400 + errMsg: "unknown status source 'abcd'" + * Retry with valid source or restriction removed + + e: 400 + errMsg: "invalid/duplicate xnames: [x1000c0s0b0n0]" + errMsg: "xnames not found: [x1000c8s8b8n0]" + errMsg: "disabled or not found: [x1000c0s0b0n0]" + errMsg: "xnames role blocked: [x1000c0s0b0n0]" + errMsg: "xnames role blocked/not found: [x1000c0s0b0n0]" + * Retry with invalid and/or duplicate names removed + + e: 400 + errMsg: "No matching components found" + * Retry with valid xname list or different filter options + + e: 405 + errMsg: "(PATCH) Not Allowed" + * Retry with GET + + e: 500 + errMsg: "Error: " + request/unmarshal error string + errMst: "Connection to the secure store isn't ready. Can not get redfish credentials." + * FATAL. CAPMC is unable to talk to a required service (HSM, VAULT) + ---------------------------------------------------------------------------------------- + + This class only populates nodes_in_error for the 400 errors that actually provide + failed nodes. Other errors do not provide a list of failed nodes, so those errors are + merely captured at the top level. + """ + def calculate_nodes_in_error(self): + if self.error_code == 400: + self.process_error_string() + + def process_error_string(self): + """ + Process the received error string against the XNAME_STATUS_ERROR_STRINGS + dictionary. + """ + self._process_error_string(XNAME_STATUS_ERROR_STRINGS) + +class CapmcXnameOnOffReturnedError(CapmcReturnedError): + """ + This class processes errors returned when calling xname_on and xname_off. + + This function is used in booting as well as shutdown, so it has been + abstracted to one place in order to avoid duplication. 
+ + ---------------------------------------------------------------------------------------- + Here is an example of what a partially successful shutdown looks like, since it isn't captured + in the documentation particularly well. This is from the CAPMC backend. + {"e":-1,"err_msg":"Errors encountered with 1/1 Xnames issued On","xnames":[{"xname":"x3000c0s19b3n0","e":-1,"err_msg":"NodeBMC Communication Error"}]} + + e: -1 + errMsg: "Errors encountered with %d components for %s" + FATAL. Could not find supported power operations + + e: -1 + errMsg: "Errors encountered with %d/%d Xnames issued %s"\ + Partial success. Most likely FATAL for failed components + + e: -1 + errMsg: "no power controls for %s operation" + FATAL. Can't determine power operation + + e: -1 + errMsg: "Skipping %s: Type, '%s', not defined in power sequence for '%s'" + errMsg: "no supported ResetType for %s operation" + FATAL. Power operation not supported. + + e: 37 + errMsg: "Error: Failed to reserve components while performing a %s." + Retry. 
The condition may resolve itself + + e: 400 + errMsg: "Bad Request: " + decode error + errMsg: "Bad Request: Missing required xnames parameter" + errMsg: "Bad Request: Required xnames list is empty" + Retry with valid request + + e: 400 + errMsg: "Cannot force the On operation" + Retry without 'force=true' in payload + + e: 400 + errMsg: "Bad Request: recursive and prereq options are mutually exclusive" + Retry with only one of the options + + e: 400 + errMsg: "invalid xnames: [x1001c0s0b0n0]" + errMsg: "invalid/duplicate xnames: [x1001c0s0b0n0]" + errMsg: "Invalid Component IDs: [x1001c0s0b0n0]" + errMsg: "disabled or not found: [x1001c0s0b0n0]" + errMsg: "xnames role blocked: [x1001c0s0b0n0]" + errMsg: "xnames role blocked/not found: [x1001c0s0b0n0]" + errMsg: "nodes disabled: [1001]" + errMsg: "components disabled: [x1001c0s0b0n0]" + Retry with invalid and/or duplicate names/IDs removed + + e: 400 + errMsg: "No nodes found to operate on" + Retry with valid xname list or different filter options + + e: 405 + errMsg: "(PATCH) Not Allowed" + Retry with POST + + e: 500 + errMsg: "Error: " + request/unmarshal error string + errMst: "Connection to the secure store isn't ready. Can not get redfish credentials." + FATAL. CAPMC is unable to talk to a required service (HSM, VAULT) + ---------------------------------------------------------------------------------------- + + This class only populates nodes_in_error for errors that actually provide + failed nodes. Other errors do not provide a list of failed nodes, so those errors are + merely captured at the top level. 
+ """ + def calculate_nodes_in_error(self): + if self.error_code == -1: + if 'undefined' in self.response: + for node in self.response['undefined']: + self.nodes_in_error[node] = CapmcNodeError(self.error_code,'undefined') + if 'xnames' in self.response: + for xname_dict in self.response['xnames']: + xname = xname_dict['xname'] + err_msg = xname_dict['err_msg'] + self.nodes_in_error[xname] = CapmcNodeError(self.error_code, err_msg) + elif self.error_code == 400: + self.process_error_string() + + def process_error_string(self): + """ + Process the received error string against the XNAME_ON_OFF_ERROR_STRINGS + dictionary. + """ + self._process_error_string(XNAME_ON_OFF_ERROR_STRINGS) + + +class CapmcNodeError(object): + def __init__(self, error_code, error_message): + self.error_code = error_code + self.error_message = error_message + + def __repr__(self) -> str: + """ + Print how this class was initialized for debugging purposes. + """ + return f"CapmcNodeError({self.error_code}, {self.error_message})" + + def __str__(self) -> str: + """ + Print a human-readable version of this class. 
+ """ + return f"Error code: {self.error_code}\tError Message: {self.error_message}" + class CapmcException(Exception): """ @@ -58,13 +307,13 @@ def status(nodes, filtertype = 'show_all', session = None): filtertype (str): Type of filter to use when sorting Returns: - status_dict (dict): Keys are different states; values are a literal set of nodes - failed_nodes (set): A set of the nodes that had errors - errors (dict): A dictionary containing the nodes (values) - suffering from errors (keys) + node_status (dict): Keys are nodes; values are different power states or errors + :rtype: dict + xname_status_failures: A CapmcXnameStatusReturnedError class containing the error code, + error string, and a dictionary containing nodes (keys) suffering from errors (valuse) + :rtype: CapmcXnameStatusReturnedError Raises: - HTTPError JSONDecodeError -- error decoding the CAPMC response """ endpoint = '%s/get_xname_status' % (ENDPOINT) @@ -74,26 +323,26 @@ def status(nodes, filtertype = 'show_all', session = None): 'xnames': list(nodes)} response = session.post(endpoint, json = body) - try: - response.raise_for_status() - except requests.exceptions.HTTPError as err: - LOGGER.error("Failed interacting with Cray Advanced Platform Monitoring and Control " - "(CAPMC): %s", err) - LOGGER.error(response.text) - raise try: json_response = json.loads(response.text) except json.JSONDecodeError as jde: errmsg = "CAPMC returned a non-JSON response: %s %s" % (response.text, jde) LOGGER.error(errmsg) raise - # Check for error state in the returned response and retry - if json_response['e']: - LOGGER.error("CAPMC responded with an error response code '%s': %s", - json_response['e'], json_response) - - failed_nodes, errors = parse_response(json_response) + xname_status_failures = CapmcXnameStatusReturnedError(json_response) + LOGGER.debug("XNAME_STATUS_FAILURES: nodes_in_error: " + f"{xname_status_failures.nodes_in_error}") + try: + response.raise_for_status() + except 
requests.exceptions.HTTPError: + LOGGER.error("Failed interacting with Cray Advanced Platform " + "Monitoring and Control (CAPMC). " + f"Error code: {xname_status_failures.error_code} " + f"Error message: {xname_status_failures.error_message} " + f"Entire response: {xname_status_failures.response}") + + # Remove the error elements leaving only the node's power status. for key in ('e', 'err_msg'): try: del json_response[key] @@ -101,69 +350,17 @@ def status(nodes, filtertype = 'show_all', session = None): pass # Reorder JSON response into a dictionary where the nodes are the keys. - node_status = {} + node_power_status = {} for power_state, nodes in json_response.items(): for node in nodes: - node_status[node] = power_state - - # Add in the nodes with errors. - node_status.update(errors) - - return node_status, failed_nodes - - -def parse_response(response): - """ - Takes a CAPMC power action JSON response and process it for partial - communication errors. This function is used in booting as well as - shutdown, so it has been abstracted to one place in order to avoid - duplication. - - This function has the side effect of categorizing and logging errors - by error condition encountered. - - # Here is an example of what a partially successful shutdown looks like, since it isn't captured - # in the documentation particularly well. - # {"e":-1,"err_msg":"Errors encountered with 1/1 Xnames issued On","xnames":[{"xname":"x3000c0s19b3n0","e":-1,"err_msg":"NodeBMC Communication Error"}]} - - This function returns a set of nodes (in our case, almost always, xnames) - that did not receive the requested call for action. Upstream calling - functions may decide what to do with that information. 
- - Returns - failed_nodes (set): A set of the nodes that failed - reasons_for_failure (dict): A dictionary containing the nodes (values) - suffering from errors (keys) - """ - failed_nodes = set() - reasons_for_failure = defaultdict(list) - if 'e' not in response or response['e'] == 0: - # All nodes received the requested action; happy path - return failed_nodes, reasons_for_failure - LOGGER.warning("CAPMC responded with e code '%s'", response['e']) - if 'err_msg' in response: - LOGGER.warning("err_msg: %s", response['err_msg']) - if 'undefined' in response: - failed_nodes |= set(response['undefined']) - if 'xnames' in response: - for xname_dict in response['xnames']: - xname = xname_dict['xname'] - err_msg = xname_dict['err_msg'] - reasons_for_failure[err_msg].append(xname) - # Report back all reasons for failure - for err_msg, nodes in sorted(reasons_for_failure.items()): - node_count = len(nodes) - if node_count <= 5: - LOGGER.warning("\t%s: %s", err_msg, ', '.join(sorted(nodes))) - else: - LOGGER.warning("\t%s: %s nodes", err_msg, node_count) - # Collect all failed nodes. - for nodes in reasons_for_failure.values(): - failed_nodes |= set(nodes) - return failed_nodes, reasons_for_failure - - -def power(nodes, state, force = True, session = None, cont = True, reason = "BOS: Powering nodes"): + node_power_status[node] = power_state + + return node_power_status, xname_status_failures + + +def power(nodes: List, state: str, force: bool = True, session = None, + cont: bool = True, + reason: str = "BOS: Powering nodes") -> CapmcXnameOnOffReturnedError: """ Sets a node to a power state using CAPMC; returns a set of nodes that were unable to achieve that state. @@ -172,7 +369,7 @@ def power(nodes, state, force = True, session = None, cont = True, reason = "BOS to power the node to the desired state. 
Args: - nodes (list): Nodes to power on + nodes (list): Nodes to act upon state (string): Power state: off or on force (bool): Should the power off be forceful (True) or not forceful (False) session (Requests.session object): A Requests session instance @@ -180,9 +377,9 @@ def power(nodes, state, force = True, session = None, cont = True, reason = "BOS or more of the requested components fails their action. Returns: - failed (set): the nodes that failed to enter the desired power state - boot_errors (dict): A dictionary containing the nodes (values) - suffering from errors (keys) + errors (dict): A class containing an error code, error message, and + a dictionary containing the nodes (keys) suffering from errors (values) + :rtype: CapmcXnameOnOffReturnedError Raises: ValueError: if state is neither 'off' nor 'on' @@ -205,8 +402,16 @@ def power(nodes, state, force = True, session = None, cont = True, reason = "BOS elif state == "off": json_response = call(power_endpoint, nodes, output_format, cont, reason, force = force) - failed_nodes, errors = parse_response(json_response) - return failed_nodes, errors + errors = CapmcXnameOnOffReturnedError(json_response) + if errors.error_code != 0: + LOGGER.error("Failed interacting with Cray Advanced Platform " + "Monitoring and Control (CAPMC). " + f"Error code: {errors.error_code} " + f"Error message: {errors.error_message}") + LOGGER.debug("Failed interacting with Cray Advanced Platform " + "Monitoring and Control (CAPMC). " + f"Full response: {errors.response}") + return errors def node_type(nodes): @@ -250,3 +455,43 @@ def call(endpoint, nodes, node_format = 'xnames', cont = True, reason = "None gi return json.loads(resp.text) except json.JSONDecodeError as jde: raise CapmcException("Non-json response from CAPMC: %s" % (resp.text)) from jde + +def disable_based_on_error_xname_on_off(error): + """ + CAPMC returns errors to requests to xname_on and xname_off. + Some errors are transient, some not.
+ Some non-transient errors have nodes associated with them. Others do not + have nodes associated with them. + Non-transient errors with associated nodes should cause those nodes + to be disabled. + Inputs: + :param str error: The error string returned by CAPMC. + + Returns: + True: When the error is non-transient. + False: When the error is transient. + :rtype: boolean + """ + if error in XNAME_ON_OFF_ERROR_STRINGS: + return True + return False + +def disable_based_on_error_xname_status(error): + """ + CAPMC returns errors to requests to get_xname_status. + Some errors are transient, some not. + Some non-transient errors have nodes associated with them. Others do not + have nodes associated with them. + Non-transient errors with associated nodes should cause those nodes + to be disabled. + Inputs: + :param str error: The error string returned by CAPMC. + + Returns: + True: When the error is non-transient. + False: When the error is transient. + :rtype: boolean + """ + if error in XNAME_STATUS_ERROR_STRINGS: + return True + return False \ No newline at end of file