Skip to content

Commit

Permalink
CASMCMS-8754: Make BOS V2 status operator resilient to power errors
Browse files Browse the repository at this point in the history
When CAPMC returns errors, handle them and disable the associated nodes
instead of panicking and halting all forward progress.

Added some new classes.

Handle troublesome CAPMC errors that do not identify
which nodes experienced the errors.

If CAPMC returns an error that cannot be associated with an individual
node, then reissue the CAPMC command for each individual component.
Under that condition, any component returning an error is guaranteed
to be the cause of the error, so it is associated with the error
and disabled so that BOS does not attempt to retry it.

As a consequence of handling CAPMC-returned errors, the power
operators need to be able to disable nodes. When they receive
errors from CAPMC, they can declare those nodes disabled.

Create a power_operator_base class because error handling is common
to all three power operators. This is an abstract base class
that each of the power operators inherits from. It collects all of
the error handling into one place, so that it is not spread across
the three power operators.
  • Loading branch information
jsollom-hpe committed Oct 3, 2023
1 parent 63b9729 commit ff6a4f0
Show file tree
Hide file tree
Showing 9 changed files with 575 additions and 125 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [2.0.27] - 10-03-2023
### Changed
- Added error checking for errors returned by CAPMC. Where possible, nodes are disabled when they can be
associated with an error. This error handling prevents the BOS V2 status operator from entering a
live-lock when it is dealing with nodes that are MISSING or disabled in the Hardware State Manager.

## [2.0.26] - 09-19-2023
### Fixed
- Fixed HSM query handling to prevent errors from querying with an empty list of nodes.
Expand Down Expand Up @@ -143,7 +149,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [2.0.10] - 2023-05-08
### Added
- 'include_disabled' option to decide whether disabled nodes should be part of a BOS session
- 'include_disabled' option to decide whether disabled nodes should be part of a BOS session

## [2.0.9] - 2023-01-12
### Fixed
Expand Down
1 change: 0 additions & 1 deletion setup.py

This file was deleted.

5 changes: 4 additions & 1 deletion src/bos/operators/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ def _update_database(self, components: List[dict], additional_fields: dict=None)
'id': component['id'],
'error': component['error'] # New error, or clearing out old error
}
# Allow operators to disable components.
if not component['enabled']:
patch['enabled'] = component['enabled']
if self.name:
last_action_data = {
'action': self.name,
Expand All @@ -178,7 +181,7 @@ def _update_database(self, components: List[dict], additional_fields: dict=None)

if additional_fields:
patch.update(additional_fields)

# When updating a component's desired state, operators
# are expected to provide session data as a hacky way to prove
# that they are operators. If they do not provide it, then the
Expand Down
22 changes: 15 additions & 7 deletions src/bos/operators/power_off_forceful.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,16 @@
import logging

from bos.common.values import Action, Status
import bos.operators.utils.clients.capmc as capmc
from bos.operators.utils.clients.capmc import disable_based_on_error_xname_on_off, power
from bos.operators.utils.clients.bos.options import options
from bos.operators.base import BaseOperator, main
from bos.operators.base import main
from bos.operators.power_operator_base import PowerOperatorBase
from bos.operators.filters import BOSQuery, HSMState, TimeSinceLastAction

LOGGER = logging.getLogger('bos.operators.power_off_forceful')


class ForcefulPowerOffOperator(BaseOperator):
class ForcefulPowerOffOperator(PowerOperatorBase):
"""
The Forceful Power-Off Operator tells capmc to power-off nodes if:
- Enabled in the BOS database and the status is power_off_gracefully of power_off_forcefully
Expand All @@ -56,10 +57,17 @@ def filters(self):
HSMState(enabled=True),
]

def _act(self, components):
component_ids = [component['id'] for component in components]
capmc.power(component_ids, state='off', force=True)
return components
def _my_power(self, component_ids):
"""
Power off components forcefully.
Returns:
errors (dict): A class containing an error code, error message, and
a dictionary containing the nodes (keys) suffering from errors (values)
:rtype: CapmcXnameOnOffReturnedError
"""

return power(component_ids, state='off', force=True)


if __name__ == '__main__':
Expand Down
22 changes: 15 additions & 7 deletions src/bos/operators/power_off_graceful.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@
import logging

from bos.common.values import Action, Status
import bos.operators.utils.clients.capmc as capmc
from bos.operators.base import BaseOperator, main
from bos.operators.utils.clients.capmc import disable_based_on_error_xname_on_off, power
from bos.operators.base import main
from bos.operators.power_operator_base import PowerOperatorBase
from bos.operators.filters import BOSQuery, HSMState

LOGGER = logging.getLogger('bos.operators.power_off_graceful')


class GracefulPowerOffOperator(BaseOperator):
class GracefulPowerOffOperator(PowerOperatorBase):
"""
- Enabled in the BOS database and the status is power_off_pending
- Enabled in HSM
Expand All @@ -52,10 +53,17 @@ def filters(self):
HSMState(enabled=True),
]

def _act(self, components):
component_ids = [component['id'] for component in components]
capmc.power(component_ids, state='off', force=False)
return components
def _my_power(self, component_ids):
"""
Power off components gracefully, not forcefully.
Returns:
errors (dict): A class containing an error code, error message, and
a dictionary containing the nodes (keys) suffering from errors (values)
:rtype: CapmcXnameOnOffReturnedError
"""

return power(component_ids, state='off', force=False)


if __name__ == '__main__':
Expand Down
34 changes: 23 additions & 11 deletions src/bos/operators/power_on.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,17 @@

from bos.common.values import Action, Status
import bos.operators.utils.clients.bss as bss
import bos.operators.utils.clients.capmc as capmc
from bos.operators.utils.clients.capmc import disable_based_on_error_xname_on_off, power
from bos.operators.utils.clients.cfs import set_cfs
from bos.operators.base import BaseOperator, main
from bos.operators.base import main
from bos.operators.power_operator_base import PowerOperatorBase
from bos.operators.filters import BOSQuery, HSMState
from bos.server.dbs.boot_artifacts import record_boot_artifacts

LOGGER = logging.getLogger('bos.operators.power_on')


class PowerOnOperator(BaseOperator):
class PowerOnOperator(PowerOperatorBase):
"""
The Power-On Operator tells capmc to power-on nodes if:
- Enabled in the BOS database and the status is power_on_pending
Expand All @@ -59,6 +60,10 @@ def filters(self):
]

def _act(self, components):
"""
Set up BSS and CFS prior to powering on the components.
Power on the components.
"""
self._preset_last_action(components)
try:
self._set_bss(components)
Expand All @@ -68,21 +73,16 @@ def _act(self, components):
set_cfs(components, enabled=False, clear_state=True)
except Exception as e:
raise Exception("An error was encountered while setting CFS information: {}".format(e)) from e
component_ids = [component['id'] for component in components]
try:
capmc.power(component_ids, state='on')
except Exception as e:
raise Exception("An error was encountered while calling CAPMC to power on: {}".format(e)) from e
return components
return self._power_components(components)

def _set_bss(self, components, retries=5):
"""
Set the boot artifacts (kernel, kernel parameters, and initrd) in BSS.
Receive a BSS_REFERRAL_TOKEN from BSS.
Map the token to the boot artifacts.
Update each node's desired state with the token.
Because the connection to the BSS tokens database can be lost due to
Because the connection to the BSS tokens database can be lost due to
infrequent use, retry up to retries number of times.
"""
parameters = defaultdict(set)
Expand Down Expand Up @@ -129,6 +129,18 @@ def _set_bss(self, components, retries=5):
"session": sessions[node]})
self.bos_client.components.update_components(bss_tokens)

def _my_power(self, component_ids):
"""
Power on components.
Returns:
errors (dict): A class containing an error code, error message, and
a dictionary containing the nodes (keys) suffering from errors (values)
:rtype: CapmcXnameOnOffReturnedError
"""

return power(component_ids, state='on')


if __name__ == '__main__':
main(PowerOnOperator)
143 changes: 143 additions & 0 deletions src/bos/operators/power_operator_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/usr/bin/env python
#
# MIT License
#
# (C) Copyright 2021-2022 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
import logging
from typing import List, Optional, Type

from bos.operators.utils.clients.capmc import disable_based_on_error_xname_on_off, power
from bos.operators.base import BaseOperator, main
from bos.operators.filters.base import BaseFilter

LOGGER = logging.getLogger('bos.operators.power_base')


class PowerOperatorBase(BaseOperator):
    """
    An abstract class for all BOS power operators.

    Override these methods and properties.
    NAME - This field determines how the operator/action is logged in the components database.
    FILTERS - A list of all filters that are used to determine which components should be acted on.
        Includes the initial query for BOS components as the query also includes filtering.
    _my_power - This method indicates the power action to be taken by the operator.
    Any other method may also be overridden, but functionality such as error handling may be lost.
    """

    retry_attempt_field = "power_base_operator"

    @property
    def name(self) -> str:
        """Placeholder action name; concrete power operators override it."""
        return 'Invalid Action Type'

    # Filters
    @property
    def filters(self) -> List[Type[BaseFilter]]:
        """Placeholder (empty) filter list; concrete power operators override it."""
        return []

    def _act(self, components) -> List[dict]:
        """
        Calls the _power_components method.

        Override this method to perform additional actions specific to the
        power operator.
        Be sure to finish with the return statement below.
        """
        return self._power_components(components)

    def _power_components(self, components: List[dict]) -> List[dict]:
        """
        Apply the _my_power operation to a list of components and record
        any resulting errors on the affected components.

        If the result identifies which nodes experienced errors, every one of
        those nodes gets its error message recorded and its 'enabled' field
        updated. If the result reports an error without naming any node, the
        power operation is reissued for each component individually; any
        component whose individual call reports an error gets that error
        recorded and is disabled.

        :param List[dict] components: The components to operate on. Each one
            must have an 'id' key; 'error' and 'enabled' are written in place.
        :return: The same list that was passed in, with errored components updated
        :rtype: List[dict]
        """
        if not components:
            # Nothing to power; do not issue a request that names no targets.
            return components

        component_ids = [component['id'] for component in components]
        errors = self._my_power(component_ids)
        if errors.error_code == 0:
            return components

        if errors.nodes_in_error:
            # Update every node with the error it encountered. The loop must
            # visit all reported nodes: stopping after the first one would
            # leave the rest enabled and error-free, so they would be retried.
            for component_id in errors.nodes_in_error:
                index = self._find_component_in_components(component_id, components)
                if index is None:
                    continue
                error = errors.nodes_in_error[component_id].error_message
                components[index]['error'] = error
                # NOTE(review): the helper's name reads as "should this node be
                # disabled?" while its result is stored as 'enabled' -- confirm
                # the polarity of its return value against the capmc client.
                components[index]['enabled'] = disable_based_on_error_xname_on_off(error)
        else:
            # Errors could not be associated with a specific node.
            # Ask CAPMC to act on them one at a time to identify
            # nodes associated with errors.
            for component_id in component_ids:
                LOGGER.debug("Acting on component %s", component_id)
                single_result = self._my_power([component_id])
                if single_result.error_code == 0:
                    continue
                index = self._find_component_in_components(component_id, components)
                if index is not None:
                    components[index]['error'] = single_result.error_message
                    components[index]['enabled'] = False

        return components

    def _find_component_in_components(self, component_id, components) -> Optional[int]:
        """
        In a list of components, find the first component that matches
        the component ID and return its index in the list.

        :param str component_id: The component ID
        :param List[dict] components: A list of components
        :return: The index of the first component whose 'id' equals
            component_id, or None if there is no match
        :rtype: Optional[int]
        """
        for index, component in enumerate(components):
            if component['id'] == component_id:
                return index
        return None

    def _my_power(self, component_ids: List[str]):
        """
        Override this function with the power call specific to the operator.

        Returns:
          errors (dict): A class containing an error code, error message, and
          a dictionary containing the nodes (keys) suffering from errors (values)
          :rtype: CapmcXnameOnOffReturnedError
        """
        return power(component_ids)

if __name__ == '__main__':
main(PowerOperatorBase)
Loading

0 comments on commit ff6a4f0

Please sign in to comment.