Skip to content

Commit

Permalink
CASMCMS-9078: BOS tags SBPS images
Browse files Browse the repository at this point in the history
The Scalable Boot Provisioning Service (SBPS) provides root
filesystems to nodes when they boot. The images containing
these root filesystems need to be tagged in the Image
Management Service with 'sbps-project: true' before SBPS projects
them to the nodes. With this mod, BOS applies this key/value tag to any
rootfs it is booting a node with, to ensure that it is projected
during booting. This relieves the admin from needing to manually
tag the image. This is a quality of life mod.

(cherry picked from commit 944e637)
  • Loading branch information
jsollom-hpe committed Aug 20, 2024
1 parent 61b4c18 commit 9190ccc
Show file tree
Hide file tree
Showing 5 changed files with 275 additions and 23 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
## [2.26.0] - 2024-08-20
### Added
- BOS automatically tags IMS images with the 'sbps-project: true' tag when using SBPS as the rootfs provider.

## [2.25.0] - 2024-08-15
### Changed
Expand Down
85 changes: 85 additions & 0 deletions src/bos/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from typing import List

PROTOCOL = 'http'
TIME_DURATION_PATTERN = re.compile(r"^(\d+?)(\D+?)$", re.M|re.S)
Expand Down Expand Up @@ -116,3 +117,87 @@ def exc_type_msg(exc: Exception) -> str:
(e.g. TypeError: 'int' object is not subscriptable)
"""
return ''.join(traceback.format_exception_only(type(exc), exc))

def get_image_id(component: dict) -> str:
    """
    Return the IMS image ID of the image a component is set to boot.

    The ID is extracted from the kernel path stored under
    component['desired_state']['boot_artifacts']['kernel'].
    We expect that path to look something like this:
    s3://boot-images/fbcc5b02-b6a4-46a8-9402-2b7138adc327/kernel

    Input:
      * component: a dictionary describing a single component
    Return:
      The image ID, as extracted by get_image_id_from_kernel.
    Note:
      If the component has no kernel recorded, None is handed to
      get_image_id_from_kernel, which decides how to treat it.
    """
    # Missing 'desired_state' or 'boot_artifacts' entries fall back to empty
    # dictionaries, so the lookups below never raise KeyError.
    boot_artifacts = component.get('desired_state', {}).get('boot_artifacts', {})
    return get_image_id_from_kernel(boot_artifacts.get('kernel'))


def get_image_id_from_kernel(kernel_path: str) -> str:
    """
    Extract the IMS image ID from the path to a kernel.

    The path is expected to look something like this:
    s3://boot-images/fbcc5b02-b6a4-46a8-9402-2b7138adc327/kernel
    The image ID is the path element immediately before '/kernel'.

    Input:
      * kernel_path: the kernel path to parse
    Return:
      The image ID portion of the path.
    Raises:
      ValueError: if kernel_path is empty/None or does not contain an
                  image ID in the expected position.
    """
    # Guard against an empty or missing path before handing it to the regex;
    # re.match raises TypeError on None.
    match = re.match(r'.*//.*/(.*)/kernel', kernel_path) if kernel_path else None
    if match is None:
        # Without this check the failure would surface as an AttributeError on
        # None, which says nothing about the offending path.
        raise ValueError(f"Unable to extract an IMS image ID from kernel path: {kernel_path!r}")
    return match.group(1)

def using_sbps(component: dict) -> bool:
    """
    Report whether a component uses the Scalable Boot Provisioning Service
    (SBPS) to provide its root filesystem.

    The decision is made from the kernel parameters stored under
    component['desired_state']['boot_artifacts']['kernel_parameters'];
    they contain the string root=sbps-s3 when SBPS is in use.

    Input:
      * component: a dictionary describing a single component
    Return:
      The result of using_sbps_check_kernel_parameters for the component's
      kernel parameters (True if SBPS is in use, False if it is not).
    Note:
      If the component has no kernel parameters recorded, None is handed to
      using_sbps_check_kernel_parameters.
    """
    # Missing 'desired_state' or 'boot_artifacts' entries fall back to empty
    # dictionaries, so the lookups below never raise KeyError.
    boot_artifacts = component.get('desired_state', {}).get('boot_artifacts', {})
    return using_sbps_check_kernel_parameters(boot_artifacts.get('kernel_parameters'))

def using_sbps_check_kernel_parameters(kernel_parameters: str) -> bool:
    """
    Check the kernel boot parameters to see if the image is using the
    rootfs provider 'sbps', the Scalable Boot Provisioning Service (SBPS).

    The kernel parameters contain the string root=sbps-s3 when SBPS is
    in use.

    Input:
      * kernel_parameters: the kernel boot parameters; may be None or empty
        when a component has no kernel parameters set
    Return:
      True if the parameters contain 'root=sbps-s3', False otherwise
      (including when no parameters were supplied).
    """
    # A component may have a kernel or initrd but no kernel parameters; treat
    # that as "not using SBPS" rather than failing on None.
    if not kernel_parameters:
        return False
    # The marker contains no regex metacharacters, so a plain substring test
    # is equivalent to searching for it with a regular expression.
    return "root=sbps-s3" in kernel_parameters

def components_by_id(components: List[dict]) -> dict:
    """
    Index a list of components by their ID.

    Input:
      * components: a list containing individual components; each one must
        have an "id" key
    Return:
      A dictionary with the "id" of each component as the key and the
      value being the entire component itself (the same object, not a copy).
      If several components share an ID, the last one in the list wins.
    Purpose: It makes searching more efficient because you can
             index by component ID.
    """
    # A comprehension avoids local names that would shadow the builtin id()
    # and this function's own name.
    return {component["id"]: component for component in components}

def reverse_components_by_id(components_by_id: dict) -> List[dict]:
    """
    Reverse the effect of components_by_id.

    Input:
      * components_by_id: a dictionary with the ID of each component as the
        key and the value being the entire component itself
    Return:
      A new list with each component (each dictionary value) as an element,
      in the dictionary's iteration order.
    """
    # list() copies the values view directly; no element-by-element
    # comprehension is needed.
    return list(components_by_id.values())
2 changes: 1 addition & 1 deletion src/bos/operators/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def _run_on_chunk(self, components: List[dict]) -> None:
components = self._act(components)
except Exception as e:
LOGGER.error("An unhandled exception was caught while trying to act on components: %s",
e, exec_info=True)
e, exc_info=True)
for component in components:
component["error"] = str(e)
self._update_database(components)
Expand Down
123 changes: 102 additions & 21 deletions src/bos/operators/power_on.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@
from collections import defaultdict
import logging
from requests import HTTPError
from typing import Dict, List, Set, Tuple, Union

from bos.common.utils import exc_type_msg
from bos.common.utils import exc_type_msg, get_image_id_from_kernel, using_sbps_check_kernel_parameters, components_by_id
from bos.common.values import Action, Status
from bos.operators.utils.clients import bss
from bos.operators.utils.clients import pcs
from bos.operators.utils.clients.ims import tag_image
from bos.operators.utils.clients.cfs import set_cfs
from bos.operators.base import BaseOperator, main
from bos.operators.filters import BOSQuery, HSMState
Expand Down Expand Up @@ -59,12 +61,19 @@ def filters(self):
HSMState()
]

def _act(self, components):
def _act(self, components: Union[List[dict],None]):
if not components:
return components
self._preset_last_action(components)

boot_artifacts, sessions = self._sort_components_by_boot_artifacts(components)

try:
self._tag_images(boot_artifacts, components)
except Exception as e:
raise Exception(f"Error encountered tagging images {e}.")
try:
self._set_bss(components)
self._set_bss(boot_artifacts, bos_sessions=sessions)
except Exception as e:
raise Exception(f"Error encountered setting BSS information: {e}") from e
try:
Expand All @@ -78,7 +87,43 @@ def _act(self, components):
raise Exception(f"Error encountered calling CAPMC to power on: {e}") from e
return components

def _set_bss(self, components, retries=5):
def _sort_components_by_boot_artifacts(self, components: List[dict]) -> tuple[Dict, Dict]:
"""
Create a two dictionaries.
The first dictionary has keys with a unique combination of boot artifacts associated with
a single boot image. They appear in this order:
* kernel
* kernel parameters
* initrd
The first dictionary's values are a set of the nodes that boot with those boot
artifacts.
The second dictionary has keys that are nodes and values are that node's BOS
session.
Inputs:
* components: A list where each element is a component describe by a dictionary
Returns: A tuple containing the first and second dictionary.
"""
boot_artifacts = defaultdict(set)
bos_sessions = {}
for component in components:
# Handle the boot artifacts
nodes_boot_artifacts = component.get('desired_state', {}).get('boot_artifacts', {})
kernel = nodes_boot_artifacts.get('kernel')
kernel_parameters = nodes_boot_artifacts.get('kernel_parameters')
initrd = nodes_boot_artifacts.get('initrd')
if not any([kernel, kernel_parameters, initrd]):
continue
key = (kernel, kernel_parameters, initrd)
boot_artifacts[key].add(component['id'])
# Handle the session
bos_sessions[component['id']] = component.get('session', "")

return (boot_artifacts, bos_sessions)

def _set_bss(self, boot_artifacts, bos_sessions, retries=5):
"""
Set the boot artifacts (kernel, kernel parameters, and initrd) in BSS.
Receive a BSS_REFERRAL_TOKEN from BSS.
Expand All @@ -88,26 +133,12 @@ def _set_bss(self, components, retries=5):
Because the connection to the BSS tokens database can be lost due to
infrequent use, retry up to retries number of times.
"""
if not components:
if not boot_artifacts:
# If we have been passed an empty list, there is nothing to do.
LOGGER.debug("_set_bss: No components to act on")
return
parameters = defaultdict(set)
sessions = {}
for component in components:
# Handle the boot artifacts
boot_artifacts = component.get('desired_state', {}).get('boot_artifacts', {})
kernel = boot_artifacts.get('kernel')
kernel_parameters = boot_artifacts.get('kernel_parameters')
initrd = boot_artifacts.get('initrd')
if not any([kernel, kernel_parameters, initrd]):
continue
key = (kernel, kernel_parameters, initrd)
parameters[key].add(component['id'])
# Handle the session
sessions[component['id']] = component.get('session', "")
bss_tokens = []
for key, nodes in parameters.items():
for key, nodes in boot_artifacts.items():
kernel, kernel_parameters, initrd = key
try:
resp = bss.set_bss(node_set=nodes, kernel_params=kernel_parameters,
Expand All @@ -134,7 +165,7 @@ def _set_bss(self, components, retries=5):
for node in nodes:
bss_tokens.append({"id": node,
"desired_state": {"bss_token": token},
"session": sessions[node]})
"session": bos_sessions[node]})
LOGGER.info('Found %d components that require BSS token updates', len(bss_tokens))
if not bss_tokens:
return
Expand All @@ -147,5 +178,55 @@ def _set_bss(self, components, retries=5):
redacted_component_updates)
self.bos_client.components.update_components(bss_tokens)

def _tag_images(self, boot_artifacts: Dict[Tuple[str, str, str], Set[str]], components: List[dict]) -> None:
    """
    If the component is receiving its root file system via the SBPS provisioner,
    then tag that image in IMS, so that SBPS makes it available.
    This requires finding the IMS image ID associated with each component.
    Many components may be booted with the same image, but the image only needs to
    be tagged once.

    A failure to tag one image does not stop the remaining images from being
    tagged: the failure is logged, recorded in the "error" field of every
    component booting with that image, and those components are written back
    with self._update_database.

    Inputs:
      * boot_artifacts: A dictionary keyed with a unique combination of boot artifacts
                        in this order:
                          * kernel
                          * kernel parameters
                          * initrd
                        These boot artifacts together represent a unique boot image
                        and are used to identify that image.
                        The values are the set of components being booted with that image.
      * components: A list where each element is a component described by a dictionary
                    This is used to update the component with an error should one
                    occur.
    """
    if not boot_artifacts:
        # If we have been passed an empty dictionary, there is nothing to do.
        LOGGER.debug("_tag_images: No components to act on.")
        return

    # Map each SBPS image ID to every node booting with it. Several boot
    # artifact combinations can refer to the same image (same kernel path but
    # different kernel parameters or initrd), so the node sets are merged
    # rather than overwritten.
    image_id_to_nodes = defaultdict(set)
    for (kernel, kernel_parameters, _initrd), nodes in boot_artifacts.items():
        if using_sbps_check_kernel_parameters(kernel_parameters):
            image_id_to_nodes[get_image_id_from_kernel(kernel)].update(nodes)

    my_components_by_id = components_by_id(components)
    for image_id, nodes in image_id_to_nodes.items():
        try:
            tag_image(image_id, "set", "sbps-project", "true")
        except Exception as e:
            # Best effort: keep tagging the other images, but leave a trace of
            # the failure in the log and on each affected component.
            LOGGER.error("Failed to tag image %s with 'sbps-project': %s",
                         image_id, exc_type_msg(e))
            components_to_update = []
            for node in nodes:
                my_components_by_id[node]["error"] = str(e)
                components_to_update.append(my_components_by_id[node])
            self._update_database(components_to_update)

# When this module is run as a script, hand the PowerOnOperator class to main().
if __name__ == '__main__':
    main(PowerOnOperator)
84 changes: 84 additions & 0 deletions src/bos/operators/utils/clients/ims.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#
# MIT License
#
# (C) Copyright 2024 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#

import logging
from requests.exceptions import HTTPError

from bos.common.utils import compact_response_text, exc_type_msg, requests_retry_session, PROTOCOL

# Service name used as the host portion of the IMS URLs built below.
SERVICE_NAME = 'cray-ims'
# IMS API version used as the first path segment of the URLs built below.
IMS_VERSION = 'v3'
BASE_ENDPOINT = f"{PROTOCOL}://{SERVICE_NAME}/{IMS_VERSION}"
# Collection endpoint for images; patch_image appends the image ID to it.
IMAGES_ENDPOINT = f"{BASE_ENDPOINT}/images"

LOGGER = logging.getLogger('bos.operators.utils.clients.ims')
# Operations accepted by tag_image; any other value raises TagFailure.
IMS_TAG_OPERATIONS = ['set', 'remove']

class TagFailure(Exception):
    """Raised by tag_image when the requested operation or key is not valid."""

def patch_image(image_id, data, session=None):
    """
    Send a PATCH request to IMS for a single image.

    Inputs:
      * image_id: ID of the image to patch; appended to IMAGES_ENDPOINT
      * data: the request body, sent as JSON. If it is empty or None, a
              warning is logged and no request is made.
      * session: optional session object to make the request with; when not
                 supplied, one is obtained from requests_retry_session()
    Raises:
      HTTPError: re-raised after being logged when IMS responds with an
                 error status.
    """
    if not data:
        LOGGER.warning("patch_image called without data; returning without action.")
        return
    if not session:
        session = requests_retry_session()
    # Build the URL once so that the debug log shows exactly what is requested,
    # including the image ID.
    url = f"{IMAGES_ENDPOINT}/{image_id}"
    LOGGER.debug("PATCH %s with body=%s", url, data)
    response = session.patch(url, json=data)
    LOGGER.debug("Response status code=%d, reason=%s, body=%s", response.status_code,
                 response.reason, compact_response_text(response.text))
    try:
        response.raise_for_status()
    except HTTPError as err:
        LOGGER.error("Failed asking IMS to tag image: %s", exc_type_msg(err))
        raise

def tag_image(image_id: str, operation: str, key: str, value: str = None, session=None) -> None:
    """
    Ask IMS to set or remove a metadata tag on an image.

    Inputs:
      * image_id: ID of the image to tag
      * operation: one of the values in IMS_TAG_OPERATIONS
      * key: the tag's key; must not be empty
      * value: the tag's value; optional
      * session: optional session object to make the request with; when not
                 supplied, one is obtained from requests_retry_session()
    Raises:
      TagFailure: if operation is not in IMS_TAG_OPERATIONS or key is empty.
      Any exception raised by patch_image is passed through.
    """
    # Validate the arguments; the operation is checked before the key.
    problem = None
    if operation not in IMS_TAG_OPERATIONS:
        problem = f"{operation} not valid. Expecting one of {IMS_TAG_OPERATIONS}"
    elif not key:
        problem = f"key must exist: {key}"
    if problem is not None:
        LOGGER.error(problem)
        raise TagFailure(problem)

    if value:
        description = f"Patching image {image_id} {operation}ing key: {key} value: {value}"
    else:
        description = f"Patching image {image_id} {operation}ing key: {key}"
    LOGGER.debug(description)

    session = session or requests_retry_session()

    metadata = {"operation": operation, "key": key, "value": value}
    patch_image(image_id=image_id, data={"metadata": metadata}, session=session)

0 comments on commit 9190ccc

Please sign in to comment.