From 9190ccca8b4b508cf2d1f9724a0325f193f484f0 Mon Sep 17 00:00:00 2001 From: Jason Sollom Date: Tue, 20 Aug 2024 13:06:30 -0500 Subject: [PATCH] CASMCMS-9078: BOS tags SBPS images The Scalable Boot Provisioning Service (SBPS) provides root filesystems to nodes when they boot. The images containing these root filesystems need to be tagged in the Image Management Service with 'sbps-project: true' before SBPS projects them to the nodes. With this change, BOS applies this key/value tag to any rootfs image it boots a node with, ensuring that the image is projected during booting. This relieves the admin from needing to manually tag the image. This is a quality-of-life improvement. (cherry picked from commit 944e63776b5fd8fe4e83c5dee4685b2a5f90b256) --- CHANGELOG.md | 4 +- src/bos/common/utils.py | 85 +++++++++++++++++ src/bos/operators/base.py | 2 +- src/bos/operators/power_on.py | 123 ++++++++++++++++++++----- src/bos/operators/utils/clients/ims.py | 84 +++++++++++++++++ 5 files changed, 275 insertions(+), 23 deletions(-) create mode 100644 src/bos/operators/utils/clients/ims.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 8375efc0..67560868 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,9 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [2.26.0] - 2024-08-20 +### Added +- BOS automatically tags IMS images with the 'sbps-project: true' tag when using SBPS as the rootfs provider. 
def get_image_id(component: dict) -> str:
    """
    Extract the IMS image ID from the path to the component's kernel.
    We expect the kernel path to look something like this:
    s3://boot-images/fbcc5b02-b6a4-46a8-9402-2b7138adc327/kernel

    Input:
      * component: a component dictionary; the kernel path is read from
                   component['desired_state']['boot_artifacts']['kernel']
    Return:
      The image ID (the path element immediately preceding 'kernel').
    Raises:
      ValueError if the kernel path is missing or not in the expected form.
    """
    boot_artifacts = component.get('desired_state', {}).get('boot_artifacts', {})
    return get_image_id_from_kernel(boot_artifacts.get('kernel'))


def get_image_id_from_kernel(kernel_path: str) -> str:
    """
    Extract the IMS image ID from a kernel path of the form
    <scheme>://<bucket>/<image ID>/kernel

    Raises:
      ValueError if kernel_path is empty/None or does not match that form.
    """
    if not kernel_path:
        # Fail with a clear message instead of a TypeError from the regex engine.
        raise ValueError("Unable to determine IMS image ID: no kernel path was specified")
    match = re.match(r'.*//.*/(.*)/kernel', kernel_path)
    if match is None:
        # Previously this surfaced as "'NoneType' object has no attribute 'group'".
        raise ValueError(
            f"Unable to determine IMS image ID from kernel path '{kernel_path}'")
    return match.group(1)


def using_sbps(component: dict) -> bool:
    """
    If the component is using the Scalable Boot Provisioning Service (SBPS) to
    provide the root filesystem, then return True.
    Otherwise, return False.

    The kernel parameters will contain the string root=sbps-s3 if it is using
    SBPS. A component with no kernel parameters is not using SBPS.
    """
    boot_artifacts = component.get('desired_state', {}).get('boot_artifacts', {})
    return using_sbps_check_kernel_parameters(boot_artifacts.get('kernel_parameters'))


def using_sbps_check_kernel_parameters(kernel_parameters: str) -> bool:
    """
    Check the kernel boot parameters to see if the image is using the
    rootfs provider 'sbps', the Scalable Boot Provisioning Service (SBPS).
    The kernel parameters will contain the string root=sbps-s3 if it is using
    SBPS.

    Return True if it is and False if it is not. Empty or missing (None)
    kernel parameters return False.
    """
    if not kernel_parameters:
        # Components may have no kernel parameters set; that is not SBPS.
        return False
    # A plain substring test is equivalent to searching for this literal pattern.
    return "root=sbps-s3" in kernel_parameters


def components_by_id(components: List[dict]) -> dict:
    """
    Input:
      * components: a list containing individual components
    Return:
      A dictionary with the ID of each component as the
      key and the value being the entire component itself.

    Purpose: It makes searching more efficient because you can
    index by component ID.
    """
    return {component["id"]: component for component in components}


def reverse_components_by_id(components_by_id: dict) -> List[dict]:
    """
    Input:
      components_by_id: a dictionary with the ID of each component as the
      key and the value being the entire component itself.
    Return:
      A list with each component as an element

    Purpose: Reverse the effect of components_by_id.
    """
    return list(components_by_id.values())
bos_sessions=sessions) except Exception as e: raise Exception(f"Error encountered setting BSS information: {e}") from e try: @@ -78,7 +87,43 @@ def _act(self, components): raise Exception(f"Error encountered calling CAPMC to power on: {e}") from e return components - def _set_bss(self, components, retries=5): + def _sort_components_by_boot_artifacts(self, components: List[dict]) -> tuple[Dict, Dict]: + """ + Create a two dictionaries. + The first dictionary has keys with a unique combination of boot artifacts associated with + a single boot image. They appear in this order: + * kernel + * kernel parameters + * initrd + The first dictionary's values are a set of the nodes that boot with those boot + artifacts. + + The second dictionary has keys that are nodes and values are that node's BOS + session. + + Inputs: + * components: A list where each element is a component describe by a dictionary + + Returns: A tuple containing the first and second dictionary. + """ + boot_artifacts = defaultdict(set) + bos_sessions = {} + for component in components: + # Handle the boot artifacts + nodes_boot_artifacts = component.get('desired_state', {}).get('boot_artifacts', {}) + kernel = nodes_boot_artifacts.get('kernel') + kernel_parameters = nodes_boot_artifacts.get('kernel_parameters') + initrd = nodes_boot_artifacts.get('initrd') + if not any([kernel, kernel_parameters, initrd]): + continue + key = (kernel, kernel_parameters, initrd) + boot_artifacts[key].add(component['id']) + # Handle the session + bos_sessions[component['id']] = component.get('session', "") + + return (boot_artifacts, bos_sessions) + + def _set_bss(self, boot_artifacts, bos_sessions, retries=5): """ Set the boot artifacts (kernel, kernel parameters, and initrd) in BSS. Receive a BSS_REFERRAL_TOKEN from BSS. @@ -88,26 +133,12 @@ def _set_bss(self, components, retries=5): Because the connection to the BSS tokens database can be lost due to infrequent use, retry up to retries number of times. 
def _tag_images(self, boot_artifacts: Dict[Tuple[str, str, str], Set[str]], components: List[dict]) -> None:
    """
    If the component is receiving its root file system via the SBPS provisioner,
    then tag that image in IMS, so that SBPS makes it available.
    This requires finding the IMS image ID associated with each component.
    Many components may be booted with the same image, but the image only needs to
    be tagged once.

    If tagging an image fails, the error is recorded on every component that
    boots with that image and those components are written to the database.
    The exception is not re-raised, so the remaining images are still attempted.

    Inputs:
      * boot_artifacts: A dictionary keyed with a unique combination of boot artifacts
                        in this order:
                          * kernel
                          * kernel parameters
                          * initrd
                        These boot artifacts together represent a unique boot image
                        and are used to identify that image.
                        The values are the set of components being booted with that image.
      * components: A list where each element is a component described by a dictionary.
                    This is used to update the component with an error should one
                    occur.
    """
    if not boot_artifacts:
        # If we have been passed an empty dictionary, there is nothing to do.
        LOGGER.debug("_tag_images: No components to act on.")
        return

    # Map each SBPS image ID to ALL of the nodes booting from it. Several
    # boot artifact keys can share one kernel (and therefore one image ID)
    # while differing in kernel parameters or initrd, so the node sets are
    # merged rather than overwritten.
    image_id_to_nodes = defaultdict(set)
    for (kernel, kernel_parameters, _initrd), nodes in boot_artifacts.items():
        # kernel_parameters may be None when only a kernel/initrd is set;
        # such a key cannot be using SBPS.
        if kernel_parameters and using_sbps_check_kernel_parameters(kernel_parameters):
            image_id_to_nodes[get_image_id_from_kernel(kernel)].update(nodes)

    my_components_by_id = components_by_id(components)
    for image_id, nodes in image_id_to_nodes.items():
        try:
            tag_image(image_id, "set", "sbps-project", "true")
        except Exception as e:
            components_to_update = []
            for node in nodes:
                my_components_by_id[node]["error"] = str(e)
                components_to_update.append(my_components_by_id[node])
            self._update_database(components_to_update)
class TagFailure(Exception):
    """Raised by tag_image when the requested tag operation or key is invalid."""


def patch_image(image_id, data, session=None):
    """
    Send a PATCH request for the specified image to IMS.

    Inputs:
      * image_id: ID of the IMS image to patch
      * data: the JSON-serializable request body; if empty, a warning is
              logged and no request is made
      * session: optional requests session; a retrying session is created
                 when none is supplied

    Raises:
      HTTPError if IMS responds with an error status.
    """
    if not data:
        LOGGER.warning("patch_image called without data; returning without action.")
        return
    if not session:
        session = requests_retry_session()
    # Build the URL once so the debug log shows the URL that is actually patched.
    url = f"{IMAGES_ENDPOINT}/{image_id}"
    LOGGER.debug("PATCH %s with body=%s", url, data)
    response = session.patch(url, json=data)
    LOGGER.debug("Response status code=%d, reason=%s, body=%s", response.status_code,
                 response.reason, compact_response_text(response.text))
    try:
        response.raise_for_status()
    except HTTPError as err:
        LOGGER.error("Failed asking IMS to tag image: %s", exc_type_msg(err))
        raise


def tag_image(image_id: str, operation: str, key: str, value: str = None, session=None) -> None:
    """
    Set or remove a metadata tag on an IMS image.

    Inputs:
      * image_id: ID of the IMS image to tag
      * operation: one of IMS_TAG_OPERATIONS
      * key: the tag's key; must be non-empty
      * value: the tag's value (optional)
      * session: optional requests session; a retrying session is created
                 when none is supplied

    Raises:
      TagFailure if the operation is not recognized or the key is empty.
      HTTPError (from patch_image) if IMS responds with an error status.
    """
    if operation not in IMS_TAG_OPERATIONS:
        msg = f"{operation} not valid. Expecting one of {IMS_TAG_OPERATIONS}"
        LOGGER.error(msg)
        raise TagFailure(msg)

    if not key:
        msg = f"key must exist: {key}"
        LOGGER.error(msg)
        raise TagFailure(msg)

    # Lazy %-style arguments: the message is only formatted if DEBUG is enabled.
    if value:
        LOGGER.debug("Patching image %s %sing key: %s value: %s", image_id, operation, key, value)
    else:
        LOGGER.debug("Patching image %s %sing key: %s", image_id, operation, key)

    if not session:
        session = requests_retry_session()

    data = {
        "metadata": {
            "operation": operation,
            "key": key,
            "value": value
        }
    }
    patch_image(image_id=image_id, data=data, session=session)