Skip to content

Commit

Permalink
Merge pull request #316 from Cray-HPE/casmtriage-6993-csm-1.6
Browse files Browse the repository at this point in the history
CASMTRIAGE-6993: Address BOS OOMKill issues at scale
  • Loading branch information
mharding-hpe authored May 22, 2024
2 parents 6c2addf + 0092e3c commit aa773ad
Show file tree
Hide file tree
Showing 11 changed files with 52 additions and 41 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Changed
- Increase memory requests and limits for BOS pods, to prevent OOM kill issues seen at scale.
- Set UWSGI `max-requests` and `harakiri` options to help avoid OOM and scaling issues.

### Dependencies
- Bump `openapi-generator-cli` from v6.6.0 to v7.6.0, in preparation for moving the API
spec to OAS 3.1

### Fixed
- Addressed linter complaints

## [2.17.7] - 2024-05-16
### Changed
- Added more checks to avoid operating on empty lists
Expand Down
4 changes: 4 additions & 0 deletions config/uwsgi.ini
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,7 @@ callable=app
processes=8
threads=16
virtualenv=/app/venv
# Added to try and help avoid OOM issues, based on
# CASMTRIAGE-5369/CASMTRIAGE-6993
max-requests=1024
harakiri=30
4 changes: 2 additions & 2 deletions kubernetes/cray-bos/values.yaml.in
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ cray-service:
failureThreshold: 20
resources:
requests:
memory: "200Mi"
memory: "600Mi"
cpu: "300m"
limits:
memory: "500Mi"
memory: "1Gi"
cpu: "1000m"
volumes:
ca-vol:
Expand Down
2 changes: 1 addition & 1 deletion src/bos/operators/power_on.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def _set_bss(self, components, retries=5):
if not bss_tokens:
return
redacted_component_updates = [
{ "id": comp["id"],
{ "id": comp["id"],
"session": comp["session"]
}
for comp in bss_tokens ]
Expand Down
2 changes: 1 addition & 1 deletion src/bos/operators/utils/clients/pcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class PowerControlComponentsEmptyException(Exception):
Raised when one of the PCS utility functions that requires a non-empty
list of components is passed an empty component list. This will only
happen in the case of a programming bug.
This exception is not raised for functions that require a node list
but that are able to return a sensible object to the caller that
indicates nothing has been done. For example, the status function.
Expand Down
46 changes: 23 additions & 23 deletions src/bos/operators/utils/clients/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def url(self):
def s3_client(connection_timeout=60, read_timeout=60):
"""
Return an s3 client
Args:
connection_timeout -- Number of seconds to wait to time out the connection
Default: 60 seconds
Expand All @@ -102,8 +102,8 @@ def s3_client(connection_timeout=60, read_timeout=60):
Returns:
Returns an s3 client object
Raises:
S3MissingConfiguration -- it cannot contact S3 because it did not have the proper
credentials or configuration
S3MissingConfiguration -- it cannot contact S3 because it did not have the proper
credentials or configuration
"""
try:
s3_access_key = os.environ['S3_ACCESS_KEY']
Expand Down Expand Up @@ -144,14 +144,14 @@ def __init__(self, path, etag=None):
@property
def object_header(self) -> dict:
"""
Get the S3 object's header metadata.
Get the S3 object's header metadata.
Return:
The S3 object headers (dict)
Raises:
ClientError
ClientError
"""

try:
Expand All @@ -176,16 +176,16 @@ def object_header(self) -> dict:
def object(self):
"""
The S3 object itself. If the object was not found, log it and return an error.
Args:
path -- path to the S3 key
etag -- Entity tag
Return:
S3 Object
Raises:
boto3.exceptions.ClientError -- when it cannot read from S3
boto3.exceptions.ClientError -- when it cannot read from S3
"""

s3 = s3_client()
Expand Down Expand Up @@ -215,16 +215,16 @@ def __init__(self, path, etag=None):
def manifest_json(self):
"""
Read a manifest.json file from S3. If the object was not found, log it and return an error.
Args:
path -- path to the S3 key
etag -- Entity tag
Return:
Manifest file in JSON format
Raises:
boto3.exceptions.ClientError -- when it cannot read from S3
boto3.exceptions.ClientError -- when it cannot read from S3
"""

if self._manifest_json:
Expand All @@ -247,7 +247,7 @@ def manifest_json(self):
def _get_artifact(self, artifact_type):
"""
Get the artifact_type artifact object out of the manifest.
The artifact object looks like this
{
"link": {
Expand All @@ -258,10 +258,10 @@ def _get_artifact(self, artifact_type):
"type": "application/vnd.cray.image.rootfs.squashfs",
"md5": "cccccckvnfdikecvecdngnljnnhvdlvbkueckgbkelee"
}
Return:
Artifact object
Raises:
ValueError -- Manifest file is corrupt or invalid
ArtifactNotFound -- The requested artifact is missing
Expand All @@ -288,7 +288,7 @@ def _get_artifact(self, artifact_type):
def initrd(self):
"""
Get the initrd artifact object out of the manifest.
Return:
initrd object
"""
Expand All @@ -298,7 +298,7 @@ def initrd(self):
def kernel(self):
"""
Get the kernel artifact object out of the manifest.
Return:
Kernel object
"""
Expand All @@ -308,7 +308,7 @@ def kernel(self):
def boot_parameters(self):
"""
Get the boot parameters artifact object out of the manifest, if one exists.
Return:
boot parameters object if one exists, else None
"""
Expand All @@ -323,7 +323,7 @@ def boot_parameters(self):
def rootfs(self):
"""
Get the rootfs artifact object out of the manifest.
Return:
rootfs object
"""
Expand Down
2 changes: 1 addition & 1 deletion src/bos/reporter/components/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# OTHER DEALINGS IN THE SOFTWARE.
#
"""
This is a client module to the BOS component state API.
This is a client module to the BOS component state API.
The primary use for this is to allow nodes to indicate the
state of their boot artifacts as indicated by the BOS Session ID.
Expand Down
4 changes: 2 additions & 2 deletions src/bos/reporter/proc_cmdline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# MIT License
#
# (C) Copyright 2021-2022 Hewlett Packard Enterprise Development LP
# (C) Copyright 2021-2022, 2024 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
Expand Down Expand Up @@ -32,7 +32,7 @@ def proc_cmdline():
the order they appear.
Emits both tuples and strings.
Raises:
OSError if it cannot open /proc/cmdline
"""
Expand Down
12 changes: 6 additions & 6 deletions src/bos/server/controllers/v2/boot_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,22 @@ def validate_boot_sets(session_template: dict,
Validates the boot sets listed in a session template.
It ensures that there are boot sets.
It checks that each boot set specifies nodes via one of the specifier fields.
Ensures that the boot artifacts exist.
Ensures that the boot artifacts exist.
Inputs:
session_template (dict): Session template data
operation (str): Requested operation
operation (str): Requested operation
template_name (str): The name of the session template; Note, during Session template
creation, the name in the session template data does not have
creation, the name in the session template data does not have
to match the name used to create the session template.
Returns:
Returns an error_code and a message
error_code:
0 -- Success
1 -- Warning, not fatal
2 -- Error, fatal
"""
# Verify boot sets exist.
if 'boot_sets' not in session_template or not session_template['boot_sets']:
Expand Down
8 changes: 4 additions & 4 deletions src/bos/server/dbs/boot_artifacts.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# MIT License
#
# (C) Copyright 2022 Hewlett Packard Enterprise Development LP
# (C) Copyright 2022, 2024 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
Expand Down Expand Up @@ -35,7 +35,7 @@ class BssTokenException(Exception):


class BssTokenUnknown(BssTokenException):
"""
"""
The BSS Token is not present in the database.
"""
pass
Expand All @@ -62,10 +62,10 @@ def record_boot_artifacts(token: str,
def get_boot_artifacts(token: str) -> dict:
"""
Get the boot artifacts associated with a BSS token.
Returns:
Boot artifacts (dict)
Raises:
BssTokenUnknown
"""
Expand Down
2 changes: 1 addition & 1 deletion src/bos/server/migrations.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@


def perform_migrations():
# Not removing this file entirely because we are going to be adding
# Not removing this file entirely because we are going to be adding
# code here to migrate the previous BOS data to enforce API field
# restrictions
pass
Expand Down

0 comments on commit aa773ad

Please sign in to comment.