Skip to content

Commit

Permalink
CASMCMS-8997: Improve BOS logging of unexpected errors
Browse files Browse the repository at this point in the history
  • Loading branch information
mharding-hpe committed May 16, 2024
1 parent acfe142 commit d578ff5
Show file tree
Hide file tree
Showing 25 changed files with 137 additions and 83 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- Added more checks to avoid operating on empty lists
- Compact response bodies to single line before logging them
- Improve BOS logging of unexpected errors

## [2.17.6] - 2024-04-19
### Fixed
Expand Down
6 changes: 4 additions & 2 deletions src/bos/common/tenant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import logging
import hashlib
from requests.exceptions import HTTPError
from bos.common.utils import requests_retry_session, PROTOCOL
from bos.common.utils import exc_type_msg, requests_retry_session, PROTOCOL

LOGGER = logging.getLogger('bos.common.tenant_utils')

Expand Down Expand Up @@ -78,7 +78,7 @@ def get_tenant_data(tenant, session=None):
try:
response.raise_for_status()
except HTTPError as e:
LOGGER.error("Failed getting tenant data from tapms: %s", e)
LOGGER.error("Failed getting tenant data from tapms: %s", exc_type_msg(e))
if response.status_code == 404:
raise InvalidTenantException(f"Data not found for tenant {tenant}") from e
else:
Expand Down Expand Up @@ -110,6 +110,7 @@ def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except InvalidTenantException as e:
LOGGER.debug("Invalid tenant: %s", exc_type_msg(e))
return connexion.problem(
status=400, title='Invalid tenant',
detail=str(e))
Expand All @@ -122,6 +123,7 @@ def reject_invalid_tenant(func):
def wrapper(*args, **kwargs):
tenant = get_tenant_from_header()
if tenant and not validate_tenant_exists(tenant):
LOGGER.debug("The provided tenant does not exist")
return connexion.problem(
status=400, title="Invalid tenant",
detail=str("The provided tenant does not exist"))
Expand Down
8 changes: 8 additions & 0 deletions src/bos/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#
import datetime
import re
import traceback
from dateutil.parser import parse
import requests
from requests.adapters import HTTPAdapter
Expand Down Expand Up @@ -107,3 +108,10 @@ def compact_response_text(response_text: str) -> str:
if response_text:
return ' '.join([ line.strip() for line in response_text.split('\n') ])
return str(response_text)


def exc_type_msg(exc: Exception) -> str:
    """
    Given an exception, returns a string of its type and its text
    (e.g. TypeError: 'int' object is not subscriptable)

    The result is intended to be embedded in log messages, so it carries no
    trailing newline. Exceptions that format to more than one line (for
    example SyntaxError) keep their internal line breaks.
    """
    # format_exception_only() terminates every line it returns with '\n'; strip
    # the final one so callers do not log a stray line break after the message.
    formatted_lines = traceback.format_exception_only(type(exc), exc)
    return ''.join(formatted_lines).rstrip('\n')
3 changes: 2 additions & 1 deletion src/bos/operators/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import time
from typing import List, NoReturn, Type

from bos.common.utils import exc_type_msg
from bos.common.values import Status
from bos.operators.filters.base import BaseFilter
from bos.operators.utils.clients.bos.options import options
Expand Down Expand Up @@ -266,7 +267,7 @@ def _update_log_level() -> None:
LOGGER.log(new_level, 'Logging level changed from {} to {}'.format(
logging.getLevelName(current_level), logging.getLevelName(new_level)))
except Exception as e:
LOGGER.error('Error updating logging level: {}'.format(e))
LOGGER.error('Error updating logging level: %s', exc_type_msg(e))


def _liveliness_heartbeat() -> NoReturn:
Expand Down
10 changes: 6 additions & 4 deletions src/bos/operators/power_on.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import logging
from requests import HTTPError

from bos.common.utils import exc_type_msg
from bos.common.values import Action, Status
import bos.operators.utils.clients.bss as bss
import bos.operators.utils.clients.pcs as pcs
Expand Down Expand Up @@ -112,9 +113,9 @@ def _set_bss(self, components, retries=5):
resp = bss.set_bss(node_set=nodes, kernel_params=kernel_parameters,
kernel=kernel, initrd=initrd)
resp.raise_for_status()
except HTTPError:
LOGGER.error(f"Failed to set BSS for boot artifacts: {key} for"
"nodes: {nodes}. Error: {err}")
except HTTPError as err:
LOGGER.error("Failed to set BSS for boot artifacts: %s for nodes: %s. Error: %s",
key, nodes, exc_type_msg(err))
else:
token = resp.headers['bss-referral-token']
attempts = 0
Expand All @@ -124,7 +125,8 @@ def _set_bss(self, components, retries=5):
break
except Exception as err:
attempts += 1
LOGGER.error(f"An error occurred attempting to record the BSS token: {err}")
LOGGER.error("An error occurred attempting to record the BSS token: %s",
exc_type_msg(err))
if attempts > retries:
raise
LOGGER.info("Retrying to record the BSS token.")
Expand Down
5 changes: 3 additions & 2 deletions src/bos/operators/session_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from bos.operators.utils.clients.bos.options import options
from bos.operators.utils.rootfs.factory import ProviderFactory
from bos.operators.session_completion import SessionCompletionOperator
from bos.common.utils import exc_type_msg
from bos.common.values import Action, EMPTY_ACTUAL_STATE, EMPTY_DESIRED_STATE, EMPTY_STAGED_STATE
from bos.common.tenant_utils import get_tenant_component_set, InvalidTenantException

Expand Down Expand Up @@ -135,7 +136,7 @@ def _setup_components(self):
if not all_component_ids:
raise SessionSetupException("No nodes were found to act upon.")
except Exception as err:
raise SessionSetupException(err)
raise SessionSetupException(err) from err
else:
self._log(LOGGER.info, 'Found %d components that require updates', len(data))
self._log(LOGGER.debug, f'Updated components: {data}')
Expand Down Expand Up @@ -413,7 +414,7 @@ def assemble_kernel_boot_parameters(self, boot_set, artifact_info):
except (ClientError, UnicodeDecodeError, S3ObjectNotFound) as error:
self._log(LOGGER.error, "Unable to read file {}. Thus, no kernel boot parameters obtained "
"from image".format(artifact_info['boot_parameters']))
LOGGER.error(error)
LOGGER.error(exc_type_msg(error))
raise

# Parameters from the BOS Session template if the parameters exist.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#
# MIT License
#
# (C) Copyright 2021-2022 Hewlett Packard Enterprise Development LP
# (C) Copyright 2021-2022, 2024 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
Expand All @@ -25,8 +25,9 @@

from botocore.exceptions import ClientError

from . import BootImageMetaData, BootImageMetaDataBadRead
from ..clients.s3 import S3BootArtifacts, S3MissingConfiguration, ArtifactNotFound
from bos.common.utils import exc_type_msg
from bos.operators.utils.boot_image_metadata import BootImageMetaData, BootImageMetaDataBadRead
from bos.operators.utils.clients.s3 import S3BootArtifacts, S3MissingConfiguration, ArtifactNotFound

LOGGER = logging.getLogger('bos.operators.utils.boot_image_metadata.s3_boot_image_metadata')

Expand All @@ -45,27 +46,27 @@ def __init__(self, boot_set):
try:
self.artifact_summary['kernel'] = self.kernel_path
except ArtifactNotFound as err:
LOGGER.warn(err)
LOGGER.warn(exc_type_msg(err))
try:
self.artifact_summary['initrd'] = self.initrd_path
except ArtifactNotFound as err:
LOGGER.warn(err)
LOGGER.warn(exc_type_msg(err))
try:
self.artifact_summary['rootfs'] = self.rootfs_path
except ArtifactNotFound as err:
LOGGER.warn(err)
LOGGER.warn(exc_type_msg(err))
try:
self.artifact_summary['rootfs_etag'] = self.rootfs_etag
except ArtifactNotFound as err:
LOGGER.warn(err)
LOGGER.warn(exc_type_msg(err))
try:
self.artifact_summary['boot_parameters'] = self.boot_parameters_path
except ArtifactNotFound as err:
LOGGER.warn(err)
LOGGER.warn(exc_type_msg(err))
try:
self.artifact_summary['boot_parameters_etag'] = self.boot_parameters_etag
except ArtifactNotFound as err:
LOGGER.warn(err)
LOGGER.warn(exc_type_msg(err))

@property
def metadata(self):
Expand All @@ -79,7 +80,7 @@ def metadata(self):
try:
return self.boot_artifacts.manifest_json
except (ClientError, S3MissingConfiguration) as error:
LOGGER.error("Unable to read %s -- Error: %s", self._boot_set.get('path', ''), error)
LOGGER.error("Unable to read %s -- Error: %s", self._boot_set.get('path', ''), exc_type_msg(error))
raise BootImageMetaDataBadRead(error)

@property
Expand Down
8 changes: 4 additions & 4 deletions src/bos/operators/utils/clients/bos/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from urllib3.exceptions import MaxRetryError

from bos.common.tenant_utils import get_new_tenant_header
from bos.common.utils import PROTOCOL, requests_retry_session
from bos.common.utils import PROTOCOL, exc_type_msg, requests_retry_session

LOGGER = logging.getLogger('bos.operators.utils.clients.bos.base')

Expand All @@ -43,13 +43,13 @@ def wrap(*args, **kwargs):
result = func(*args, **kwargs)
return result
except (ConnectionError, MaxRetryError) as e:
LOGGER.error("Unable to connect to BOS: {}".format(e))
LOGGER.error("Unable to connect to BOS: %s", exc_type_msg(e))
raise e
except HTTPError as e:
LOGGER.error("Unexpected response from BOS: {}".format(e))
LOGGER.error("Unexpected response from BOS: %s", exc_type_msg(e))
raise e
except json.JSONDecodeError as e:
LOGGER.error("Non-JSON response from BOS: {}".format(e))
LOGGER.error("Non-JSON response from BOS: %s", exc_type_msg(e))
raise e

return wrap
Expand Down
8 changes: 4 additions & 4 deletions src/bos/operators/utils/clients/bos/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from requests.exceptions import HTTPError, ConnectionError
from urllib3.exceptions import MaxRetryError

from bos.common.utils import requests_retry_session
from bos.common.utils import exc_type_msg, requests_retry_session
from bos.operators.utils.clients.bos.base import BASE_ENDPOINT

LOGGER = logging.getLogger('bos.operators.utils.clients.bos.options')
Expand Down Expand Up @@ -56,11 +56,11 @@ def _get_options(self):
response.raise_for_status()
return json.loads(response.text)
except (ConnectionError, MaxRetryError) as e:
LOGGER.error("Unable to connect to BOS: {}".format(e))
LOGGER.error("Unable to connect to BOS: %s", exc_type_msg(e))
except HTTPError as e:
LOGGER.error("Unexpected response from BOS: {}".format(e))
LOGGER.error("Unexpected response from BOS: %s", exc_type_msg(e))
except json.JSONDecodeError as e:
LOGGER.error("Non-JSON response from BOS: {}".format(e))
LOGGER.error("Non-JSON response from BOS: %s", exc_type_msg(e))
return {}

def get_option(self, key, value_type, default):
Expand Down
4 changes: 2 additions & 2 deletions src/bos/operators/utils/clients/bss.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import logging
import json

from bos.common.utils import compact_response_text, requests_retry_session, PROTOCOL
from bos.common.utils import compact_response_text, exc_type_msg, requests_retry_session, PROTOCOL

LOGGER = logging.getLogger(__name__)
SERVICE_NAME = 'cray-bss'
Expand Down Expand Up @@ -82,5 +82,5 @@ def set_bss(node_set, kernel_params, kernel, initrd, session=None):
resp.raise_for_status()
return resp
except HTTPError as err:
LOGGER.error("%s" % err)
LOGGER.error(exc_type_msg(err))
raise
6 changes: 3 additions & 3 deletions src/bos/operators/utils/clients/cfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
import logging
from requests.exceptions import HTTPError, ConnectionError

from bos.common.utils import compact_response_text, requests_retry_session, PROTOCOL
from bos.common.utils import compact_response_text, exc_type_msg, requests_retry_session, PROTOCOL

SERVICE_NAME = 'cray-cfs-api'
BASE_ENDPOINT = "%s://%s/v3" % (PROTOCOL, SERVICE_NAME)
Expand Down Expand Up @@ -55,7 +55,7 @@ def get_components(session=None, **params):
try:
response.raise_for_status()
except HTTPError as err:
LOGGER.error("Failed getting nodes from CFS: %s", err)
LOGGER.error("Failed getting nodes from CFS: %s", exc_type_msg(err))
raise
response_json = response.json()
new_components = response_json["components"]
Expand All @@ -79,7 +79,7 @@ def patch_components(data, session=None):
try:
response.raise_for_status()
except HTTPError as err:
LOGGER.error("Failed asking CFS to configure nodes: %s", err)
LOGGER.error("Failed asking CFS to configure nodes: %s", exc_type_msg(err))
raise


Expand Down
16 changes: 8 additions & 8 deletions src/bos/operators/utils/clients/hsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from requests.exceptions import HTTPError, ConnectionError
from urllib3.exceptions import MaxRetryError

from bos.common.utils import compact_response_text, requests_retry_session, PROTOCOL
from bos.common.utils import compact_response_text, exc_type_msg, requests_retry_session, PROTOCOL

SERVICE_NAME = 'cray-smd'
BASE_ENDPOINT = "%s://%s/hsm/v2/" % (PROTOCOL, SERVICE_NAME)
Expand Down Expand Up @@ -59,14 +59,14 @@ def read_all_node_xnames():
try:
response = session.get(endpoint)
except ConnectionError as ce:
LOGGER.error("Unable to contact HSM service: %s", ce)
LOGGER.error("Unable to contact HSM service: %s", exc_type_msg(ce))
raise HWStateManagerException(ce) from ce
LOGGER.debug("Response status code=%d, reason=%s, body=%s", response.status_code,
response.reason, compact_response_text(response.text))
try:
response.raise_for_status()
except (HTTPError, MaxRetryError) as hpe:
LOGGER.error("Unexpected response from HSM: %s", response)
LOGGER.error("Unexpected response from HSM: %s (%s)", response, exc_type_msg(hpe))
raise HWStateManagerException(hpe) from hpe
try:
json_body = json.loads(response.text)
Expand All @@ -77,7 +77,7 @@ def read_all_node_xnames():
return set([component['ID'] for component in json_body['Components']
if component.get('Type', None) == 'Node'])
except KeyError as ke:
LOGGER.error("Unexpected API response from HSM")
LOGGER.error("Unexpected API response from HSM: %s", exc_type_msg(ke))
raise HWStateManagerException(ke) from ke


Expand Down Expand Up @@ -135,13 +135,13 @@ def get_components(node_list, enabled=None) -> dict[str,list[dict]]:
response.raise_for_status()
components = json.loads(response.text)
except (ConnectionError, MaxRetryError) as e:
LOGGER.error("Unable to connect to HSM: {}".format(e))
LOGGER.error("Unable to connect to HSM: %s", exc_type_msg(e))
raise e
except HTTPError as e:
LOGGER.error("Unexpected response from HSM: {}".format(e))
LOGGER.error("Unexpected response from HSM: %s", exc_type_msg(e))
raise e
except json.JSONDecodeError as e:
LOGGER.error("Non-JSON response from HSM: {}".format(e))
LOGGER.error("Non-JSON response from HSM: %s", exc_type_msg(e))
raise e
return components

Expand Down Expand Up @@ -230,7 +230,7 @@ def get(self, path, params=None):
response.reason, compact_response_text(response.text))
response.raise_for_status()
except HTTPError as err:
LOGGER.error("Failed to get '{}': {}".format(url, err))
LOGGER.error("Failed to get '%s': %s", url, exc_type_msg(err))
raise
try:
return response.json()
Expand Down
4 changes: 2 additions & 2 deletions src/bos/operators/utils/clients/pcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,8 @@ def _transition_create(xnames, operation, task_deadline_minutes=None, deputy_key
session = session or requests_retry_session()
try:
assert operation in set(['On', 'Off', 'Soft-Off', 'Soft-Restart', 'Hard-Restart', 'Init', 'Force-Off'])
except AssertionError:
raise PowerControlSyntaxException("Operation '%s' is not supported or implemented." %(operation))
except AssertionError as err:
raise PowerControlSyntaxException("Operation '%s' is not supported or implemented." %(operation)) from err
params = {'location': [], 'operation': operation}
if task_deadline_minutes:
params['taskDeadlineMinutes'] = int(task_deadline_minutes)
Expand Down
Loading

0 comments on commit d578ff5

Please sign in to comment.