
Commit

Merge pull request #244 from Cray-HPE/CRAYSAT-1878-remove-automatic-cronjob-recreation

CRAYSAT-1878: Remove automatic cronjob recreation from `bootsys`
haasken-hpe authored Jul 25, 2024
2 parents 3f9a619 + 97368b2 commit 5c2e7d6
Showing 6 changed files with 77 additions and 15 deletions.
10 changes: 9 additions & 1 deletion CHANGELOG.md
@@ -49,11 +49,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- If containers fail to stop, automate the procedure of trying to stop them again
in the `platform-services` stage.
- Adding PG_NOT_DEEP_SCRUBBED in allowable checks excluded during ceph health check as it is
ignorable.
ignorable.
- Automate the procedure of setting next boot device to disk before the management nodes are
powered off as part of the full-system shutdown.
- Adding a ceph health check bypass prompt to take input from the user and act accordingly;
  unfreezing of ceph will still be done, only the wait period will be skipped if the user wishes.
- Improved logic in `cabinet-power` stage of `sat bootsys boot` to more reliably
determine when a Job has been scheduled for the `hms-discovery` Kubernetes
CronJob after it has been resumed.
- Remove unreliable step from the `platform-services` stage of `sat bootsys
boot` that checked for Kubernetes CronJobs that were not being scheduled due
to missing too many start times. The problem this attempted to solve should no
longer occur with the CronJobControllerV2 being the default starting in
Kubernetes 1.21.

### Fixed
- Updated `sat bootsys` to increase the default management NCN shutdown timeout
12 changes: 10 additions & 2 deletions sat/cli/bootsys/cabinet_power.py
@@ -24,8 +24,11 @@
"""
Powers on and off liquid-cooled compute cabinets.
"""
from datetime import datetime
import logging

from dateutil.tz import tzutc

from sat.apiclient import APIError, HSMClient
from sat.apiclient.pcs import PCSClient
from sat.cli.bootsys.power import PCSPowerWaiter
@@ -66,7 +69,7 @@ def do_air_cooled_cabinets_power_off(args):
node_xnames = list(set(river_nodes) - set(river_mgmt_nodes))

if not node_xnames:
LOGGER.info(f'No non-management nodes in air-cooled cabinets to power off.')
LOGGER.info('No non-management nodes in air-cooled cabinets to power off.')
return

LOGGER.info(f'Powering off {len(node_xnames)} non-management nodes in air-cooled cabinets.')
@@ -188,6 +191,11 @@ def do_cabinets_power_on(args):
None
"""
LOGGER.info(f'Resuming {HMSDiscoveryCronJob.FULL_NAME}.')
# Save the time immediately before resuming hms-discovery CronJob, so we can
# look for any jobs scheduled after this time. Zero out microseconds because
# Kubernetes may immediately create a job for the cronjob, and it doesn't
# include microseconds in the job's creation_timestamp.
hms_discovery_resumed_time = datetime.now(tz=tzutc()).replace(microsecond=0)
try:
HMSDiscoveryCronJob().set_suspend_status(False)
except HMSDiscoveryError as err:
@@ -196,7 +204,7 @@

LOGGER.info(f'Waiting for {HMSDiscoveryCronJob.FULL_NAME} to be scheduled.')
try:
hms_discovery_waiter = HMSDiscoveryScheduledWaiter()
hms_discovery_waiter = HMSDiscoveryScheduledWaiter(start_time=hms_discovery_resumed_time)
except HMSDiscoveryError as err:
LOGGER.error(f'Failed to start waiting on HMS discovery job being '
f'rescheduled after resume: {err}')
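A minimal sketch (with made-up timestamps, not part of the commit) of why the resume time has its microseconds zeroed before being passed to the waiter: Kubernetes records a Job's `creation_timestamp` at whole-second precision, so a Job created in the same second as the resume call could otherwise compare as earlier than the recorded start time and be missed.

```python
from datetime import datetime

from dateutil.tz import tzutc

# Illustrative timestamps only. A Job created in the same second as the
# resume call has its creation_timestamp truncated to whole seconds, so it
# would look "older" than a start time that still carries microseconds.
resume_time = datetime(2024, 7, 25, 12, 0, 5, 431000, tzinfo=tzutc())
job_creation_timestamp = datetime(2024, 7, 25, 12, 0, 5, tzinfo=tzutc())

print(job_creation_timestamp >= resume_time)                         # False: job would be missed
print(job_creation_timestamp >= resume_time.replace(microsecond=0))  # True: job is detected
```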
1 change: 0 additions & 1 deletion sat/cli/bootsys/platform.py
@@ -612,7 +612,6 @@ def do_etcd_start(ncn_groups):
PlatformServicesStep('Ensure etcd is running and enabled on all Kubernetes manager NCNs.',
do_etcd_start),
PlatformServicesStep('Start and enable kubelet on all Kubernetes NCNs.', do_kubelet_start),
PlatformServicesStep('Recreate cron jobs that have become stuck', do_recreate_cronjobs),
],
# The ordered steps to stop platform services
'stop': [
15 changes: 14 additions & 1 deletion sat/cronjob.py
@@ -1,7 +1,7 @@
#
# MIT License
#
# (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP
# (C) Copyright 2020-2024 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
@@ -96,6 +96,19 @@ def recreate_cronjob(batch_api, cronjob):
def recreate_namespaced_stuck_cronjobs(batch_api, namespace):
"""Find cronjobs that are not being scheduled and recreate them.
This should no longer be necessary since Kubernetes 1.21.0, which made
the new CronJobControllerV2 the default. We retain this function in case
it is still necessary to check for cronjobs that need to be re-created
due to unforeseen circumstances.
The CronJobControllerV2 handles CronJobs that have missed too many start
times differently than the CronJobController. When a CronJob missed too
many start times under the original CronJobController, it would issue a
"FailedNeedsStart" warning event and then not schedule a job for the
CronJob. On the other hand, the CronJobControllerV2 issues a
"TooManyMissedTimes" warning event and then does still schedule a Job for
the CronJob.
Args:
batch_api (kubernetes.client.BatchV1Api): the Kubernetes API client
namespace (str): the namespace to search for cronjobs
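For context on the warning events mentioned in the docstring above, they can be inspected directly through the Kubernetes events API. The sketch below is illustrative only and not part of this commit; the function name and the `services` namespace are assumptions.

```python
from kubernetes import client, config


def find_missed_start_events(namespace='services'):
    """Return warning events indicating a CronJob missed start times.

    Illustrative sketch only: looks for the old controller's
    "FailedNeedsStart" reason as well as the V2 controller's
    "TooManyMissedTimes" reason.
    """
    config.load_kube_config()
    core_api = client.CoreV1Api()
    return [
        event for event in core_api.list_namespaced_event(namespace).items
        if event.type == 'Warning'
        and event.reason in ('FailedNeedsStart', 'TooManyMissedTimes')
    ]
```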
26 changes: 20 additions & 6 deletions sat/hms_discovery.py
@@ -82,6 +82,13 @@ def data(self):
raise HMSDiscoveryError(f'Failed to get data for '
f'{self.FULL_NAME}: {err}') from err

@property
def schedule_interval(self):
"""int: the number of seconds between scheduled runs of the cronjob"""
ci = croniter(self.data.spec.schedule, start_time=datetime.now(tz=tzutc()))
next_time = ci.get_next()
return int(ci.get_next() - next_time)

@property
def job_label_selector(self):
"""str: the label selector for querying jobs created by this cronjob"""
@@ -201,7 +208,8 @@ def recreate(self):
class HMSDiscoveryScheduledWaiter(Waiter):
"""Waiter for HMS discovery cronjob to be scheduled by k8s."""

def __init__(self, poll_interval=5, grace_period=180, retries=1):
def __init__(self, poll_interval=5, grace_period=180, retries=1,
start_time=None):
"""Create a new HMSDiscoveryScheduledWaiter.
Timeout is computed automatically based on the latest possible time
@@ -211,17 +219,23 @@ def __init__(self, poll_interval=5, grace_period=180, retries=1):
poll_interval (int): see `Waiter.__init__`
grace_period (int): the number of seconds after the expected next
scheduled time to wait for the job to be scheduled.
retries (int): see `Waiter.__init__`
start_time (datetime or None): the starting UTC time after which we
should look for jobs to be scheduled.
Raises:
HMSDiscoveryError: if there is an error querying the k8s API for the
schedule of the cronjob.
"""
self.hd_cron_job = HMSDiscoveryCronJob()

# This call can raise HMSDiscoveryError
next_time = self.hd_cron_job.get_latest_next_schedule_time()
self.start_time = datetime.now(tz=tzutc())
timeout = (next_time - self.start_time).seconds + grace_period
if start_time:
self.start_time = start_time
else:
self.start_time = datetime.now(tz=tzutc())

# Wait for one schedule interval plus the grace period
timeout = self.hd_cron_job.schedule_interval + grace_period

super().__init__(timeout, poll_interval, retries)

@@ -242,7 +256,7 @@ def has_completed(self):
try:
jobs = self.hd_cron_job.get_jobs()
return any(
job.metadata.creation_timestamp > self.start_time
job.metadata.creation_timestamp >= self.start_time
for job in jobs
)
except ApiException as err:
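A small standalone sketch of the interval computation used by the new `schedule_interval` property: croniter yields consecutive fire times for a cron expression, and the difference between the next two gives the interval in seconds. The `cron_interval_seconds` wrapper name is just for illustration.

```python
from datetime import datetime

from croniter import croniter
from dateutil.tz import tzutc


def cron_interval_seconds(schedule):
    """Seconds between two consecutive runs of a cron schedule."""
    ci = croniter(schedule, start_time=datetime.now(tz=tzutc()))
    next_time = ci.get_next()        # float UNIX timestamp of the next run
    return int(ci.get_next() - next_time)


print(cron_interval_seconds('*/3 * * * *'))   # 180
print(cron_interval_seconds('0 */3 * * *'))   # 10800
```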
28 changes: 24 additions & 4 deletions tests/test_hms_discovery.py
@@ -1,7 +1,7 @@
#
# MIT License
#
# (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP
# (C) Copyright 2020-2024 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
@@ -35,7 +35,6 @@
from sat.hms_discovery import (HMSDiscoveryCronJob, HMSDiscoveryError,
HMSDiscoveryScheduledWaiter,
HMSDiscoverySuspendedWaiter)
from sat.waiting import WaitingFailure


class TestHMSDiscoveryCronJob(unittest.TestCase):
@@ -87,6 +86,21 @@ def test_data_api_exception(self):
with self.assertRaisesRegex(HMSDiscoveryError, 'Failed to get data'):
_ = self.hdcj.data

def test_schedule_interval(self):
"""Test the schedule_interval property returns the correct intervals for various schedules"""
schedules_and_intervals = [
('*/3 * * * *', 180),
('*/15 * * * *', 900),
('0 */3 * * *', 10800),
('5 0 * * *', 86400),
]

for schedule, interval in schedules_and_intervals:
mock_cron_job = Mock()
mock_cron_job.spec.schedule = schedule
self.mock_batch_api.read_namespaced_cron_job.return_value = mock_cron_job
self.assertEqual(interval, self.hdcj.schedule_interval)

def test_get_last_schedule_time(self):
"""Test get last_schedule_time method."""
self.assertEqual(
@@ -200,7 +214,7 @@ def setUp(self):

self.next_sched_time = datetime(2020, 10, 31, 8, 3, 0, tzinfo=tzutc())
self.mock_hd_cron_job = patch('sat.hms_discovery.HMSDiscoveryCronJob').start().return_value
self.mock_hd_cron_job.get_latest_next_schedule_time.return_value = self.next_sched_time
self.mock_hd_cron_job.schedule_interval = 180

self.hd_waiter = HMSDiscoveryScheduledWaiter(self.poll_interval, self.grace_period)

@@ -210,12 +224,18 @@ def tearDown(self):

def test_init(self):
"""Test creation of an HMSDiscoveryScheduledWaiter."""
self.assertEqual((self.next_sched_time - self.start_time).seconds + self.grace_period,
self.assertEqual(self.mock_hd_cron_job.schedule_interval + self.grace_period,
self.hd_waiter.timeout)
self.assertEqual(self.poll_interval, self.hd_waiter.poll_interval)
self.assertEqual(self.start_time, self.hd_waiter.start_time)
self.assertEqual(self.mock_hd_cron_job, self.hd_waiter.hd_cron_job)

def test_init_start_time(self):
"""Test creation of an HMSDiscoveryScheduledWaiter with overridden start_time"""
start_time = datetime(2018, 6, 7, 8, 45, 0, tzinfo=tzutc())
hd_waiter = HMSDiscoveryScheduledWaiter(start_time=start_time)
self.assertEqual(start_time, hd_waiter.start_time)

def test_condition_name(self):
"""Test the condition_name method of the HMSDiscoveryScheduledWaiter."""
self.assertEqual('HMS Discovery Scheduled', self.hd_waiter.condition_name())