From 4a3268687a6de42e465b1378035035090f33771c Mon Sep 17 00:00:00 2001
From: Ryan Haasken
Date: Tue, 23 Jul 2024 17:23:23 -0500
Subject: [PATCH 1/2] CRAYSAT-1878: Remove automatic cronjob recreation from
 `bootsys`

Remove the step that automatically checks for and re-creates stuck
Kubernetes CronJobs from the `platform-services` stage of `sat bootsys
boot`. This should no longer be necessary starting in Kubernetes 1.21,
which made the new CronJobControllerV2 the default.

In addition, improve the logic of the HMSDiscoveryScheduledWaiter so
that it more reliably detects when an `hms-discovery` Job has been
scheduled for the CronJob. Pass in an explicit `start_time`, so that we
can look for any jobs created for the CronJob after it is re-enabled.
This ensures we won't miss the first one, which could be scheduled
between when we set `suspend=False` on the CronJob and when we create
the `HMSDiscoveryScheduledWaiter`.

Test Description:
Tested on rocket as follows:

* Suspended the `hms-discovery` CronJob
* Ran `sat bootsys boot --stage cabinet-power`
* Verified that it correctly identified when the CronJob was scheduled
---
 CHANGELOG.md                     | 10 +++++++++-
 sat/cli/bootsys/cabinet_power.py | 10 +++++++++-
 sat/cli/bootsys/platform.py      |  1 -
 sat/cronjob.py                   | 15 ++++++++++++++-
 sat/hms_discovery.py             | 26 ++++++++++++++++++++------
 tests/test_hms_discovery.py      | 28 ++++++++++++++++++++++++----
 6 files changed, 76 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fd1ec282..7c178e39 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -49,11 +49,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - If containers fail to stop, automate the procedure of trying to stop them again in the
   `platform-services` stage.
 - Adding PG_NOT_DEEP_SCRUBBED in allowable checks excluded during ceph health check as it is
-  ignorable. 
+  ignorable.
 - Automate the procedure of setting next boot device to disk before the management nodes are
   powered off as part of the full-system shutdown.
 - Adding a ceph health check bypass prompt to take input from user and act accordingly.
   unfreezing of ceph would be done, only the wait period will be skipped if user wishes to.
+- Improved logic in the `cabinet-power` stage of `sat bootsys boot` to more
+  reliably determine when a Job has been scheduled for the `hms-discovery`
+  Kubernetes CronJob after it has been resumed.
+- Removed unreliable step from the `platform-services` stage of `sat bootsys
+  boot` that checked for Kubernetes CronJobs that were not being scheduled due
+  to missing too many start times. The problem this step attempted to solve
+  should no longer occur with the CronJobControllerV2 being the default
+  starting in Kubernetes 1.21.
 
 ### Fixed
 - Updated `sat bootsys` to increase the default management NCN shutdown timeout
diff --git a/sat/cli/bootsys/cabinet_power.py b/sat/cli/bootsys/cabinet_power.py
index 9d5c1308..50e71535 100644
--- a/sat/cli/bootsys/cabinet_power.py
+++ b/sat/cli/bootsys/cabinet_power.py
@@ -24,8 +24,11 @@
 """
 Powers on and off liquid-cooled compute cabinets.
""" +from datetime import datetime import logging +from dateutil.tz import tzutc + from sat.apiclient import APIError, HSMClient from sat.apiclient.pcs import PCSClient from sat.cli.bootsys.power import PCSPowerWaiter @@ -188,6 +191,11 @@ def do_cabinets_power_on(args): None """ LOGGER.info(f'Resuming {HMSDiscoveryCronJob.FULL_NAME}.') + # Save the time immediately before resuming hms-discovery CronJob, so we can + # look for any jobs scheduled after this time. Zero out microseconds because + # Kubernetes may immediately create a job for the cronjob, and it doesn't + # include microseconds in the job's creation_timestamp. + hms_discovery_resumed_time = datetime.now(tz=tzutc()).replace(microsecond=0) try: HMSDiscoveryCronJob().set_suspend_status(False) except HMSDiscoveryError as err: @@ -196,7 +204,7 @@ def do_cabinets_power_on(args): LOGGER.info(f'Waiting for {HMSDiscoveryCronJob.FULL_NAME} to be scheduled.') try: - hms_discovery_waiter = HMSDiscoveryScheduledWaiter() + hms_discovery_waiter = HMSDiscoveryScheduledWaiter(start_time=hms_discovery_resumed_time) except HMSDiscoveryError as err: LOGGER.error(f'Failed to start waiting on HMS discovery job being ' f'rescheduled after resume: {err}') diff --git a/sat/cli/bootsys/platform.py b/sat/cli/bootsys/platform.py index 035f3bfb..88efdc35 100644 --- a/sat/cli/bootsys/platform.py +++ b/sat/cli/bootsys/platform.py @@ -612,7 +612,6 @@ def do_etcd_start(ncn_groups): PlatformServicesStep('Ensure etcd is running and enabled on all Kubernetes manager NCNs.', do_etcd_start), PlatformServicesStep('Start and enable kubelet on all Kubernetes NCNs.', do_kubelet_start), - PlatformServicesStep('Recreate cron jobs that have become stuck', do_recreate_cronjobs), ], # The ordered steps to stop platform services 'stop': [ diff --git a/sat/cronjob.py b/sat/cronjob.py index 6a6570b6..0b965eb0 100644 --- a/sat/cronjob.py +++ b/sat/cronjob.py @@ -1,7 +1,7 @@ # # MIT License # -# (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP +# (C) Copyright 2020-2024 Hewlett Packard Enterprise Development LP # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -96,6 +96,19 @@ def recreate_cronjob(batch_api, cronjob): def recreate_namespaced_stuck_cronjobs(batch_api, namespace): """Find cronjobs that are not being scheduled and recreate them. + This should no longer be necessary since Kubernetes 1.21.0, which made + the new CronJobControllerV2 the default. We retain this function in case + it is still necessary to check for cronjobs that need to be re-created + due to unforeseen circumstances. + + The CronJobControllerV2 handles CronJobs that have missed too many start + times differently than the CronJobController. When a CronJob missed too + many start times under the original CronJobController, it would issue a + "FailedNeedsStart" warning event and then not schedule a job for the + CronJob. On the other hand, the CronJobControllerV2 issues a + "TooManyMissedTimes" warning event and then does still schedule a Job for + the CronJob. 
+
     Args:
         batch_api (kubernetes.client.BatchV1Api): the Kubernetes API client
         namespace (str): the namespace to search for cronjobs
diff --git a/sat/hms_discovery.py b/sat/hms_discovery.py
index 883f226f..1e79953c 100644
--- a/sat/hms_discovery.py
+++ b/sat/hms_discovery.py
@@ -82,6 +82,13 @@ def data(self):
             raise HMSDiscoveryError(f'Failed to get data for '
                                     f'{self.FULL_NAME}: {err}') from err
 
+    @property
+    def schedule_interval(self):
+        """int: the number of seconds between scheduled runs of the cronjob"""
+        ci = croniter(self.data.spec.schedule, start_time=datetime.now(tz=tzutc()))
+        next_time = ci.get_next()
+        return int(ci.get_next() - next_time)
+
     @property
     def job_label_selector(self):
         """str: the label selector for querying jobs created by this cronjob"""
@@ -201,7 +208,8 @@ def recreate(self):
 class HMSDiscoveryScheduledWaiter(Waiter):
     """Waiter for HMS discovery cronjob to be scheduled by k8s."""
 
-    def __init__(self, poll_interval=5, grace_period=180, retries=1):
+    def __init__(self, poll_interval=5, grace_period=180, retries=1,
+                 start_time=None):
         """Create a new HMSDiscoveryScheduledWaiter.
 
         Timeout is computed automatically based on the latest possible time
@@ -211,6 +219,9 @@ def __init__(self, poll_interval=5, grace_period=180, retries=1):
             poll_interval (int): see `Waiter.__init__`
             grace_period (int): the number of seconds after the expected next
                 scheduled time to wait for the job to be scheduled.
+            retries (int): see `Waiter.__init__`
+            start_time (datetime or None): the starting UTC time after which we
+                should look for jobs to be scheduled.
 
         Raises:
             HMSDiscoveryError: if there is an error querying the k8s API for the
@@ -218,10 +229,13 @@ def __init__(self, poll_interval=5, grace_period=180, retries=1):
         """
         self.hd_cron_job = HMSDiscoveryCronJob()
 
-        # This call can raise HMSDiscoveryError
-        next_time = self.hd_cron_job.get_latest_next_schedule_time()
-        self.start_time = datetime.now(tz=tzutc())
-        timeout = (next_time - self.start_time).seconds + grace_period
+        if start_time:
+            self.start_time = start_time
+        else:
+            self.start_time = datetime.now(tz=tzutc())
+
+        # Wait for one schedule interval plus the grace period
+        timeout = self.hd_cron_job.schedule_interval + grace_period
 
         super().__init__(timeout, poll_interval, retries)
 
@@ -242,7 +256,7 @@ def has_completed(self):
         try:
             jobs = self.hd_cron_job.get_jobs()
             return any(
-                job.metadata.creation_timestamp > self.start_time
+                job.metadata.creation_timestamp >= self.start_time
                 for job in jobs
             )
         except ApiException as err:
diff --git a/tests/test_hms_discovery.py b/tests/test_hms_discovery.py
index 6a203131..1aeb6f31 100644
--- a/tests/test_hms_discovery.py
+++ b/tests/test_hms_discovery.py
@@ -1,7 +1,7 @@
 #
 # MIT License
 #
-# (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP
+# (C) Copyright 2020-2024 Hewlett Packard Enterprise Development LP
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -35,7 +35,6 @@
 from sat.hms_discovery import (HMSDiscoveryCronJob, HMSDiscoveryError,
                                HMSDiscoveryScheduledWaiter,
                                HMSDiscoverySuspendedWaiter)
-from sat.waiting import WaitingFailure
 
 
 class TestHMSDiscoveryCronJob(unittest.TestCase):
@@ -87,6 +86,21 @@ def test_data_api_exception(self):
         with self.assertRaisesRegex(HMSDiscoveryError, 'Failed to get data'):
             _ = self.hdcj.data
 
+    def test_schedule_interval(self):
+        """Test that the schedule_interval property returns the correct intervals for various schedules"""
+        schedules_and_intervals = [
+            ('*/3 * * * *', 180),
+            ('*/15 * * * *', 900),
+            ('0 */3 * * *', 10800),
+            ('5 0 * * *', 86400),
+        ]
+
+        for schedule, interval in schedules_and_intervals:
+            mock_cron_job = Mock()
+            mock_cron_job.spec.schedule = schedule
+            self.mock_batch_api.read_namespaced_cron_job.return_value = mock_cron_job
+            self.assertEqual(interval, self.hdcj.schedule_interval)
+
     def test_get_last_schedule_time(self):
         """Test get last_schedule_time method."""
         self.assertEqual(
@@ -200,7 +214,7 @@ def setUp(self):
         self.next_sched_time = datetime(2020, 10, 31, 8, 3, 0, tzinfo=tzutc())
 
         self.mock_hd_cron_job = patch('sat.hms_discovery.HMSDiscoveryCronJob').start().return_value
-        self.mock_hd_cron_job.get_latest_next_schedule_time.return_value = self.next_sched_time
+        self.mock_hd_cron_job.schedule_interval = 180
 
         self.hd_waiter = HMSDiscoveryScheduledWaiter(self.poll_interval, self.grace_period)
 
@@ -210,12 +224,18 @@ def tearDown(self):
 
     def test_init(self):
         """Test creation of an HMSDiscoveryScheduledWaiter."""
-        self.assertEqual((self.next_sched_time - self.start_time).seconds + self.grace_period,
+        self.assertEqual(self.mock_hd_cron_job.schedule_interval + self.grace_period,
                          self.hd_waiter.timeout)
         self.assertEqual(self.poll_interval, self.hd_waiter.poll_interval)
         self.assertEqual(self.start_time, self.hd_waiter.start_time)
         self.assertEqual(self.mock_hd_cron_job, self.hd_waiter.hd_cron_job)
 
+    def test_init_start_time(self):
+        """Test creation of an HMSDiscoveryScheduledWaiter with overridden start_time"""
+        start_time = datetime(2018, 6, 7, 8, 45, 0, tzinfo=tzutc())
+        hd_waiter = HMSDiscoveryScheduledWaiter(start_time=start_time)
+        self.assertEqual(start_time, hd_waiter.start_time)
+
     def test_condition_name(self):
         """Test the condition_name method of the HMSDiscoveryScheduledWaiter."""
         self.assertEqual('HMS Discovery Scheduled', self.hd_waiter.condition_name())

From 97368b2d8b40e055c5a734b3880d0c84b0f76720 Mon Sep 17 00:00:00 2001
From: Ryan Haasken
Date: Tue, 23 Jul 2024 17:30:12 -0500
Subject: [PATCH 2/2] CRAYSAT-1878: Remove unnecessary f-string

PyCharm was complaining about this. Make it happy.
---
 sat/cli/bootsys/cabinet_power.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sat/cli/bootsys/cabinet_power.py b/sat/cli/bootsys/cabinet_power.py
index 50e71535..5ff3ead9 100644
--- a/sat/cli/bootsys/cabinet_power.py
+++ b/sat/cli/bootsys/cabinet_power.py
@@ -69,7 +69,7 @@ def do_air_cooled_cabinets_power_off(args):
     node_xnames = list(set(river_nodes) - set(river_mgmt_nodes))
 
     if not node_xnames:
-        LOGGER.info(f'No non-management nodes in air-cooled cabinets to power off.')
+        LOGGER.info('No non-management nodes in air-cooled cabinets to power off.')
         return
 
     LOGGER.info(f'Powering off {len(node_xnames)} non-management nodes in air-cooled cabinets.')
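
Note on the new `schedule_interval` property: its croniter-based interval
computation can be sanity-checked standalone. Below is a minimal sketch,
assuming `croniter` and `python-dateutil` are available (the property added in
patch 1 uses both, so they are existing SAT dependencies). The helper name
`interval_seconds` is hypothetical, written only for illustration:

    from datetime import datetime

    from croniter import croniter
    from dateutil.tz import tzutc


    def interval_seconds(schedule):
        # Hypothetical standalone helper mirroring schedule_interval.
        # croniter.get_next() returns a float POSIX timestamp by default,
        # so the difference between two consecutive runs is the number of
        # seconds between scheduled runs.
        ci = croniter(schedule, start_time=datetime.now(tz=tzutc()))
        first_run = ci.get_next()
        return int(ci.get_next() - first_run)


    # These mirror the cases covered by test_schedule_interval above.
    assert interval_seconds('*/3 * * * *') == 180
    assert interval_seconds('*/15 * * * *') == 900
    assert interval_seconds('0 */3 * * *') == 10800
    assert interval_seconds('5 0 * * *') == 86400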
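
Note on the race the new `start_time` argument closes: a Job can be created in
the instant between setting `suspend=False` and constructing the waiter, so
the waiter must compare against a timestamp captured before the resume,
truncated to whole seconds, and use `>=` rather than `>`. A minimal sketch of
that check with the raw Kubernetes client follows; the `services` namespace
and the empty label selector are assumptions made for illustration (the real
code filters jobs with `HMSDiscoveryCronJob.job_label_selector`):

    from datetime import datetime

    from dateutil.tz import tzutc
    from kubernetes import client, config


    def job_scheduled_since(batch_api, start_time, namespace, label_selector):
        jobs = batch_api.list_namespaced_job(namespace, label_selector=label_selector)
        # '>=' rather than '>', so a Job created in the same second the
        # CronJob was resumed is not missed.
        return any(job.metadata.creation_timestamp >= start_time
                   for job in jobs.items)


    config.load_kube_config()
    batch_api = client.BatchV1Api()

    # Capture the time before resuming the CronJob. Zero out microseconds
    # because Kubernetes job creation timestamps have only second precision,
    # so an untruncated "now" could compare as later than a Job created in
    # the same second.
    resumed_time = datetime.now(tz=tzutc()).replace(microsecond=0)
    # ... resume the CronJob here by patching spec.suspend to False ...
    # The 'services' namespace and empty selector are illustrative assumptions.
    print(job_scheduled_since(batch_api, resumed_time, 'services', ''))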