
Commit

Merge pull request #244 from Cray-HPE/CRAYSAT-1878-remove-automatic-cronjob-recreation

CRAYSAT-1878: Remove automatic cronjob recreation from `bootsys`
haasken-hpe authored Jul 25, 2024
2 parents 3f9a619 + 97368b2 commit 5c2e7d6
Showing 6 changed files with 77 additions and 15 deletions.
10 changes: 9 additions & 1 deletion CHANGELOG.md
@@ -49,11 +49,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- If containers fail to stop, automate the procedure of trying to stop them again
in the `platform-services` stage.
- Adding PG_NOT_DEEP_SCRUBBED in allowable checks excluded during ceph health check as it is
ignorable.
ignorable.
- Automate the procedure of setting next boot device to disk before the management nodes are
powered off as part of the full-system shutdown.
- Adding a ceph health check bypass prompt to take input from the user and act accordingly;
  unfreezing of ceph will still be done, only the wait period will be skipped if the user wishes.
- Improved logic in `cabinet-power` stage of `sat bootsys boot` to more reliably
determine when a Job has been scheduled for the `hms-discovery` Kubernetes
CronJob after it has been resumed.
- Remove unreliable step from the `platform-services` stage of `sat bootsys
boot` that checked for Kubernetes CronJobs that were not being scheduled due
to missing too many start times. The problem this attempted to solve should no
longer occur with the CronJobControllerV2 being the default starting in
Kubernetes 1.21.

### Fixed
- Updated `sat bootsys` to increase the default management NCN shutdown timeout
12 changes: 10 additions & 2 deletions sat/cli/bootsys/cabinet_power.py
@@ -24,8 +24,11 @@
"""
Powers on and off liquid-cooled compute cabinets.
"""
from datetime import datetime
import logging

from dateutil.tz import tzutc

from sat.apiclient import APIError, HSMClient
from sat.apiclient.pcs import PCSClient
from sat.cli.bootsys.power import PCSPowerWaiter
@@ -66,7 +69,7 @@ def do_air_cooled_cabinets_power_off(args):
node_xnames = list(set(river_nodes) - set(river_mgmt_nodes))

if not node_xnames:
LOGGER.info(f'No non-management nodes in air-cooled cabinets to power off.')
LOGGER.info('No non-management nodes in air-cooled cabinets to power off.')
return

LOGGER.info(f'Powering off {len(node_xnames)} non-management nodes in air-cooled cabinets.')
@@ -188,6 +191,11 @@ def do_cabinets_power_on(args):
None
"""
LOGGER.info(f'Resuming {HMSDiscoveryCronJob.FULL_NAME}.')
# Save the time immediately before resuming hms-discovery CronJob, so we can
# look for any jobs scheduled after this time. Zero out microseconds because
# Kubernetes may immediately create a job for the cronjob, and it doesn't
# include microseconds in the job's creation_timestamp.
hms_discovery_resumed_time = datetime.now(tz=tzutc()).replace(microsecond=0)
try:
HMSDiscoveryCronJob().set_suspend_status(False)
except HMSDiscoveryError as err:
@@ -196,7 +204,7 @@

LOGGER.info(f'Waiting for {HMSDiscoveryCronJob.FULL_NAME} to be scheduled.')
try:
hms_discovery_waiter = HMSDiscoveryScheduledWaiter()
hms_discovery_waiter = HMSDiscoveryScheduledWaiter(start_time=hms_discovery_resumed_time)
except HMSDiscoveryError as err:
LOGGER.error(f'Failed to start waiting on HMS discovery job being '
f'rescheduled after resume: {err}')
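A minimal sketch (with made-up timestamps, not part of the commit) of why the resume time has its microseconds zeroed before being passed to the waiter: Kubernetes records a Job's `creation_timestamp` at whole-second precision, so a Job created in the same second as the resume call could otherwise compare as earlier than the recorded start time and be missed.

```python
from datetime import datetime

from dateutil.tz import tzutc

# Illustrative timestamps only. A Job created in the same second as the
# resume call has its creation_timestamp truncated to whole seconds, so it
# would look "older" than a start time that still carries microseconds.
resume_time = datetime(2024, 7, 25, 12, 0, 5, 431000, tzinfo=tzutc())
job_creation_timestamp = datetime(2024, 7, 25, 12, 0, 5, tzinfo=tzutc())

print(job_creation_timestamp >= resume_time)                         # False: job would be missed
print(job_creation_timestamp >= resume_time.replace(microsecond=0))  # True: job is detected
```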
1 change: 0 additions & 1 deletion sat/cli/bootsys/platform.py
@@ -612,7 +612,6 @@ def do_etcd_start(ncn_groups):
PlatformServicesStep('Ensure etcd is running and enabled on all Kubernetes manager NCNs.',
do_etcd_start),
PlatformServicesStep('Start and enable kubelet on all Kubernetes NCNs.', do_kubelet_start),
PlatformServicesStep('Recreate cron jobs that have become stuck', do_recreate_cronjobs),
],
# The ordered steps to stop platform services
'stop': [
15 changes: 14 additions & 1 deletion sat/cronjob.py
@@ -1,7 +1,7 @@
#
# MIT License
#
# (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP
# (C) Copyright 2020-2024 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
@@ -96,6 +96,19 @@ def recreate_cronjob(batch_api, cronjob):
def recreate_namespaced_stuck_cronjobs(batch_api, namespace):
"""Find cronjobs that are not being scheduled and recreate them.
This should no longer be necessary since Kubernetes 1.21.0, which made
the new CronJobControllerV2 the default. We retain this function in case
it is still necessary to check for cronjobs that need to be re-created
due to unforeseen circumstances.
The CronJobControllerV2 handles CronJobs that have missed too many start
times differently than the CronJobController. When a CronJob missed too
many start times under the original CronJobController, it would issue a
"FailedNeedsStart" warning event and then not schedule a job for the
CronJob. On the other hand, the CronJobControllerV2 issues a
"TooManyMissedTimes" warning event and then does still schedule a Job for
the CronJob.
Args:
batch_api (kubernetes.client.BatchV1Api): the Kubernetes API client
namespace (str): the namespace to search for cronjobs
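For context on the warning events mentioned in the docstring above, they can be inspected directly through the Kubernetes events API. The sketch below is illustrative only and not part of this commit; the function name and the `services` namespace are assumptions.

```python
from kubernetes import client, config


def find_missed_start_events(namespace='services'):
    """Return warning events indicating a CronJob missed start times.

    Illustrative sketch only: looks for the old controller's
    "FailedNeedsStart" reason as well as the V2 controller's
    "TooManyMissedTimes" reason.
    """
    config.load_kube_config()
    core_api = client.CoreV1Api()
    return [
        event for event in core_api.list_namespaced_event(namespace).items
        if event.type == 'Warning'
        and event.reason in ('FailedNeedsStart', 'TooManyMissedTimes')
    ]
```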
26 changes: 20 additions & 6 deletions sat/hms_discovery.py
@@ -82,6 +82,13 @@ def data(self):
raise HMSDiscoveryError(f'Failed to get data for '
f'{self.FULL_NAME}: {err}') from err

@property
def schedule_interval(self):
"""int: the number of seconds between scheduled runs of the cronjob"""
ci = croniter(self.data.spec.schedule, start_time=datetime.now(tz=tzutc()))
next_time = ci.get_next()
return int(ci.get_next() - next_time)

@property
def job_label_selector(self):
"""str: the label selector for querying jobs created by this cronjob"""
@@ -201,7 +208,8 @@ def recreate(self):
class HMSDiscoveryScheduledWaiter(Waiter):
"""Waiter for HMS discovery cronjob to be scheduled by k8s."""

def __init__(self, poll_interval=5, grace_period=180, retries=1):
def __init__(self, poll_interval=5, grace_period=180, retries=1,
start_time=None):
"""Create a new HMSDiscoveryScheduledWaiter.
Timeout is computed automatically based on the latest possible time
@@ -211,17 +219,23 @@ def __init__(self, poll_interval=5, grace_period=180, retries=1):
poll_interval (int): see `Waiter.__init__`
grace_period (int): the number of seconds after the expected next
scheduled time to wait for the job to be scheduled.
retries (int): see `Waiter.__init__`
start_time (datetime or None): the starting UTC time after which we
should look for jobs to be scheduled.
Raises:
HMSDiscoveryError: if there is an error querying the k8s API for the
schedule of the cronjob.
"""
self.hd_cron_job = HMSDiscoveryCronJob()

# This call can raise HMSDiscoveryError
next_time = self.hd_cron_job.get_latest_next_schedule_time()
self.start_time = datetime.now(tz=tzutc())
timeout = (next_time - self.start_time).seconds + grace_period
if start_time:
self.start_time = start_time
else:
self.start_time = datetime.now(tz=tzutc())

# Wait for one schedule interval plus the grace period
timeout = self.hd_cron_job.schedule_interval + grace_period

super().__init__(timeout, poll_interval, retries)

@@ -242,7 +256,7 @@ def has_completed(self):
try:
jobs = self.hd_cron_job.get_jobs()
return any(
job.metadata.creation_timestamp > self.start_time
job.metadata.creation_timestamp >= self.start_time
for job in jobs
)
except ApiException as err:
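A small standalone sketch of the interval computation used by the new `schedule_interval` property: croniter yields consecutive fire times for a cron expression, and the difference between the next two gives the interval in seconds. The `cron_interval_seconds` wrapper name is just for illustration.

```python
from datetime import datetime

from croniter import croniter
from dateutil.tz import tzutc


def cron_interval_seconds(schedule):
    """Seconds between two consecutive runs of a cron schedule."""
    ci = croniter(schedule, start_time=datetime.now(tz=tzutc()))
    next_time = ci.get_next()        # float UNIX timestamp of the next run
    return int(ci.get_next() - next_time)


print(cron_interval_seconds('*/3 * * * *'))   # 180
print(cron_interval_seconds('0 */3 * * *'))   # 10800
```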
28 changes: 24 additions & 4 deletions tests/test_hms_discovery.py
@@ -1,7 +1,7 @@
#
# MIT License
#
# (C) Copyright 2020-2023 Hewlett Packard Enterprise Development LP
# (C) Copyright 2020-2024 Hewlett Packard Enterprise Development LP
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
@@ -35,7 +35,6 @@
from sat.hms_discovery import (HMSDiscoveryCronJob, HMSDiscoveryError,
HMSDiscoveryScheduledWaiter,
HMSDiscoverySuspendedWaiter)
from sat.waiting import WaitingFailure


class TestHMSDiscoveryCronJob(unittest.TestCase):
@@ -87,6 +86,21 @@ def test_data_api_exception(self):
with self.assertRaisesRegex(HMSDiscoveryError, 'Failed to get data'):
_ = self.hdcj.data

def test_schedule_interval(self):
"""Test the schedule_interval property returns the correct intervals for various schedules"""
schedules_and_intervals = [
('*/3 * * * *', 180),
('*/15 * * * *', 900),
('0 */3 * * *', 10800),
('5 0 * * *', 86400),
]

for schedule, interval in schedules_and_intervals:
mock_cron_job = Mock()
mock_cron_job.spec.schedule = schedule
self.mock_batch_api.read_namespaced_cron_job.return_value = mock_cron_job
self.assertEqual(interval, self.hdcj.schedule_interval)

def test_get_last_schedule_time(self):
"""Test get last_schedule_time method."""
self.assertEqual(
@@ -200,7 +214,7 @@ def setUp(self):

self.next_sched_time = datetime(2020, 10, 31, 8, 3, 0, tzinfo=tzutc())
self.mock_hd_cron_job = patch('sat.hms_discovery.HMSDiscoveryCronJob').start().return_value
self.mock_hd_cron_job.get_latest_next_schedule_time.return_value = self.next_sched_time
self.mock_hd_cron_job.schedule_interval = 180

self.hd_waiter = HMSDiscoveryScheduledWaiter(self.poll_interval, self.grace_period)

@@ -210,12 +224,18 @@ def tearDown(self):

def test_init(self):
"""Test creation of an HMSDiscoveryScheduledWaiter."""
self.assertEqual((self.next_sched_time - self.start_time).seconds + self.grace_period,
self.assertEqual(self.mock_hd_cron_job.schedule_interval + self.grace_period,
self.hd_waiter.timeout)
self.assertEqual(self.poll_interval, self.hd_waiter.poll_interval)
self.assertEqual(self.start_time, self.hd_waiter.start_time)
self.assertEqual(self.mock_hd_cron_job, self.hd_waiter.hd_cron_job)

def test_init_start_time(self):
"""Test creation of an HMSDiscoveryScheduledWaiter with overridden start_time"""
start_time = datetime(2018, 6, 7, 8, 45, 0, tzinfo=tzutc())
hd_waiter = HMSDiscoveryScheduledWaiter(start_time=start_time)
self.assertEqual(start_time, hd_waiter.start_time)

def test_condition_name(self):
"""Test the condition_name method of the HMSDiscoveryScheduledWaiter."""
self.assertEqual('HMS Discovery Scheduled', self.hd_waiter.condition_name())