From ed2a66098943419dd60092dd9bc1b4e64f93fde1 Mon Sep 17 00:00:00 2001 From: Radovan Zvoncek Date: Wed, 27 Nov 2024 19:20:27 +0200 Subject: [PATCH 01/10] K8OP-294 Do not try to work out backup status if there are no pods --- CHANGELOG/CHANGELOG-1.21.md | 1 + .../medusa/medusabackupjob_controller.go | 9 ++ .../medusa/medusabackupjob_controller_test.go | 112 +++++++++++++----- 3 files changed, 94 insertions(+), 28 deletions(-) diff --git a/CHANGELOG/CHANGELOG-1.21.md b/CHANGELOG/CHANGELOG-1.21.md index dcd092818..c7927cc05 100644 --- a/CHANGELOG/CHANGELOG-1.21.md +++ b/CHANGELOG/CHANGELOG-1.21.md @@ -18,3 +18,4 @@ When cutting a new release, update the `unreleased` heading to the tag being gen * [CHANGE] [#1441](https://github.com/k8ssandra/k8ssandra-operator/issues/1441) Use k8ssandra-client instead of k8ssandra-tools for CRD upgrades * [BUGFIX] [#1383](https://github.com/k8ssandra/k8ssandra-operator/issues/1383) Do not create MedusaBackup if MedusaBakupJob did not fully succeed * [ENHANCEMENT] [#1667](https://github.com/k8ssahttps://github.com/k8ssandra/k8ssandra/issues/1667) Add `skipSchemaMigration` option to `K8ssandraCluster.spec.reaper` +* [BUGFIX] [#1454](https://github.com/k8ssandra/k8ssandra-operator/issues/1454) Do not try to work out backup status if there are no pods diff --git a/controllers/medusa/medusabackupjob_controller.go b/controllers/medusa/medusabackupjob_controller.go index 36607440f..0ec105c3e 100644 --- a/controllers/medusa/medusabackupjob_controller.go +++ b/controllers/medusa/medusabackupjob_controller.go @@ -103,6 +103,10 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ logger.Error(err, "Failed to get datacenter pods") return ctrl.Result{}, err } + if len(pods) == 0 { + logger.Info("No pods found for datacenter", "CassandraDatacenter", cassdcKey) + return ctrl.Result{Requeue: true}, nil + } // If there is anything in progress, simply requeue the request until each pod has finished or errored if 
len(backupJob.Status.InProgress) > 0 { @@ -186,6 +190,11 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ logger.Error(err, "Failed to get backup summary") return ctrl.Result{}, err } + if backupSummary == nil { + // if a backup is complete, but we fail to find the summary, we're in a non-recoverable situation + logger.Info("Backup summary not found", "backupJob", backupJob) + return ctrl.Result{Requeue: false}, nil + } if err := r.createMedusaBackup(ctx, backupJob, backupSummary, logger); err != nil { logger.Error(err, "Failed to create MedusaBackup") return ctrl.Result{}, err diff --git a/controllers/medusa/medusabackupjob_controller_test.go b/controllers/medusa/medusabackupjob_controller_test.go index e20ce82f8..e6dca4095 100644 --- a/controllers/medusa/medusabackupjob_controller_test.go +++ b/controllers/medusa/medusabackupjob_controller_test.go @@ -7,6 +7,7 @@ import ( "strings" "sync" "testing" + "time" cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1" k8ss "github.com/k8ssandra/k8ssandra-operator/apis/k8ssandra/v1alpha1" @@ -31,6 +32,7 @@ const ( successfulBackupName = "good-backup" failingBackupName = "bad-backup" missingBackupName = "missing-backup" + backupWithNoPods = "backup-with-no-pods" dc1PodPrefix = "192.168.1." dc2PodPrefix = "192.168.2." fakeBackupFileCount = int64(13) @@ -167,6 +169,10 @@ func testMedusaBackupDatacenter(t *testing.T, ctx context.Context, f *framework. 
backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, missingBackupName) require.False(backupCreated, "the backup object shouldn't have been created") + // in K8OP-294 we found out we can try to make backups on StatefulSets with no pods + backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, backupWithNoPods) + require.False(backupCreated, "the backup object shouldn't have been created") + err = f.DeleteK8ssandraCluster(ctx, client.ObjectKey{Namespace: kc.Namespace, Name: kc.Name}, timeout, interval) require.NoError(err, "failed to delete K8ssandraCluster") verifyObjectDoesNotExist(ctx, t, f, dc1Key, &cassdcapi.CassandraDatacenter{}) @@ -202,13 +208,23 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa } } - createDatacenterPods(t, f, ctx, dcKey, dc) + if backupName != backupWithNoPods { + createDatacenterPods(t, f, ctx, dcKey, dc) + } dcCopy := dc.DeepCopy() dcKeyCopy := framework.NewClusterKey(f.DataPlaneContexts[0], dcKey.Namespace+"-copy", dcKey.Name) dcCopy.ObjectMeta.Namespace = dc.Namespace + "-copy" - createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy) + if backupName != backupWithNoPods { + createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy) + } + + // clean up the pods to keep the environment reusable + if backupName != backupWithNoPods { + defer deleteDatacenterPods(t, f, ctx, dcKey, dc) + defer deleteDatacenterPods(t, f, ctx, dcKeyCopy, dc) + } t.Log("creating MedusaBackupJob") backupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName) @@ -225,32 +241,14 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa err := f.Create(ctx, backupKey, backup) require.NoError(err, "failed to create MedusaBackupJob") - t.Log("verify that the backups are started") - require.Eventually(func() bool { - t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName())) - updated := 
&api.MedusaBackupJob{} - err := f.Get(ctx, backupKey, updated) - if err != nil { - t.Logf("failed to get MedusaBackupJob: %v", err) - return false - } - return !updated.Status.StartTime.IsZero() - }, timeout, interval) - - t.Log("verify the backup finished") - require.Eventually(func() bool { - t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName())) - updated := &api.MedusaBackupJob{} - err := f.Get(ctx, backupKey, updated) - if err != nil { - return false - } - t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime) - t.Logf("backup %s failed: %v", backupName, updated.Status.Failed) - t.Logf("backup %s finished: %v", backupName, updated.Status.Finished) - t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress) - return !updated.Status.FinishTime.IsZero() - }, timeout, interval) + if backupName != backupWithNoPods { + t.Log("verify that the backups start eventually") + verifyBackupJobStarted(require.Eventually, t, dc, f, ctx, backupKey) + verifyBackupJobFinished(t, require, dc, f, ctx, backupKey, backupName) + } else { + t.Log("verify that the backups never start") + verifyBackupJobStarted(require.Never, t, dc, f, ctx, backupKey) + } t.Log("check for the MedusaBackup being created") medusaBackupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName) @@ -258,6 +256,7 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa err = f.Get(ctx, medusaBackupKey, medusaBackup) if err != nil { if errors.IsNotFound(err) { + // exit the test if the backup was not created. 
this happens for some backup names on purpose return false } } @@ -274,6 +273,43 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa return true } +func verifyBackupJobFinished(t *testing.T, require *require.Assertions, dc *cassdcapi.CassandraDatacenter, f *framework.Framework, ctx context.Context, backupKey framework.ClusterKey, backupName string) { + t.Log("verify the backup finished") + require.Eventually(func() bool { + t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName())) + updated := &api.MedusaBackupJob{} + err := f.Get(ctx, backupKey, updated) + if err != nil { + return false + } + t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime) + t.Logf("backup %s failed: %v", backupName, updated.Status.Failed) + t.Logf("backup %s finished: %v", backupName, updated.Status.Finished) + t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress) + return !updated.Status.FinishTime.IsZero() + }, timeout, interval) +} + +func verifyBackupJobStarted( + verifyFunction func(condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}), + t *testing.T, + dc *cassdcapi.CassandraDatacenter, + f *framework.Framework, + ctx context.Context, + backupKey framework.ClusterKey, +) { + verifyFunction(func() bool { + t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName())) + updated := &api.MedusaBackupJob{} + err := f.Get(ctx, backupKey, updated) + if err != nil { + t.Logf("failed to get MedusaBackupJob: %v", err) + return false + } + return !updated.Status.StartTime.IsZero() + }, timeout, interval) +} + func reconcileReplicatedSecret(ctx context.Context, t *testing.T, f *framework.Framework, kc *k8ss.K8ssandraCluster) { t.Log("check ReplicatedSecret reconciled") @@ -488,6 +524,26 @@ func findDatacenterCondition(status *cassdcapi.CassandraDatacenterStatus, condTy return nil } +func deleteDatacenterPods(t 
*testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) { + for i := int32(0); i < dc.Spec.Size; i++ { + pod := &corev1.Pod{} + podName := fmt.Sprintf("%s-%s-%d", dc.Spec.ClusterName, dc.DatacenterName(), i) + podKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, podName) + err := f.Get(ctx, podKey, pod) + if err != nil { + if !errors.IsNotFound(err) { + t.Logf("failed to get pod %s: %v", podKey, err) + } + } else { + err = f.Delete(ctx, podKey, pod) + if err != nil { + t.Logf("failed to delete pod %s: %v", podKey, err) + } + } + + } +} + func createDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) { _ = f.CreateNamespace(dcKey.Namespace) for i := int32(0); i < dc.Spec.Size; i++ { From 417b98a256ad4567306143971d5318718f446a3f Mon Sep 17 00:00:00 2001 From: Radovan Zvoncek Date: Thu, 28 Nov 2024 17:31:46 +0200 Subject: [PATCH 02/10] Fix the medusa task controller test --- .../medusa/medusabackupjob_controller_test.go | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/controllers/medusa/medusabackupjob_controller_test.go b/controllers/medusa/medusabackupjob_controller_test.go index e6dca4095..f8c7ec3ce 100644 --- a/controllers/medusa/medusabackupjob_controller_test.go +++ b/controllers/medusa/medusabackupjob_controller_test.go @@ -208,22 +208,23 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa } } - if backupName != backupWithNoPods { - createDatacenterPods(t, f, ctx, dcKey, dc) - } + createDatacenterPods(t, f, ctx, dcKey, dc) dcCopy := dc.DeepCopy() dcKeyCopy := framework.NewClusterKey(f.DataPlaneContexts[0], dcKey.Namespace+"-copy", dcKey.Name) dcCopy.ObjectMeta.Namespace = dc.Namespace + "-copy" - if backupName != backupWithNoPods { - createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy) - } - - // clean up the pods to keep the 
environment reusable - if backupName != backupWithNoPods { - defer deleteDatacenterPods(t, f, ctx, dcKey, dc) - defer deleteDatacenterPods(t, f, ctx, dcKeyCopy, dc) + createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy) + + // one test scenario needs to have no pods available in the STSs (see #1454) + // we reproduce that by deleting the pods. we need this for the medusa backup controller tests to work + // however, we need to bring them back up because medusa task controller tests interact with them later + // both backup and task controller tests use this function to verify backups + if backupName == backupWithNoPods { + deleteDatacenterPods(t, f, ctx, dcKey, dc) + deleteDatacenterPods(t, f, ctx, dcKeyCopy, dc) + defer createDatacenterPods(t, f, ctx, dcKey, dc) + defer createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy) } t.Log("creating MedusaBackupJob") From ca96675604603cc7d8c33a757347aff32279c99e Mon Sep 17 00:00:00 2001 From: Radovan Zvoncek Date: Fri, 29 Nov 2024 11:16:50 +0200 Subject: [PATCH 03/10] Do not duplicate the slice with pods --- pkg/medusa/utils.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pkg/medusa/utils.go b/pkg/medusa/utils.go index cc04907a2..7e1968dc1 100644 --- a/pkg/medusa/utils.go +++ b/pkg/medusa/utils.go @@ -2,6 +2,7 @@ package medusa import ( "context" + "errors" "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" @@ -21,8 +22,8 @@ func GetCassandraDatacenterPods(ctx context.Context, cassdc *cassdcapi.Cassandra return nil, err } - pods := make([]corev1.Pod, 0) - pods = append(pods, podList.Items...) 
- - return pods, nil + if podList.Items != nil { + return podList.Items, nil + } + return nil, errors.New("podList came with nil Items field") } From 74627720041f350b47e081fffb03896455132880 Mon Sep 17 00:00:00 2001 From: Radovan Zvoncek Date: Fri, 29 Nov 2024 11:26:51 +0200 Subject: [PATCH 04/10] Return an error, not nil, if backupsummary is not found --- controllers/medusa/medusabackupjob_controller.go | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/controllers/medusa/medusabackupjob_controller.go b/controllers/medusa/medusabackupjob_controller.go index 0ec105c3e..ebdbd4a3d 100644 --- a/controllers/medusa/medusabackupjob_controller.go +++ b/controllers/medusa/medusabackupjob_controller.go @@ -18,12 +18,13 @@ package medusa import ( "context" + "errors" "fmt" "net" "strings" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apiErrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -67,7 +68,7 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ err := r.Get(ctx, req.NamespacedName, instance) if err != nil { logger.Error(err, "Failed to get MedusaBackupJob") - if errors.IsNotFound(err) { + if apiErrors.IsNotFound(err) { return ctrl.Result{}, nil } return ctrl.Result{}, err @@ -190,11 +191,6 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ logger.Error(err, "Failed to get backup summary") return ctrl.Result{}, err } - if backupSummary == nil { - // if a backup is complete, but we fail to find the summary, we're in a non-recoverable situation - logger.Info("Backup summary not found", "backupJob", backupJob) - return ctrl.Result{Requeue: false}, nil - } if err := r.createMedusaBackup(ctx, backupJob, backupSummary, logger); err != nil { logger.Error(err, "Failed to create MedusaBackup") return ctrl.Result{}, err @@ -257,7 +253,7 @@ func (r 
*MedusaBackupJobReconciler) getBackupSummary(ctx context.Context, backup } } } - return nil, nil + return nil, errors.New("backup summary couldn't be found") } func (r *MedusaBackupJobReconciler) createMedusaBackup(ctx context.Context, backup *medusav1alpha1.MedusaBackupJob, backupSummary *medusa.BackupSummary, logger logr.Logger) error { @@ -266,7 +262,7 @@ func (r *MedusaBackupJobReconciler) createMedusaBackup(ctx context.Context, back backupKey := types.NamespacedName{Namespace: backup.ObjectMeta.Namespace, Name: backup.Name} backupResource := &medusav1alpha1.MedusaBackup{} if err := r.Get(ctx, backupKey, backupResource); err != nil { - if errors.IsNotFound(err) { + if apiErrors.IsNotFound(err) { // Backup doesn't exist, create it startTime := backup.Status.StartTime finishTime := metav1.Now() @@ -340,7 +336,7 @@ func backupStatus(ctx context.Context, name string, pod *corev1.Pod, clientFacto resp, err := medusaClient.BackupStatus(ctx, name) if err != nil { // the gRPC client does not return proper NotFound error, we need to check the payload too - if errors.IsNotFound(err) || strings.Contains(err.Error(), "NotFound") { + if apiErrors.IsNotFound(err) || strings.Contains(err.Error(), "NotFound") { logger.Info(fmt.Sprintf("did not find backup %s for pod %s", name, pod.Name)) return medusa.StatusType_UNKNOWN, nil } From d5607c60f5e8d0f98cfa9676f9cdbef5c1a60a0d Mon Sep 17 00:00:00 2001 From: Radovan Zvoncek Date: Fri, 29 Nov 2024 11:32:05 +0200 Subject: [PATCH 05/10] Replace int32 with int when deleting pods in test --- controllers/medusa/medusabackupjob_controller_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/medusa/medusabackupjob_controller_test.go b/controllers/medusa/medusabackupjob_controller_test.go index f8c7ec3ce..0a9516ddf 100644 --- a/controllers/medusa/medusabackupjob_controller_test.go +++ b/controllers/medusa/medusabackupjob_controller_test.go @@ -526,7 +526,7 @@ func findDatacenterCondition(status 
*cassdcapi.CassandraDatacenterStatus, condTy } func deleteDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) { - for i := int32(0); i < dc.Spec.Size; i++ { + for i := 0; i < int(dc.Spec.Size); i++ { pod := &corev1.Pod{} podName := fmt.Sprintf("%s-%s-%d", dc.Spec.ClusterName, dc.DatacenterName(), i) podKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, podName) From d9b53f5d555ae75dbd63454a1a01ecc00f116896 Mon Sep 17 00:00:00 2001 From: Radovan Zvoncek Date: Fri, 29 Nov 2024 11:37:42 +0200 Subject: [PATCH 06/10] Return nil backup summary for backup with no pods --- .../medusa/medusabackupjob_controller_test.go | 61 ++++++++++--------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/controllers/medusa/medusabackupjob_controller_test.go b/controllers/medusa/medusabackupjob_controller_test.go index 0a9516ddf..c2100b6b5 100644 --- a/controllers/medusa/medusabackupjob_controller_test.go +++ b/controllers/medusa/medusabackupjob_controller_test.go @@ -433,35 +433,40 @@ func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSumm status = *medusa.StatusType_IN_PROGRESS.Enum() } - backup := &medusa.BackupSummary{ - BackupName: name, - StartTime: 0, - FinishTime: 10, - TotalNodes: 3, - FinishedNodes: 3, - TotalObjects: fakeBackupFileCount, - TotalSize: fakeBackupByteSize, - Status: status, - Nodes: []*medusa.BackupNode{ - { - Host: "host1", - Tokens: []int64{1, 2, 3}, - Datacenter: "dc1", - Rack: "rack1", - }, - { - Host: "host2", - Tokens: []int64{1, 2, 3}, - Datacenter: "dc1", - Rack: "rack1", - }, - { - Host: "host3", - Tokens: []int64{1, 2, 3}, - Datacenter: "dc1", - Rack: "rack1", + var backup *medusa.BackupSummary + if strings.HasPrefix(name, backupWithNoPods) { + backup = nil + } else { + backup = &medusa.BackupSummary{ + BackupName: name, + StartTime: 0, + FinishTime: 10, + TotalNodes: 3, + FinishedNodes: 3, + TotalObjects: 
fakeBackupFileCount, + TotalSize: fakeBackupByteSize, + Status: status, + Nodes: []*medusa.BackupNode{ + { + Host: "host1", + Tokens: []int64{1, 2, 3}, + Datacenter: "dc1", + Rack: "rack1", + }, + { + Host: "host2", + Tokens: []int64{1, 2, 3}, + Datacenter: "dc1", + Rack: "rack1", + }, + { + Host: "host3", + Tokens: []int64{1, 2, 3}, + Datacenter: "dc1", + Rack: "rack1", + }, }, - }, + } } backups = append(backups, backup) } From 5e1f31ad8bcc2d3cf2486c9e65e3cd1ad983d40b Mon Sep 17 00:00:00 2001 From: Radovan Zvoncek Date: Mon, 2 Dec 2024 11:34:57 +0200 Subject: [PATCH 07/10] Remove get before delete in medusabackupjob_controller_test --- .../medusa/medusabackupjob_controller_test.go | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/controllers/medusa/medusabackupjob_controller_test.go b/controllers/medusa/medusabackupjob_controller_test.go index c2100b6b5..bda09450c 100644 --- a/controllers/medusa/medusabackupjob_controller_test.go +++ b/controllers/medusa/medusabackupjob_controller_test.go @@ -535,18 +535,10 @@ func deleteDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Cont pod := &corev1.Pod{} podName := fmt.Sprintf("%s-%s-%d", dc.Spec.ClusterName, dc.DatacenterName(), i) podKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, podName) - err := f.Get(ctx, podKey, pod) + err := f.Delete(ctx, podKey, pod) if err != nil { - if !errors.IsNotFound(err) { - t.Logf("failed to get pod %s: %v", podKey, err) - } - } else { - err = f.Delete(ctx, podKey, pod) - if err != nil { - t.Logf("failed to delete pod %s: %v", podKey, err) - } + t.Logf("failed to delete pod %s: %v", podKey, err) } - } } From fc97af92b63d8b09286f464b2caf494312b163ef Mon Sep 17 00:00:00 2001 From: Radovan Zvoncek Date: Mon, 2 Dec 2024 11:40:36 +0200 Subject: [PATCH 08/10] Use a dedicated test backup for the nil backupSummary case --- controllers/medusa/medusabackupjob_controller_test.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) 
diff --git a/controllers/medusa/medusabackupjob_controller_test.go b/controllers/medusa/medusabackupjob_controller_test.go index bda09450c..07b29604c 100644 --- a/controllers/medusa/medusabackupjob_controller_test.go +++ b/controllers/medusa/medusabackupjob_controller_test.go @@ -33,6 +33,7 @@ const ( failingBackupName = "bad-backup" missingBackupName = "missing-backup" backupWithNoPods = "backup-with-no-pods" + backupWithNilSummary = "backup-with-nil-summary" dc1PodPrefix = "192.168.1." dc2PodPrefix = "192.168.2." fakeBackupFileCount = int64(13) @@ -172,6 +173,9 @@ func testMedusaBackupDatacenter(t *testing.T, ctx context.Context, f *framework. // in K8OP-294 we found out we can try to make backups on StatefulSets with no pods backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, backupWithNoPods) require.False(backupCreated, "the backup object shouldn't have been created") + // in that same effort, we also found out we can have nil instead of backup summary + backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, backupWithNilSummary) + require.False(backupCreated, "the backup object shouldn't have been created") err = f.DeleteK8ssandraCluster(ctx, client.ObjectKey{Namespace: kc.Namespace, Name: kc.Name}, timeout, interval) require.NoError(err, "failed to delete K8ssandraCluster") @@ -434,7 +438,7 @@ func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSumm } var backup *medusa.BackupSummary - if strings.HasPrefix(name, backupWithNoPods) { + if strings.HasPrefix(name, backupWithNilSummary) { backup = nil } else { backup = &medusa.BackupSummary{ From d04f2ed850aded28c549835e6073f953a5d79a06 Mon Sep 17 00:00:00 2001 From: Radovan Zvoncek Date: Mon, 2 Dec 2024 11:43:16 +0200 Subject: [PATCH 09/10] Return reconcile.TerminalError if getBackupSummary originates the error --- controllers/medusa/medusabackupjob_controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/controllers/medusa/medusabackupjob_controller.go b/controllers/medusa/medusabackupjob_controller.go index ebdbd4a3d..80a676818 100644 --- a/controllers/medusa/medusabackupjob_controller.go +++ b/controllers/medusa/medusabackupjob_controller.go @@ -18,7 +18,6 @@ package medusa import ( "context" - "errors" "fmt" "net" "strings" @@ -34,6 +33,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/go-logr/logr" cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1" @@ -253,7 +253,7 @@ func (r *MedusaBackupJobReconciler) getBackupSummary(ctx context.Context, backup } } } - return nil, errors.New("backup summary couldn't be found") + return nil, reconcile.TerminalError(fmt.Errorf("backup summary couldn't be found")) } func (r *MedusaBackupJobReconciler) createMedusaBackup(ctx context.Context, backup *medusav1alpha1.MedusaBackupJob, backupSummary *medusa.BackupSummary, logger logr.Logger) error { From 3ae146ace593024ebf5a577cb2020924525b236c Mon Sep 17 00:00:00 2001 From: Radovan Zvoncek Date: Tue, 3 Dec 2024 12:43:27 +0200 Subject: [PATCH 10/10] Expect backup with nil summary to actually start --- controllers/medusa/medusabackupjob_controller.go | 5 +++++ controllers/medusa/medusabackupjob_controller_test.go | 10 ++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/controllers/medusa/medusabackupjob_controller.go b/controllers/medusa/medusabackupjob_controller.go index 80a676818..1376d0e22 100644 --- a/controllers/medusa/medusabackupjob_controller.go +++ b/controllers/medusa/medusabackupjob_controller.go @@ -245,6 +245,11 @@ func (r *MedusaBackupJobReconciler) getBackupSummary(ctx context.Context, backup return nil, err } else { for _, remoteBackup := range remoteBackups { + if remoteBackup == nil { + err := fmt.Errorf("backup %s summary is nil", 
backup.Name) + logger.Error(err, "remote backup is nil") + return nil, err + } logger.Info("found backup", "CassandraPod", pod.Name, "Backup", remoteBackup.BackupName) if backup.ObjectMeta.Name == remoteBackup.BackupName { return remoteBackup, nil diff --git a/controllers/medusa/medusabackupjob_controller_test.go b/controllers/medusa/medusabackupjob_controller_test.go index 07b29604c..9a33684f5 100644 --- a/controllers/medusa/medusabackupjob_controller_test.go +++ b/controllers/medusa/medusabackupjob_controller_test.go @@ -249,7 +249,10 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa if backupName != backupWithNoPods { t.Log("verify that the backups start eventually") verifyBackupJobStarted(require.Eventually, t, dc, f, ctx, backupKey) - verifyBackupJobFinished(t, require, dc, f, ctx, backupKey, backupName) + if backupName != backupWithNilSummary { + // backup that will receive a nil summary will never finish, can't assert on that + verifyBackupJobFinished(t, require, dc, f, ctx, backupKey, backupName) + } } else { t.Log("verify that the backups never start") verifyBackupJobStarted(require.Never, t, dc, f, ctx, backupKey) @@ -536,9 +539,12 @@ func findDatacenterCondition(status *cassdcapi.CassandraDatacenterStatus, condTy func deleteDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) { for i := 0; i < int(dc.Spec.Size); i++ { - pod := &corev1.Pod{} + podName := fmt.Sprintf("%s-%s-%d", dc.Spec.ClusterName, dc.DatacenterName(), i) podKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, podName) + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: podKey.Name, Namespace: podKey.Namespace}, + } err := f.Delete(ctx, podKey, pod) if err != nil { t.Logf("failed to delete pod %s: %v", podKey, err)