diff --git a/CHANGELOG/CHANGELOG-1.21.md b/CHANGELOG/CHANGELOG-1.21.md index dcd092818..c7927cc05 100644 --- a/CHANGELOG/CHANGELOG-1.21.md +++ b/CHANGELOG/CHANGELOG-1.21.md @@ -18,3 +18,4 @@ When cutting a new release, update the `unreleased` heading to the tag being gen * [CHANGE] [#1441](https://github.com/k8ssandra/k8ssandra-operator/issues/1441) Use k8ssandra-client instead of k8ssandra-tools for CRD upgrades * [BUGFIX] [#1383](https://github.com/k8ssandra/k8ssandra-operator/issues/1383) Do not create MedusaBackup if MedusaBakupJob did not fully succeed * [ENHANCEMENT] [#1667](https://github.com/k8ssahttps://github.com/k8ssandra/k8ssandra/issues/1667) Add `skipSchemaMigration` option to `K8ssandraCluster.spec.reaper` +* [BUGFIX] [#1454](https://github.com/k8ssandra/k8ssandra-operator/issues/1454) Do not try to work out backup status if there are no pods diff --git a/controllers/medusa/medusabackupjob_controller.go b/controllers/medusa/medusabackupjob_controller.go index 36607440f..1376d0e22 100644 --- a/controllers/medusa/medusabackupjob_controller.go +++ b/controllers/medusa/medusabackupjob_controller.go @@ -23,7 +23,7 @@ import ( "strings" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" + apiErrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -33,6 +33,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/go-logr/logr" cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1" @@ -67,7 +68,7 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ err := r.Get(ctx, req.NamespacedName, instance) if err != nil { logger.Error(err, "Failed to get MedusaBackupJob") - if errors.IsNotFound(err) { + if apiErrors.IsNotFound(err) { return 
ctrl.Result{}, nil } return ctrl.Result{}, err @@ -103,6 +104,10 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ logger.Error(err, "Failed to get datacenter pods") return ctrl.Result{}, err } + if len(pods) == 0 { + logger.Info("No pods found for datacenter", "CassandraDatacenter", cassdcKey) + return ctrl.Result{Requeue: true}, nil + } // If there is anything in progress, simply requeue the request until each pod has finished or errored if len(backupJob.Status.InProgress) > 0 { @@ -240,6 +245,11 @@ func (r *MedusaBackupJobReconciler) getBackupSummary(ctx context.Context, backup return nil, err } else { for _, remoteBackup := range remoteBackups { + if remoteBackup == nil { + err := fmt.Errorf("backup %s summary is nil", backup.Name) + logger.Error(err, "remote backup is nil") + return nil, err + } logger.Info("found backup", "CassandraPod", pod.Name, "Backup", remoteBackup.BackupName) if backup.ObjectMeta.Name == remoteBackup.BackupName { return remoteBackup, nil @@ -248,7 +258,7 @@ func (r *MedusaBackupJobReconciler) getBackupSummary(ctx context.Context, backup } } } - return nil, nil + return nil, reconcile.TerminalError(fmt.Errorf("backup summary couldn't be found")) } func (r *MedusaBackupJobReconciler) createMedusaBackup(ctx context.Context, backup *medusav1alpha1.MedusaBackupJob, backupSummary *medusa.BackupSummary, logger logr.Logger) error { @@ -257,7 +267,7 @@ func (r *MedusaBackupJobReconciler) createMedusaBackup(ctx context.Context, back backupKey := types.NamespacedName{Namespace: backup.ObjectMeta.Namespace, Name: backup.Name} backupResource := &medusav1alpha1.MedusaBackup{} if err := r.Get(ctx, backupKey, backupResource); err != nil { - if errors.IsNotFound(err) { + if apiErrors.IsNotFound(err) { // Backup doesn't exist, create it startTime := backup.Status.StartTime finishTime := metav1.Now() @@ -331,7 +341,7 @@ func backupStatus(ctx context.Context, name string, pod *corev1.Pod, clientFacto resp, err := 
medusaClient.BackupStatus(ctx, name) if err != nil { // the gRPC client does not return proper NotFound error, we need to check the payload too - if errors.IsNotFound(err) || strings.Contains(err.Error(), "NotFound") { + if apiErrors.IsNotFound(err) || strings.Contains(err.Error(), "NotFound") { logger.Info(fmt.Sprintf("did not find backup %s for pod %s", name, pod.Name)) return medusa.StatusType_UNKNOWN, nil } diff --git a/controllers/medusa/medusabackupjob_controller_test.go b/controllers/medusa/medusabackupjob_controller_test.go index e20ce82f8..9a33684f5 100644 --- a/controllers/medusa/medusabackupjob_controller_test.go +++ b/controllers/medusa/medusabackupjob_controller_test.go @@ -7,6 +7,7 @@ import ( "strings" "sync" "testing" + "time" cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1" k8ss "github.com/k8ssandra/k8ssandra-operator/apis/k8ssandra/v1alpha1" @@ -31,6 +32,8 @@ const ( successfulBackupName = "good-backup" failingBackupName = "bad-backup" missingBackupName = "missing-backup" + backupWithNoPods = "backup-with-no-pods" + backupWithNilSummary = "backup-with-nil-summary" dc1PodPrefix = "192.168.1." dc2PodPrefix = "192.168.2." fakeBackupFileCount = int64(13) @@ -167,6 +170,13 @@ func testMedusaBackupDatacenter(t *testing.T, ctx context.Context, f *framework. 
backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, missingBackupName) require.False(backupCreated, "the backup object shouldn't have been created") + // in K8OP-294 we found out we can try to make backups on StatefulSets with no pods + backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, backupWithNoPods) + require.False(backupCreated, "the backup object shouldn't have been created") + // in that same effort, we also found out we can have nil instead of backup summary + backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, backupWithNilSummary) + require.False(backupCreated, "the backup object shouldn't have been created") + err = f.DeleteK8ssandraCluster(ctx, client.ObjectKey{Namespace: kc.Namespace, Name: kc.Name}, timeout, interval) require.NoError(err, "failed to delete K8ssandraCluster") verifyObjectDoesNotExist(ctx, t, f, dc1Key, &cassdcapi.CassandraDatacenter{}) @@ -210,6 +220,17 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy) + // one test scenario needs to have no pods available in the STSs (see #1454) + // we reproduce that by deleting the pods.
we need this for the medusa backup controller tests to work + // however, we need to bring them back up because medusa task controller tests interact with them later + // both backup and task controller tests use this function to verify backups + if backupName == backupWithNoPods { + deleteDatacenterPods(t, f, ctx, dcKey, dc) + deleteDatacenterPods(t, f, ctx, dcKeyCopy, dc) + defer createDatacenterPods(t, f, ctx, dcKey, dc) + defer createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy) + } + t.Log("creating MedusaBackupJob") backupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName) backup := &api.MedusaBackupJob{ @@ -225,32 +246,17 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa err := f.Create(ctx, backupKey, backup) require.NoError(err, "failed to create MedusaBackupJob") - t.Log("verify that the backups are started") - require.Eventually(func() bool { - t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName())) - updated := &api.MedusaBackupJob{} - err := f.Get(ctx, backupKey, updated) - if err != nil { - t.Logf("failed to get MedusaBackupJob: %v", err) - return false + if backupName != backupWithNoPods { + t.Log("verify that the backups start eventually") + verifyBackupJobStarted(require.Eventually, t, dc, f, ctx, backupKey) + if backupName != backupWithNilSummary { + // backup that will receive a nil summary will never finish, can't assert on that + verifyBackupJobFinished(t, require, dc, f, ctx, backupKey, backupName) } - return !updated.Status.StartTime.IsZero() - }, timeout, interval) - - t.Log("verify the backup finished") - require.Eventually(func() bool { - t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName())) - updated := &api.MedusaBackupJob{} - err := f.Get(ctx, backupKey, updated) - if err != nil { - return false - } - t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime) - t.Logf("backup %s failed: 
%v", backupName, updated.Status.Failed) - t.Logf("backup %s finished: %v", backupName, updated.Status.Finished) - t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress) - return !updated.Status.FinishTime.IsZero() - }, timeout, interval) + } else { + t.Log("verify that the backups never start") + verifyBackupJobStarted(require.Never, t, dc, f, ctx, backupKey) + } t.Log("check for the MedusaBackup being created") medusaBackupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName) @@ -258,6 +264,7 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa err = f.Get(ctx, medusaBackupKey, medusaBackup) if err != nil { if errors.IsNotFound(err) { + // exit the test if the backup was not created. this happens for some backup names on purpose return false } } @@ -274,6 +281,43 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa return true } +func verifyBackupJobFinished(t *testing.T, require *require.Assertions, dc *cassdcapi.CassandraDatacenter, f *framework.Framework, ctx context.Context, backupKey framework.ClusterKey, backupName string) { + t.Log("verify the backup finished") + require.Eventually(func() bool { + t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName())) + updated := &api.MedusaBackupJob{} + err := f.Get(ctx, backupKey, updated) + if err != nil { + return false + } + t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime) + t.Logf("backup %s failed: %v", backupName, updated.Status.Failed) + t.Logf("backup %s finished: %v", backupName, updated.Status.Finished) + t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress) + return !updated.Status.FinishTime.IsZero() + }, timeout, interval) +} + +func verifyBackupJobStarted( + verifyFunction func(condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}), + t *testing.T, + dc 
*cassdcapi.CassandraDatacenter, + f *framework.Framework, + ctx context.Context, + backupKey framework.ClusterKey, +) { + verifyFunction(func() bool { + t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName())) + updated := &api.MedusaBackupJob{} + err := f.Get(ctx, backupKey, updated) + if err != nil { + t.Logf("failed to get MedusaBackupJob: %v", err) + return false + } + return !updated.Status.StartTime.IsZero() + }, timeout, interval) +} + func reconcileReplicatedSecret(ctx context.Context, t *testing.T, f *framework.Framework, kc *k8ss.K8ssandraCluster) { t.Log("check ReplicatedSecret reconciled") @@ -396,35 +440,40 @@ func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSumm status = *medusa.StatusType_IN_PROGRESS.Enum() } - backup := &medusa.BackupSummary{ - BackupName: name, - StartTime: 0, - FinishTime: 10, - TotalNodes: 3, - FinishedNodes: 3, - TotalObjects: fakeBackupFileCount, - TotalSize: fakeBackupByteSize, - Status: status, - Nodes: []*medusa.BackupNode{ - { - Host: "host1", - Tokens: []int64{1, 2, 3}, - Datacenter: "dc1", - Rack: "rack1", - }, - { - Host: "host2", - Tokens: []int64{1, 2, 3}, - Datacenter: "dc1", - Rack: "rack1", - }, - { - Host: "host3", - Tokens: []int64{1, 2, 3}, - Datacenter: "dc1", - Rack: "rack1", + var backup *medusa.BackupSummary + if strings.HasPrefix(name, backupWithNilSummary) { + backup = nil + } else { + backup = &medusa.BackupSummary{ + BackupName: name, + StartTime: 0, + FinishTime: 10, + TotalNodes: 3, + FinishedNodes: 3, + TotalObjects: fakeBackupFileCount, + TotalSize: fakeBackupByteSize, + Status: status, + Nodes: []*medusa.BackupNode{ + { + Host: "host1", + Tokens: []int64{1, 2, 3}, + Datacenter: "dc1", + Rack: "rack1", + }, + { + Host: "host2", + Tokens: []int64{1, 2, 3}, + Datacenter: "dc1", + Rack: "rack1", + }, + { + Host: "host3", + Tokens: []int64{1, 2, 3}, + Datacenter: "dc1", + Rack: "rack1", + }, }, - }, + } } backups = append(backups, backup) } 
@@ -488,6 +537,21 @@ func findDatacenterCondition(status *cassdcapi.CassandraDatacenterStatus, condTy return nil } +func deleteDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) { + for i := 0; i < int(dc.Spec.Size); i++ { + + podName := fmt.Sprintf("%s-%s-%d", dc.Spec.ClusterName, dc.DatacenterName(), i) + podKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, podName) + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: podKey.Name, Namespace: podKey.Namespace}, + } + err := f.Delete(ctx, podKey, pod) + if err != nil { + t.Logf("failed to delete pod %s: %v", podKey, err) + } + } +} + func createDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) { _ = f.CreateNamespace(dcKey.Namespace) for i := int32(0); i < dc.Spec.Size; i++ { diff --git a/pkg/medusa/utils.go b/pkg/medusa/utils.go index cc04907a2..7e1968dc1 100644 --- a/pkg/medusa/utils.go +++ b/pkg/medusa/utils.go @@ -2,6 +2,7 @@ package medusa import ( "context" + "errors" "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" @@ -21,8 +22,8 @@ func GetCassandraDatacenterPods(ctx context.Context, cassdc *cassdcapi.Cassandra return nil, err } - pods := make([]corev1.Pod, 0) - pods = append(pods, podList.Items...) - - return pods, nil + if podList.Items != nil { + return podList.Items, nil + } + return nil, errors.New("podList came with nil Items field") }