K8OP-294 Do not try to work out backup status if there are no pods #1462

Open
wants to merge 10 commits into main
1 change: 1 addition & 0 deletions CHANGELOG/CHANGELOG-1.21.md
@@ -18,3 +18,4 @@ When cutting a new release, update the `unreleased` heading to the tag being generated
* [CHANGE] [#1441](https://github.com/k8ssandra/k8ssandra-operator/issues/1441) Use k8ssandra-client instead of k8ssandra-tools for CRD upgrades
* [BUGFIX] [#1383](https://github.com/k8ssandra/k8ssandra-operator/issues/1383) Do not create MedusaBackup if MedusaBackupJob did not fully succeed
* [ENHANCEMENT] [#1667](https://github.com/k8ssandra/k8ssandra/issues/1667) Add `skipSchemaMigration` option to `K8ssandraCluster.spec.reaper`
* [BUGFIX] [#1454](https://github.com/k8ssandra/k8ssandra-operator/issues/1454) Do not try to work out backup status if there are no pods
9 changes: 9 additions & 0 deletions controllers/medusa/medusabackupjob_controller.go
@@ -103,6 +103,10 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Request)
logger.Error(err, "Failed to get datacenter pods")
return ctrl.Result{}, err
}
if len(pods) == 0 {
logger.Info("No pods found for datacenter", "CassandraDatacenter", cassdcKey)
return ctrl.Result{Requeue: true}, nil
}

Contributor: This is related to GetCassandraDatacenterPods:

	pods := make([]corev1.Pod, 0)
	pods = append(pods, podList.Items...)

Why not simply return podList.Items? What's the point of making a new slice? Also, for this method, if there are no pods, why are we requeueing? Do we assume that the pods will reappear?

Contributor Author: I don't know why we re-create the slice; it makes sense not to do it. I pushed a commit fixing this.

Yes, we are requeueing because the assumption is that the pods will indeed reappear. In the initial ticket, they were replacing nodepools, and the pods eventually came back. Checking precisely how this works might deserve at least a manual test, and perhaps a follow-up ticket.
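
For illustration, a minimal sketch of the simplification being discussed, assuming a plausible signature and label selector; the real GetCassandraDatacenterPods helper may take different parameters and select pods differently:

	package medusa

	import (
		"context"

		"github.com/go-logr/logr"
		cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
		corev1 "k8s.io/api/core/v1"
		"sigs.k8s.io/controller-runtime/pkg/client"
	)

	// GetCassandraDatacenterPods lists the pods belonging to a CassandraDatacenter.
	// Returning podList.Items directly avoids allocating a second slice and copying into it.
	func GetCassandraDatacenterPods(ctx context.Context, cassdc *cassdcapi.CassandraDatacenter, r client.Reader, logger logr.Logger) ([]corev1.Pod, error) {
		podList := &corev1.PodList{}
		// The label key and value are assumptions for this sketch.
		selector := client.MatchingLabels{"cassandra.datastax.com/datacenter": cassdc.DatacenterName()}
		if err := r.List(ctx, podList, client.InNamespace(cassdc.Namespace), selector); err != nil {
			logger.Error(err, "failed to list datacenter pods")
			return nil, err
		}
		return podList.Items, nil
	}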

// If there is anything in progress, simply requeue the request until each pod has finished or errored
if len(backupJob.Status.InProgress) > 0 {
@@ -186,6 +190,11 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Request)
logger.Error(err, "Failed to get backup summary")
return ctrl.Result{}, err
}
if backupSummary == nil {
// if a backup is complete, but we fail to find the summary, we're in a non-recoverable situation
logger.Info("Backup summary not found", "backupJob", backupJob)
return ctrl.Result{Requeue: false}, nil
}
if err := r.createMedusaBackup(ctx, backupJob, backupSummary, logger); err != nil {
logger.Error(err, "Failed to create MedusaBackup")
return ctrl.Result{}, err
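A side note on the Requeue: false return above: it is identical to the zero-value ctrl.Result, so the inline comment is what conveys the intent. A minimal check of that equivalence, assuming the usual controller-runtime alias:

	package example

	import (
		"testing"

		ctrl "sigs.k8s.io/controller-runtime"
	)

	// Returning ctrl.Result{Requeue: false} with a nil error is the same as returning the
	// zero value: controller-runtime will not retry until another watch event arrives.
	func TestRequeueFalseIsZeroValue(t *testing.T) {
		if (ctrl.Result{Requeue: false}) != (ctrl.Result{}) {
			t.Fatal("expected Requeue:false to equal the zero-value Result")
		}
	}
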
109 changes: 83 additions & 26 deletions controllers/medusa/medusabackupjob_controller_test.go
@@ -7,6 +7,7 @@ import (
"strings"
"sync"
"testing"
"time"

cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
k8ss "github.com/k8ssandra/k8ssandra-operator/apis/k8ssandra/v1alpha1"
@@ -31,6 +32,7 @@ const (
successfulBackupName = "good-backup"
failingBackupName = "bad-backup"
missingBackupName = "missing-backup"
backupWithNoPods = "backup-with-no-pods"
dc1PodPrefix = "192.168.1."
dc2PodPrefix = "192.168.2."
fakeBackupFileCount = int64(13)
@@ -167,6 +169,10 @@ func testMedusaBackupDatacenter(t *testing.T, ctx context.Context, f *framework.Framework,
backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, missingBackupName)
require.False(backupCreated, "the backup object shouldn't have been created")

// In K8OP-294 we found that a backup could be attempted on StatefulSets with no pods
backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, backupWithNoPods)
require.False(backupCreated, "the backup object shouldn't have been created")

err = f.DeleteK8ssandraCluster(ctx, client.ObjectKey{Namespace: kc.Namespace, Name: kc.Name}, timeout, interval)
require.NoError(err, "failed to delete K8ssandraCluster")
verifyObjectDoesNotExist(ctx, t, f, dc1Key, &cassdcapi.CassandraDatacenter{})
@@ -210,6 +216,17 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter,

createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy)

// One test scenario needs no pods to be available in the StatefulSets (see #1454).
// We reproduce that by deleting the pods, which the Medusa backup controller tests need.
// However, we have to bring the pods back up afterwards because the Medusa task controller tests interact with them later;
// both the backup and the task controller tests use this function to verify backups.
if backupName == backupWithNoPods {
deleteDatacenterPods(t, f, ctx, dcKey, dc)
deleteDatacenterPods(t, f, ctx, dcKeyCopy, dc)
defer createDatacenterPods(t, f, ctx, dcKey, dc)
defer createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy)
}

t.Log("creating MedusaBackupJob")
backupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName)
backup := &api.MedusaBackupJob{
@@ -225,39 +242,22 @@
err := f.Create(ctx, backupKey, backup)
require.NoError(err, "failed to create MedusaBackupJob")

t.Log("verify that the backups are started")
require.Eventually(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
t.Logf("failed to get MedusaBackupJob: %v", err)
return false
}
return !updated.Status.StartTime.IsZero()
}, timeout, interval)

t.Log("verify the backup finished")
require.Eventually(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
return false
}
t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime)
t.Logf("backup %s failed: %v", backupName, updated.Status.Failed)
t.Logf("backup %s finished: %v", backupName, updated.Status.Finished)
t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress)
return !updated.Status.FinishTime.IsZero()
}, timeout, interval)
if backupName != backupWithNoPods {
t.Log("verify that the backups start eventually")
verifyBackupJobStarted(require.Eventually, t, dc, f, ctx, backupKey)
verifyBackupJobFinished(t, require, dc, f, ctx, backupKey, backupName)
} else {
t.Log("verify that the backups never start")
verifyBackupJobStarted(require.Never, t, dc, f, ctx, backupKey)
}

t.Log("check for the MedusaBackup being created")
medusaBackupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName)
medusaBackup := &api.MedusaBackup{}
err = f.Get(ctx, medusaBackupKey, medusaBackup)
if err != nil {
if errors.IsNotFound(err) {
// Exit the test if the backup was not created; this happens on purpose for some backup names.
return false
}
}
@@ -274,6 +274,43 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter,
return true
}

func verifyBackupJobFinished(t *testing.T, require *require.Assertions, dc *cassdcapi.CassandraDatacenter, f *framework.Framework, ctx context.Context, backupKey framework.ClusterKey, backupName string) {
t.Log("verify the backup finished")
require.Eventually(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
return false
}
t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime)
t.Logf("backup %s failed: %v", backupName, updated.Status.Failed)
t.Logf("backup %s finished: %v", backupName, updated.Status.Finished)
t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress)
return !updated.Status.FinishTime.IsZero()
}, timeout, interval)
}

func verifyBackupJobStarted(
verifyFunction func(condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}),
t *testing.T,
dc *cassdcapi.CassandraDatacenter,
f *framework.Framework,
ctx context.Context,
backupKey framework.ClusterKey,
) {
verifyFunction(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
t.Logf("failed to get MedusaBackupJob: %v", err)
return false
}
return !updated.Status.StartTime.IsZero()
}, timeout, interval)
}
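
Passing the assertion itself as a parameter works because (*require.Assertions).Eventually and (*require.Assertions).Never share the same signature, so either method value satisfies verifyFunction. A small self-contained sketch of the same pattern; all names here are illustrative and not part of the operator code:

	package example

	import (
		"testing"
		"time"

		"github.com/stretchr/testify/require"
	)

	// waitOrForbid polls a condition and either expects it to become true eventually
	// or expects it to never become true, depending on the assertion passed in.
	func waitOrForbid(
		verify func(condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}),
		condition func() bool,
	) {
		verify(condition, 2*time.Second, 100*time.Millisecond)
	}

	func TestEventuallyOrNever(t *testing.T) {
		r := require.New(t)
		start := time.Now()

		// Expected to become true before the two-second window ends.
		waitOrForbid(r.Eventually, func() bool { return time.Since(start) > 200*time.Millisecond })
		// Expected to stay false for the whole window.
		waitOrForbid(r.Never, func() bool { return false })
	}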

func reconcileReplicatedSecret(ctx context.Context, t *testing.T, f *framework.Framework, kc *k8ss.K8ssandraCluster) {
t.Log("check ReplicatedSecret reconciled")

@@ -488,6 +525,26 @@ func findDatacenterCondition(status *cassdcapi.CassandraDatacenterStatus, condType
return nil
}

func deleteDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) {
for i := int32(0); i < dc.Spec.Size; i++ {
pod := &corev1.Pod{}
podName := fmt.Sprintf("%s-%s-%d", dc.Spec.ClusterName, dc.DatacenterName(), i)
podKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, podName)
err := f.Get(ctx, podKey, pod)
if err != nil {
if !errors.IsNotFound(err) {
t.Logf("failed to get pod %s: %v", podKey, err)
}
} else {
err = f.Delete(ctx, podKey, pod)
if err != nil {
t.Logf("failed to delete pod %s: %v", podKey, err)
}
}

}
}

func createDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) {
_ = f.CreateNamespace(dcKey.Namespace)
for i := int32(0); i < dc.Spec.Size; i++ {