K8OP-294 Do not try to work out backup status if there are no pods #1462

Open · wants to merge 10 commits into base: main
1 change: 1 addition & 0 deletions CHANGELOG/CHANGELOG-1.21.md
@@ -18,3 +18,4 @@ When cutting a new release, update the `unreleased` heading to the tag being generated
* [CHANGE] [#1441](https://github.com/k8ssandra/k8ssandra-operator/issues/1441) Use k8ssandra-client instead of k8ssandra-tools for CRD upgrades
* [BUGFIX] [#1383](https://github.com/k8ssandra/k8ssandra-operator/issues/1383) Do not create MedusaBackup if MedusaBakupJob did not fully succeed
* [ENHANCEMENT] [#1667](https://github.com/k8ssandra/k8ssandra/issues/1667) Add `skipSchemaMigration` option to `K8ssandraCluster.spec.reaper`
* [BUGFIX] [#1454](https://github.com/k8ssandra/k8ssandra-operator/issues/1454) Do not try to work out backup status if there are no pods
20 changes: 15 additions & 5 deletions controllers/medusa/medusabackupjob_controller.go
@@ -23,7 +23,7 @@ import (
"strings"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
apiErrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
@@ -33,6 +33,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/go-logr/logr"
cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
@@ -67,7 +68,7 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
err := r.Get(ctx, req.NamespacedName, instance)
if err != nil {
logger.Error(err, "Failed to get MedusaBackupJob")
if errors.IsNotFound(err) {
if apiErrors.IsNotFound(err) {
return ctrl.Result{}, nil
}
return ctrl.Result{}, err
@@ -103,6 +104,10 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
logger.Error(err, "Failed to get datacenter pods")
return ctrl.Result{}, err
}
if len(pods) == 0 {
Contributor:

This is related to the `GetCassandraDatacenterPods` helper:

	pods := make([]corev1.Pod, 0)
	pods = append(pods, podList.Items...)

Why not simply return podList.Items? What's the point of making a new slice?

For this method, if there are no pods, why are we requeueing? Do we assume the pods will reappear?

Contributor Author:

I don't know why we re-create the slice; it makes sense not to. I pushed a commit fixing this.

Yes, we are requeueing because the assumption is that the pods will indeed reappear. In the initial ticket, they were replacing node pools, and the pods eventually came back. Checking precisely how this works might deserve at least a manual test, and perhaps a follow-up ticket.
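
For illustration, a minimal sketch of the simplified helper. The import list, parameter names, and label selector here are assumptions, not the exact committed code; the actual change is in the pkg/medusa/utils.go diff further down.

	// Sketch only: list the datacenter's pods and return the slice from the
	// PodList directly instead of copying it into a new slice first.
	package medusa

	import (
		"context"
		"errors"

		"github.com/go-logr/logr"
		corev1 "k8s.io/api/core/v1"
		"sigs.k8s.io/controller-runtime/pkg/client"

		cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
	)

	func GetCassandraDatacenterPods(ctx context.Context, cassdc *cassdcapi.CassandraDatacenter, r client.Reader, logger logr.Logger) ([]corev1.Pod, error) {
		podList := &corev1.PodList{}
		// Assumed label selector; the real helper may filter differently.
		labels := client.MatchingLabels{cassdcapi.DatacenterLabel: cassdc.DatacenterName()}
		if err := r.List(ctx, podList, labels, client.InNamespace(cassdc.Namespace)); err != nil {
			logger.Error(err, "failed to list datacenter pods")
			return nil, err
		}
		if podList.Items == nil {
			// Guard against a nil Items field rather than re-allocating a slice.
			return nil, errors.New("podList came with nil Items field")
		}
		return podList.Items, nil
	}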

logger.Info("No pods found for datacenter", "CassandraDatacenter", cassdcKey)
return ctrl.Result{Requeue: true}, nil
}

// If there is anything in progress, simply requeue the request until each pod has finished or errored
if len(backupJob.Status.InProgress) > 0 {
@@ -240,6 +245,11 @@ func (r *MedusaBackupJobReconciler) getBackupSummary(ctx context.Context, backup
return nil, err
} else {
for _, remoteBackup := range remoteBackups {
if remoteBackup == nil {
err := fmt.Errorf("backup %s summary is nil", backup.Name)
logger.Error(err, "remote backup is nil")
return nil, err
}
logger.Info("found backup", "CassandraPod", pod.Name, "Backup", remoteBackup.BackupName)
if backup.ObjectMeta.Name == remoteBackup.BackupName {
return remoteBackup, nil
@@ -248,7 +258,7 @@
}
}
}
return nil, nil
return nil, reconcile.TerminalError(fmt.Errorf("backup summary couldn't be found"))
}

func (r *MedusaBackupJobReconciler) createMedusaBackup(ctx context.Context, backup *medusav1alpha1.MedusaBackupJob, backupSummary *medusa.BackupSummary, logger logr.Logger) error {
@@ -257,7 +267,7 @@ func (r *MedusaBackupJobReconciler) createMedusaBackup(ctx context.Context, back
backupKey := types.NamespacedName{Namespace: backup.ObjectMeta.Namespace, Name: backup.Name}
backupResource := &medusav1alpha1.MedusaBackup{}
if err := r.Get(ctx, backupKey, backupResource); err != nil {
if errors.IsNotFound(err) {
if apiErrors.IsNotFound(err) {
// Backup doesn't exist, create it
startTime := backup.Status.StartTime
finishTime := metav1.Now()
@@ -331,7 +341,7 @@ func backupStatus(ctx context.Context, name string, pod *corev1.Pod, clientFacto
resp, err := medusaClient.BackupStatus(ctx, name)
if err != nil {
// the gRPC client does not return proper NotFound error, we need to check the payload too
if errors.IsNotFound(err) || strings.Contains(err.Error(), "NotFound") {
if apiErrors.IsNotFound(err) || strings.Contains(err.Error(), "NotFound") {
logger.Info(fmt.Sprintf("did not find backup %s for pod %s", name, pod.Name))
return medusa.StatusType_UNKNOWN, nil
}
170 changes: 117 additions & 53 deletions controllers/medusa/medusabackupjob_controller_test.go
@@ -7,6 +7,7 @@ import (
"strings"
"sync"
"testing"
"time"

cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
k8ss "github.com/k8ssandra/k8ssandra-operator/apis/k8ssandra/v1alpha1"
@@ -31,6 +32,8 @@
successfulBackupName = "good-backup"
failingBackupName = "bad-backup"
missingBackupName = "missing-backup"
backupWithNoPods = "backup-with-no-pods"
backupWithNilSummary = "backup-with-nil-summary"
dc1PodPrefix = "192.168.1."
dc2PodPrefix = "192.168.2."
fakeBackupFileCount = int64(13)
@@ -167,6 +170,13 @@ func testMedusaBackupDatacenter(t *testing.T, ctx context.Context, f *framework.
backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, missingBackupName)
require.False(backupCreated, "the backup object shouldn't have been created")

// in K8OP-294 we found out we can try to make backups on StatefulSets with no pods
backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, backupWithNoPods)
require.False(backupCreated, "the backup object shouldn't have been created")
// in that same effort, we also found out we can get nil instead of a backup summary
backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, backupWithNilSummary)
require.False(backupCreated, "the backup object shouldn't have been created")

err = f.DeleteK8ssandraCluster(ctx, client.ObjectKey{Namespace: kc.Namespace, Name: kc.Name}, timeout, interval)
require.NoError(err, "failed to delete K8ssandraCluster")
verifyObjectDoesNotExist(ctx, t, f, dc1Key, &cassdcapi.CassandraDatacenter{})
@@ -210,6 +220,17 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa

createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy)

// one test scenario needs to have no pods available in the STSs (see #1454)
// we reproduce that by deleting the pods. we need this for the medusa backup controller tests to work
// however, we need to bring them back up because medusa task controller tests interact with them later
// both backup and task controller tests use this function to verify backups
if backupName == backupWithNoPods {
deleteDatacenterPods(t, f, ctx, dcKey, dc)
deleteDatacenterPods(t, f, ctx, dcKeyCopy, dc)
defer createDatacenterPods(t, f, ctx, dcKey, dc)
defer createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy)
}

t.Log("creating MedusaBackupJob")
backupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName)
backup := &api.MedusaBackupJob{
@@ -225,39 +246,25 @@
err := f.Create(ctx, backupKey, backup)
require.NoError(err, "failed to create MedusaBackupJob")

t.Log("verify that the backups are started")
require.Eventually(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
t.Logf("failed to get MedusaBackupJob: %v", err)
return false
if backupName != backupWithNoPods {
t.Log("verify that the backups start eventually")
verifyBackupJobStarted(require.Eventually, t, dc, f, ctx, backupKey)
if backupName != backupWithNilSummary {
// backup that will receive a nil summary will never finish, can't assert on that
verifyBackupJobFinished(t, require, dc, f, ctx, backupKey, backupName)
}
return !updated.Status.StartTime.IsZero()
}, timeout, interval)

t.Log("verify the backup finished")
require.Eventually(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
return false
}
t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime)
t.Logf("backup %s failed: %v", backupName, updated.Status.Failed)
t.Logf("backup %s finished: %v", backupName, updated.Status.Finished)
t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress)
return !updated.Status.FinishTime.IsZero()
}, timeout, interval)
} else {
t.Log("verify that the backups never start")
verifyBackupJobStarted(require.Never, t, dc, f, ctx, backupKey)
}

t.Log("check for the MedusaBackup being created")
medusaBackupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName)
medusaBackup := &api.MedusaBackup{}
err = f.Get(ctx, medusaBackupKey, medusaBackup)
if err != nil {
if errors.IsNotFound(err) {
// exit the test if the backup was not created. this happens for some backup names on purpose
return false
}
}
@@ -274,6 +281,43 @@
return true
}

func verifyBackupJobFinished(t *testing.T, require *require.Assertions, dc *cassdcapi.CassandraDatacenter, f *framework.Framework, ctx context.Context, backupKey framework.ClusterKey, backupName string) {
t.Log("verify the backup finished")
require.Eventually(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
return false
}
t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime)
t.Logf("backup %s failed: %v", backupName, updated.Status.Failed)
t.Logf("backup %s finished: %v", backupName, updated.Status.Finished)
t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress)
return !updated.Status.FinishTime.IsZero()
}, timeout, interval)
}

func verifyBackupJobStarted(
verifyFunction func(condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}),
t *testing.T,
dc *cassdcapi.CassandraDatacenter,
f *framework.Framework,
ctx context.Context,
backupKey framework.ClusterKey,
) {
verifyFunction(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
t.Logf("failed to get MedusaBackupJob: %v", err)
return false
}
return !updated.Status.StartTime.IsZero()
}, timeout, interval)
}

func reconcileReplicatedSecret(ctx context.Context, t *testing.T, f *framework.Framework, kc *k8ss.K8ssandraCluster) {
t.Log("check ReplicatedSecret reconciled")

@@ -396,35 +440,40 @@ func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSumm
status = *medusa.StatusType_IN_PROGRESS.Enum()
}

backup := &medusa.BackupSummary{
BackupName: name,
StartTime: 0,
FinishTime: 10,
TotalNodes: 3,
FinishedNodes: 3,
TotalObjects: fakeBackupFileCount,
TotalSize: fakeBackupByteSize,
Status: status,
Nodes: []*medusa.BackupNode{
{
Host: "host1",
Tokens: []int64{1, 2, 3},
Datacenter: "dc1",
Rack: "rack1",
},
{
Host: "host2",
Tokens: []int64{1, 2, 3},
Datacenter: "dc1",
Rack: "rack1",
},
{
Host: "host3",
Tokens: []int64{1, 2, 3},
Datacenter: "dc1",
Rack: "rack1",
var backup *medusa.BackupSummary
if strings.HasPrefix(name, backupWithNilSummary) {
backup = nil
} else {
backup = &medusa.BackupSummary{
BackupName: name,
StartTime: 0,
FinishTime: 10,
TotalNodes: 3,
FinishedNodes: 3,
TotalObjects: fakeBackupFileCount,
TotalSize: fakeBackupByteSize,
Status: status,
Nodes: []*medusa.BackupNode{
{
Host: "host1",
Tokens: []int64{1, 2, 3},
Datacenter: "dc1",
Rack: "rack1",
},
{
Host: "host2",
Tokens: []int64{1, 2, 3},
Datacenter: "dc1",
Rack: "rack1",
},
{
Host: "host3",
Tokens: []int64{1, 2, 3},
Datacenter: "dc1",
Rack: "rack1",
},
},
},
}
}
backups = append(backups, backup)
}
@@ -488,6 +537,21 @@ func findDatacenterCondition(status *cassdcapi.CassandraDatacenterStatus, condTy
return nil
}

func deleteDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) {
for i := 0; i < int(dc.Spec.Size); i++ {

podName := fmt.Sprintf("%s-%s-%d", dc.Spec.ClusterName, dc.DatacenterName(), i)
podKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, podName)
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: podKey.Name, Namespace: podKey.Namespace},
}
err := f.Delete(ctx, podKey, pod)
if err != nil {
t.Logf("failed to delete pod %s: %v", podKey, err)
}
}
}

func createDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) {
_ = f.CreateNamespace(dcKey.Namespace)
for i := int32(0); i < dc.Spec.Size; i++ {
9 changes: 5 additions & 4 deletions pkg/medusa/utils.go
@@ -2,6 +2,7 @@ package medusa

import (
"context"
"errors"

"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
@@ -21,8 +22,8 @@ func GetCassandraDatacenterPods(ctx context.Context, cassdc *cassdcapi.Cassandra
return nil, err
}

pods := make([]corev1.Pod, 0)
pods = append(pods, podList.Items...)

return pods, nil
if podList.Items != nil {
return podList.Items, nil
}
return nil, errors.New("podList came with nil Items field")
}