From 3d41f93d3f8bbd84dc145600bd09cb59cfd667af Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 25 Sep 2023 17:10:47 +0800 Subject: [PATCH] ebs br: resume gc and scheduler when volume snapshots created (#5288) (#5298) Co-authored-by: WangLe1321 Co-authored-by: csuzhangxc --- cmd/backup-manager/app/backup/backup.go | 17 +++- cmd/backup-manager/app/restore/restore.go | 12 ++- cmd/backup-manager/app/util/util.go | 24 +++++ docs/api-references/docs.md | 24 +++++ images/tidb-backup-manager/entrypoint.sh | 15 ++- manifests/crd.yaml | 6 ++ manifests/crd/v1/pingcap.com_backups.yaml | 2 + .../crd/v1/pingcap.com_backupschedules.yaml | 4 + .../crd/v1beta1/pingcap.com_backups.yaml | 2 + .../v1beta1/pingcap.com_backupschedules.yaml | 4 + manifests/crd_v1beta1.yaml | 6 ++ pkg/apis/federation/pingcap/v1alpha1/types.go | 2 + .../pingcap/v1alpha1/volume_backup.go | 6 ++ pkg/apis/pingcap/v1alpha1/backup.go | 16 ++++ .../pingcap/v1alpha1/openapi_generated.go | 7 ++ pkg/apis/pingcap/v1alpha1/types.go | 9 +- pkg/backup/backup/backup_manager.go | 88 ++++++++++++++++- pkg/fedvolumebackup/backup/backup_manager.go | 96 ++++++++++++++++++- pkg/fedvolumebackup/backup/backup_test.go | 70 ++++++++++++++ 19 files changed, 400 insertions(+), 10 deletions(-) diff --git a/cmd/backup-manager/app/backup/backup.go b/cmd/backup-manager/app/backup/backup.go index 73f3be79aa..a149c32759 100644 --- a/cmd/backup-manager/app/backup/backup.go +++ b/cmd/backup-manager/app/backup/backup.go @@ -264,7 +264,7 @@ func (bo *Options) brCommandRunWithLogCallback(ctx context.Context, fullArgs []s } klog.Infof("Running br command with args: %v", fullArgs) bin := filepath.Join(util.BRBinPath, "br") - cmd := exec.CommandContext(ctx, bin, fullArgs...) + cmd := exec.Command(bin, fullArgs...) stdOut, err := cmd.StdoutPipe() if err != nil { @@ -278,7 +278,17 @@ func (bo *Options) brCommandRunWithLogCallback(ctx context.Context, fullArgs []s if err != nil { return fmt.Errorf("cluster %s, execute br command failed, args: %s, err: %v", bo, fullArgs, err) } + + // only the initialization command of volume snapshot backup use gracefully shutting down + // because it should resume gc and pd scheduler immediately + if bo.Mode == string(v1alpha1.BackupModeVolumeSnapshot) && bo.Initialize { + go backupUtil.GracefullyShutDownSubProcess(ctx, cmd) + } + var errMsg string + stdErrCh := make(chan []byte, 1) + go backupUtil.ReadAllStdErrToChannel(stdErr, stdErrCh) + reader := bufio.NewReader(stdOut) for { line, err := reader.ReadString('\n') @@ -291,10 +301,13 @@ func (bo *Options) brCommandRunWithLogCallback(ctx context.Context, fullArgs []s klog.Info(strings.Replace(line, "\n", "", -1)) if err != nil { + if err != io.EOF { + klog.Errorf("read stdout error: %s", err.Error()) + } break } } - tmpErr, _ := io.ReadAll(stdErr) + tmpErr := <-stdErrCh if len(tmpErr) > 0 { klog.Info(string(tmpErr)) errMsg += string(tmpErr) diff --git a/cmd/backup-manager/app/restore/restore.go b/cmd/backup-manager/app/restore/restore.go index 684e7d37c2..faeaed28a9 100644 --- a/cmd/backup-manager/app/restore/restore.go +++ b/cmd/backup-manager/app/restore/restore.go @@ -120,7 +120,7 @@ func (ro *Options) restoreData( fullArgs = append(fullArgs, args...) klog.Infof("Running br command with args: %v", fullArgs) bin := path.Join(util.BRBinPath, "br") - cmd := exec.CommandContext(ctx, bin, fullArgs...) + cmd := exec.Command(bin, fullArgs...) 
stdOut, err := cmd.StdoutPipe() if err != nil { @@ -151,6 +151,9 @@ func (ro *Options) restoreData( }() } + stdErrCh := make(chan []byte, 1) + go backupUtil.ReadAllStdErrToChannel(stdErr, stdErrCh) + var errMsg string reader := bufio.NewReader(stdOut) for { @@ -164,11 +167,14 @@ func (ro *Options) restoreData( ro.updateResolvedTSForCSB(line, restore, progressStep, statusUpdater) } klog.Info(strings.Replace(line, "\n", "", -1)) - if err != nil || io.EOF == err { + if err != nil { + if err != io.EOF { + klog.Errorf("read stdout error: %s", err.Error()) + } break } } - tmpErr, _ := io.ReadAll(stdErr) + tmpErr := <-stdErrCh if len(tmpErr) > 0 { klog.Info(string(tmpErr)) errMsg += string(tmpErr) diff --git a/cmd/backup-manager/app/util/util.go b/cmd/backup-manager/app/util/util.go index 60862f51ed..9ebc6ddc44 100644 --- a/cmd/backup-manager/app/util/util.go +++ b/cmd/backup-manager/app/util/util.go @@ -16,8 +16,10 @@ package util import ( "context" "fmt" + "io" "io/ioutil" "os" + "os/exec" "os/signal" "path" "path/filepath" @@ -510,6 +512,28 @@ func ParseRestoreProgress(line string) (step, progress string) { return } +// ReadAllStdErrToChannel read the stdErr and send the output to channel +func ReadAllStdErrToChannel(stdErr io.Reader, errMsgCh chan []byte) { + errMsg, err := io.ReadAll(stdErr) + if err != nil { + klog.Errorf("read stderr error: %s", err.Error()) + } + errMsgCh <- errMsg + close(errMsgCh) +} + +// GracefullyShutDownSubProcess just send SIGTERM to the process of cmd when context done +// the caller should wait the process of cmd to shut down +func GracefullyShutDownSubProcess(ctx context.Context, cmd *exec.Cmd) { + <-ctx.Done() + klog.Errorf("context done, err: %s. start to shut down sub process gracefully", ctx.Err().Error()) + if err := cmd.Process.Signal(syscall.SIGTERM); err != nil { + klog.Errorf("send SIGTERM to sub process error: %s", err.Error()) + } else { + klog.Infof("send SIGTERM to sub process successfully") + } +} + const ( e2eBackupEnv string = "E2E_TEST_ENV" e2eExtendBackupTime string = "Extend_BACKUP_TIME" diff --git a/docs/api-references/docs.md b/docs/api-references/docs.md index 1acd8e009e..c9a790fee3 100644 --- a/docs/api-references/docs.md +++ b/docs/api-references/docs.md @@ -296,6 +296,18 @@ FederalVolumeBackupPhase +resumeGcSchedule
+ +bool + + + +(Optional) +

ResumeGcSchedule indicates whether to resume gc and pd scheduler for EBS volume snapshot backup

+ + + + dumpling
@@ -4100,6 +4112,18 @@ FederalVolumeBackupPhase +resumeGcSchedule
+ +bool + + + +(Optional) +

ResumeGcSchedule indicates whether to resume gc and pd scheduler for EBS volume snapshot backup

+ + + + dumpling
diff --git a/images/tidb-backup-manager/entrypoint.sh b/images/tidb-backup-manager/entrypoint.sh index 7dfb41d26a..c4ba99967f 100755 --- a/images/tidb-backup-manager/entrypoint.sh +++ b/images/tidb-backup-manager/entrypoint.sh @@ -58,12 +58,25 @@ else EXEC_COMMAND="/usr/local/bin/shush exec --" fi +terminate_subprocesses() { + echo "get SIGTERM, send it to sub process $1" + kill -15 $1 # -15 is SIGTERM + wait $1 +} + # exec command case "$1" in backup) shift 1 echo "$BACKUP_BIN backup $@" - $EXEC_COMMAND $BACKUP_BIN backup "$@" + $EXEC_COMMAND $BACKUP_BIN backup "$@" & + + # save the PID of the sub process + pid=$! + # Trap the SIGTERM signal and forward it to the main process + trap 'terminate_subprocesses $pid' SIGTERM + # Wait for the sub process to complete + wait $pid ;; export) shift 1 diff --git a/manifests/crd.yaml b/manifests/crd.yaml index 08c8fa9114..fe2eeeb495 100644 --- a/manifests/crd.yaml +++ b/manifests/crd.yaml @@ -1338,6 +1338,8 @@ spec: x-kubernetes-int-or-string: true type: object type: object + resumeGcSchedule: + type: boolean s3: properties: acl: @@ -2860,6 +2862,8 @@ spec: x-kubernetes-int-or-string: true type: object type: object + resumeGcSchedule: + type: boolean s3: properties: acl: @@ -4186,6 +4190,8 @@ spec: x-kubernetes-int-or-string: true type: object type: object + resumeGcSchedule: + type: boolean s3: properties: acl: diff --git a/manifests/crd/v1/pingcap.com_backups.yaml b/manifests/crd/v1/pingcap.com_backups.yaml index d34037b439..07dd9c4c95 100644 --- a/manifests/crd/v1/pingcap.com_backups.yaml +++ b/manifests/crd/v1/pingcap.com_backups.yaml @@ -1338,6 +1338,8 @@ spec: x-kubernetes-int-or-string: true type: object type: object + resumeGcSchedule: + type: boolean s3: properties: acl: diff --git a/manifests/crd/v1/pingcap.com_backupschedules.yaml b/manifests/crd/v1/pingcap.com_backupschedules.yaml index c72bc2d0a9..f142847724 100644 --- a/manifests/crd/v1/pingcap.com_backupschedules.yaml +++ b/manifests/crd/v1/pingcap.com_backupschedules.yaml @@ -1313,6 +1313,8 @@ spec: x-kubernetes-int-or-string: true type: object type: object + resumeGcSchedule: + type: boolean s3: properties: acl: @@ -2639,6 +2641,8 @@ spec: x-kubernetes-int-or-string: true type: object type: object + resumeGcSchedule: + type: boolean s3: properties: acl: diff --git a/manifests/crd/v1beta1/pingcap.com_backups.yaml b/manifests/crd/v1beta1/pingcap.com_backups.yaml index 626dca67fe..e81191aafd 100644 --- a/manifests/crd/v1beta1/pingcap.com_backups.yaml +++ b/manifests/crd/v1beta1/pingcap.com_backups.yaml @@ -1332,6 +1332,8 @@ spec: x-kubernetes-int-or-string: true type: object type: object + resumeGcSchedule: + type: boolean s3: properties: acl: diff --git a/manifests/crd/v1beta1/pingcap.com_backupschedules.yaml b/manifests/crd/v1beta1/pingcap.com_backupschedules.yaml index 7c3ed7135d..2d5ff90ad7 100644 --- a/manifests/crd/v1beta1/pingcap.com_backupschedules.yaml +++ b/manifests/crd/v1beta1/pingcap.com_backupschedules.yaml @@ -1307,6 +1307,8 @@ spec: x-kubernetes-int-or-string: true type: object type: object + resumeGcSchedule: + type: boolean s3: properties: acl: @@ -2627,6 +2629,8 @@ spec: x-kubernetes-int-or-string: true type: object type: object + resumeGcSchedule: + type: boolean s3: properties: acl: diff --git a/manifests/crd_v1beta1.yaml b/manifests/crd_v1beta1.yaml index 2d293a62df..9cb5839b52 100644 --- a/manifests/crd_v1beta1.yaml +++ b/manifests/crd_v1beta1.yaml @@ -1332,6 +1332,8 @@ spec: x-kubernetes-int-or-string: true type: object type: object + resumeGcSchedule: + type: 
boolean s3: properties: acl: @@ -2850,6 +2852,8 @@ spec: x-kubernetes-int-or-string: true type: object type: object + resumeGcSchedule: + type: boolean s3: properties: acl: @@ -4170,6 +4174,8 @@ spec: x-kubernetes-int-or-string: true type: object type: object + resumeGcSchedule: + type: boolean s3: properties: acl: diff --git a/pkg/apis/federation/pingcap/v1alpha1/types.go b/pkg/apis/federation/pingcap/v1alpha1/types.go index 697aa2b8e4..fcc7e9e360 100644 --- a/pkg/apis/federation/pingcap/v1alpha1/types.go +++ b/pkg/apis/federation/pingcap/v1alpha1/types.go @@ -199,6 +199,8 @@ const ( VolumeBackupInvalid VolumeBackupConditionType = "Invalid" // VolumeBackupRunning means the VolumeBackup is running VolumeBackupRunning VolumeBackupConditionType = "Running" + // VolumeBackupSnapshotsCreated means the all the volume snapshots have created, and we have safely resumed GC and PD scheduler + VolumeBackupSnapshotsCreated VolumeBackupConditionType = "SnapshotsCreated" // VolumeBackupComplete means all the backups in data plane are complete and the VolumeBackup is complete VolumeBackupComplete VolumeBackupConditionType = "Complete" // VolumeBackupFailed means one of backup in data plane is failed and the VolumeBackup is failed diff --git a/pkg/apis/federation/pingcap/v1alpha1/volume_backup.go b/pkg/apis/federation/pingcap/v1alpha1/volume_backup.go index e8e26ced2d..f440dd024c 100644 --- a/pkg/apis/federation/pingcap/v1alpha1/volume_backup.go +++ b/pkg/apis/federation/pingcap/v1alpha1/volume_backup.go @@ -97,6 +97,12 @@ func IsVolumeBackupRunning(volumeBackup *VolumeBackup) bool { return condition != nil && condition.Status == corev1.ConditionTrue } +// IsVolumeBackupSnapshotsCreated returns true if VolumeBackup's snapshots are all created +func IsVolumeBackupSnapshotsCreated(volumeBackup *VolumeBackup) bool { + _, condition := GetVolumeBackupCondition(&volumeBackup.Status, VolumeBackupSnapshotsCreated) + return condition != nil && condition.Status == corev1.ConditionTrue +} + // IsVolumeBackupComplete returns true if VolumeBackup is complete func IsVolumeBackupComplete(volumeBackup *VolumeBackup) bool { _, condition := GetVolumeBackupCondition(&volumeBackup.Status, VolumeBackupComplete) diff --git a/pkg/apis/pingcap/v1alpha1/backup.go b/pkg/apis/pingcap/v1alpha1/backup.go index bff3c47185..a85e6be69a 100644 --- a/pkg/apis/pingcap/v1alpha1/backup.go +++ b/pkg/apis/pingcap/v1alpha1/backup.go @@ -243,6 +243,22 @@ func IsVolumeBackupInitializeFailed(backup *Backup) bool { return condition != nil && condition.Status == corev1.ConditionTrue } +func IsVolumeBackupSnapshotsCreated(backup *Backup) bool { + if backup.Spec.Mode != BackupModeVolumeSnapshot { + return false + } + _, condition := GetBackupCondition(&backup.Status, VolumeBackupSnapshotsCreated) + return condition != nil && condition.Status == corev1.ConditionTrue +} + +func IsVolumeBackupInitializeComplete(backup *Backup) bool { + if backup.Spec.Mode != BackupModeVolumeSnapshot { + return false + } + _, condition := GetBackupCondition(&backup.Status, VolumeBackupInitializeComplete) + return condition != nil && condition.Status == corev1.ConditionTrue +} + // IsVolumeBackupComplete returns true if volume backup is complete func IsVolumeBackupComplete(backup *Backup) bool { if backup.Spec.Mode != BackupModeVolumeSnapshot { diff --git a/pkg/apis/pingcap/v1alpha1/openapi_generated.go b/pkg/apis/pingcap/v1alpha1/openapi_generated.go index 5ba67ea233..8315267753 100644 --- a/pkg/apis/pingcap/v1alpha1/openapi_generated.go +++ 
b/pkg/apis/pingcap/v1alpha1/openapi_generated.go @@ -1082,6 +1082,13 @@ func schema_pkg_apis_pingcap_v1alpha1_BackupSpec(ref common.ReferenceCallback) c Format: "", }, }, + "resumeGcSchedule": { + SchemaProps: spec.SchemaProps{ + Description: "ResumeGcSchedule indicates whether resume gc and pd scheduler for EBS volume snapshot backup", + Type: []string{"boolean"}, + Format: "", + }, + }, "dumpling": { SchemaProps: spec.SchemaProps{ Description: "DumplingConfig is the configs for dumpling", diff --git a/pkg/apis/pingcap/v1alpha1/types.go b/pkg/apis/pingcap/v1alpha1/types.go index f57eee77f5..f876c007e9 100644 --- a/pkg/apis/pingcap/v1alpha1/types.go +++ b/pkg/apis/pingcap/v1alpha1/types.go @@ -1966,6 +1966,9 @@ type BackupSpec struct { // FederalVolumeBackupPhase indicates which phase to execute in federal volume backup // +optional FederalVolumeBackupPhase FederalVolumeBackupPhase `json:"federalVolumeBackupPhase,omitempty"` + // ResumeGcSchedule indicates whether resume gc and pd scheduler for EBS volume snapshot backup + // +optional + ResumeGcSchedule bool `json:"resumeGcSchedule,omitempty"` // DumplingConfig is the configs for dumpling Dumpling *DumplingConfig `json:"dumpling,omitempty"` // Base tolerations of backup Pods, components may add more tolerations upon this respectively @@ -2122,10 +2125,14 @@ const ( BackupStopped BackupConditionType = "Stopped" // BackupRestart means the backup was restarted, now just support snapshot backup BackupRestart BackupConditionType = "Restart" - // VolumeBackupInitialized means the volume backup has stopped GC and PD schedule + // VolumeBackupInitialized means the volume backup has stopped GC and PD scheduler VolumeBackupInitialized BackupConditionType = "VolumeBackupInitialized" // VolumeBackupInitializeFailed means the volume backup initialize job failed VolumeBackupInitializeFailed BackupConditionType = "VolumeBackupInitializeFailed" + // VolumeBackupSnapshotsCreated means the local volume snapshots created, and they won't be changed + VolumeBackupSnapshotsCreated BackupConditionType = "VolumeBackupSnapshotsCreated" + // VolumeBackupInitializeComplete means the volume backup has safely resumed GC and PD scheduler + VolumeBackupInitializeComplete BackupConditionType = "VolumeBackupInitializeComplete" // VolumeBackupComplete means the volume backup has taken volume snapshots successfully VolumeBackupComplete BackupConditionType = "VolumeBackupComplete" // VolumeBackupFailed means the volume backup take volume snapshots failed diff --git a/pkg/backup/backup/backup_manager.go b/pkg/backup/backup/backup_manager.go index 13444a8c2e..30eb60ea49 100644 --- a/pkg/backup/backup/backup_manager.go +++ b/pkg/backup/backup/backup_manager.go @@ -103,11 +103,17 @@ func (bm *backupManager) syncBackupJob(backup *v1alpha1.Backup) error { if err := bm.deleteVolumeBackupInitializeJob(backup); err != nil { return err } - } else if err = bm.checkVolumeBackupInitializeJobExisted(backup); err != nil { + } else if err := bm.checkVolumeBackupInitializeJobExisted(backup); err != nil { // check volume backup initialize job, we should ensure the job is existed during volume backup klog.Errorf("backup %s/%s check volume backup initialize job error %v.", ns, name, err) return err } + if backup.Spec.ResumeGcSchedule { + if err := bm.resumeGCScheduleForVolumeBackup(backup); err != nil { + klog.Errorf("backup %s/%s resume gc and pd scheduler error %v.", ns, name, err) + return err + } + } } if v1alpha1.IsBackupComplete(backup) || v1alpha1.IsBackupFailed(backup) { @@ -120,6 
+126,11 @@ func (bm *backupManager) syncBackupJob(backup *v1alpha1.Backup) error { return err } + if err = bm.checkVolumeBackupSnapshotsCreated(backup); err != nil { + klog.Errorf("backup %s/%s check snapshots created error %v.", ns, name, err) + return err + } + // skip backup skip := false if skip, err = bm.skipBackupSync(backup); err != nil { @@ -227,6 +238,9 @@ func (bm *backupManager) checkVolumeBackupInitializeJobExisted(backup *v1alpha1. if backup.Spec.FederalVolumeBackupPhase == v1alpha1.FederalVolumeBackupTeardown { return nil } + if v1alpha1.IsVolumeBackupSnapshotsCreated(backup) { + return nil + } if !v1alpha1.IsVolumeBackupInitialized(backup) || v1alpha1.IsVolumeBackupInitializeFailed(backup) { return nil } @@ -253,6 +267,75 @@ func (bm *backupManager) checkVolumeBackupInitializeJobExisted(backup *v1alpha1. return nil } +func (bm *backupManager) resumeGCScheduleForVolumeBackup(b *v1alpha1.Backup) error { + if b.Spec.Mode != v1alpha1.BackupModeVolumeSnapshot || !b.Spec.ResumeGcSchedule { + return nil + } + if v1alpha1.IsVolumeBackupInitializeFailed(b) { + return nil + } + if v1alpha1.IsVolumeBackupInitializeComplete(b) { + return nil + } + + ns := b.GetNamespace() + name := b.GetName() + if !v1alpha1.IsVolumeBackupSnapshotsCreated(b) { + klog.Infof("backup %s/%s volume snapshots not created, can't resume gc and pd scheduler", ns, name) + return nil + } + if err := bm.deleteVolumeBackupInitializeJob(b); err != nil { + return fmt.Errorf("delete initialize job failed, err: %s", err.Error()) + } + klog.Infof("backup %s/%s resumed GC and PD scheduler successfully", ns, name) + err := bm.statusUpdater.Update(b, &v1alpha1.BackupCondition{ + Type: v1alpha1.VolumeBackupInitializeComplete, + Status: corev1.ConditionTrue, + }, nil) + if err != nil { + return fmt.Errorf("update status to VolumeBackupInitializeComplete error: %s", err.Error()) + } + return nil +} + +func (bm *backupManager) checkVolumeBackupSnapshotsCreated(b *v1alpha1.Backup) error { + if b.Spec.Mode != v1alpha1.BackupModeVolumeSnapshot || b.Spec.FederalVolumeBackupPhase != v1alpha1.FederalVolumeBackupExecute { + return nil + } + if v1alpha1.IsVolumeBackupInitializeFailed(b) { + return nil + } + if v1alpha1.IsVolumeBackupSnapshotsCreated(b) { + return nil + } + + ns := b.GetNamespace() + name := b.GetName() + cred := backuputil.GetStorageCredential(b.Namespace, b.Spec.StorageProvider, bm.deps.SecretLister) + externalStorage, err := backuputil.NewStorageBackend(b.Spec.StorageProvider, cred) + if err != nil { + return fmt.Errorf("create storage backend error: %s", err.Error()) + } + + ctx := context.Background() + existed, err := externalStorage.Exists(ctx, constants.MetaFile) + if err != nil { + return fmt.Errorf("check backupmeta existed error: %s", err.Error()) + } + if !existed { + return nil + } + klog.Infof("backup %s/%s volume snapshots have created", ns, name) + err = bm.statusUpdater.Update(b, &v1alpha1.BackupCondition{ + Type: v1alpha1.VolumeBackupSnapshotsCreated, + Status: corev1.ConditionTrue, + }, nil) + if err != nil { + return fmt.Errorf("update status to VolumeBackupSnapshotsCreated error: %s", err.Error()) + } + return nil +} + // skipBackupSync skip backup sync, if return true, backup can be skiped directly. 
func (bm *backupManager) skipBackupSync(backup *v1alpha1.Backup) (bool, error) { if backup.Spec.Mode == v1alpha1.BackupModeLog { @@ -1055,6 +1138,9 @@ func (bm *backupManager) teardownVolumeBackup(backup *v1alpha1.Backup) (err erro }, nil) }() + if v1alpha1.IsVolumeBackupInitializeComplete(backup) { + return nil + } backupInitializeJob, err := bm.deps.JobLister.Jobs(ns).Get(backupInitializeJobName) if err != nil { if errors.IsNotFound(err) { diff --git a/pkg/fedvolumebackup/backup/backup_manager.go b/pkg/fedvolumebackup/backup/backup_manager.go index a03d030a06..743908684e 100644 --- a/pkg/fedvolumebackup/backup/backup_manager.go +++ b/pkg/fedvolumebackup/backup/backup_manager.go @@ -75,6 +75,7 @@ func (bm *backupManager) Sync(volumeBackup *v1alpha1.VolumeBackup) error { if _, ok := err.(*fedvolumebackup.BRDataPlaneFailedError); !ok { return err } else { + // volume backup failed, can't return error directly. need to enter teardown phase klog.Errorf("VolumeBackup %s/%s data plane backup failed when sync backup, will teardown all the backups. err: %s", ns, name, err.Error()) } } else if !backupFinished { @@ -136,7 +137,23 @@ func (bm *backupManager) runBackup(ctx context.Context, volumeBackup *v1alpha1.V if newMemberCreatedOrUpdated { return false, nil } - if err := bm.waitVolumeSnapshotsComplete(ctx, volumeBackup, backupMembers); err != nil { + + if err := bm.waitVolumeSnapshotsCreated(backupMembers); err != nil { + return false, err + } + memberUpdated, err := bm.resumeGCSchedule(ctx, volumeBackup, backupMembers) + if err != nil { + return false, err + } + if memberUpdated { + return false, nil + } + + if err := bm.waitBackupMemberInitializeComplete(volumeBackup, backupMembers); err != nil { + return false, err + } + + if err := bm.waitVolumeSnapshotsComplete(backupMembers); err != nil { return false, err } return true, nil @@ -277,7 +294,75 @@ func (bm *backupManager) executeVolumeBackup(ctx context.Context, volumeBackup * return } -func (bm *backupManager) waitVolumeSnapshotsComplete(ctx context.Context, volumeBackup *v1alpha1.VolumeBackup, backupMembers []*volumeBackupMember) error { +func (bm *backupManager) resumeGCSchedule(ctx context.Context, volumeBackup *v1alpha1.VolumeBackup, backupMembers []*volumeBackupMember) (memberUpdated bool, err error) { + var initializedBackupMember *volumeBackupMember + for _, backupMember := range backupMembers { + if pingcapv1alpha1.IsVolumeBackupInitialized(backupMember.backup) { + initializedBackupMember = backupMember + break + } + } + + if initializedBackupMember == nil { + return false, controller.RequeueErrorf("not found initialized member") + } + if initializedBackupMember.backup.Spec.ResumeGcSchedule { + return false, nil + } + + backupCR := initializedBackupMember.backup.DeepCopy() + backupCR.Spec.ResumeGcSchedule = true + kubeClient := bm.deps.FedClientset[initializedBackupMember.k8sClusterName] + if _, err := kubeClient.PingcapV1alpha1().Backups(backupCR.Namespace).Update(ctx, backupCR, metav1.UpdateOptions{}); err != nil { + return false, controller.RequeueErrorf( + "update backup member %s of cluster %s to resume gc and pd scheduler error: %s", + backupCR.Name, initializedBackupMember.k8sClusterName, err.Error()) + } + klog.Infof("VolumeBackup %s/%s update backup member %s to resume gc and pd scheduler", volumeBackup.Namespace, volumeBackup.Name, backupCR.Name) + return true, nil +} + +func (bm *backupManager) waitVolumeSnapshotsCreated(backupMembers []*volumeBackupMember) error { + for _, backupMember := range backupMembers { + if 
pingcapv1alpha1.IsVolumeBackupInitializeFailed(backupMember.backup) || + pingcapv1alpha1.IsVolumeBackupFailed(backupMember.backup) || + pingcapv1alpha1.IsBackupFailed(backupMember.backup) { + errMsg := fmt.Sprintf("backup member %s of cluster %s failed", backupMember.backup.Name, backupMember.k8sClusterName) + return &fedvolumebackup.BRDataPlaneFailedError{ + Reason: reasonVolumeBackupMemberFailed, + Message: errMsg, + } + } + if !pingcapv1alpha1.IsVolumeBackupSnapshotsCreated(backupMember.backup) { + return controller.IgnoreErrorf("backup member %s of cluster %s is not volume snapshots created", backupMember.backup.Name, backupMember.k8sClusterName) + } + } + return nil +} + +func (bm *backupManager) waitBackupMemberInitializeComplete(volumeBackup *v1alpha1.VolumeBackup, backupMembers []*volumeBackupMember) error { + for _, backupMember := range backupMembers { + if pingcapv1alpha1.IsVolumeBackupInitializeFailed(backupMember.backup) || + pingcapv1alpha1.IsVolumeBackupFailed(backupMember.backup) || + pingcapv1alpha1.IsBackupFailed(backupMember.backup) { + errMsg := fmt.Sprintf("backup member %s of cluster %s failed", backupMember.backup.Name, backupMember.k8sClusterName) + return &fedvolumebackup.BRDataPlaneFailedError{ + Reason: reasonVolumeBackupMemberFailed, + Message: errMsg, + } + } + + if pingcapv1alpha1.IsVolumeBackupInitializeComplete(backupMember.backup) { + if !v1alpha1.IsVolumeBackupSnapshotsCreated(volumeBackup) { + bm.setVolumeBackupSnapshotCreated(&volumeBackup.Status) + } + return nil + } + } + return controller.IgnoreErrorf("not found backup member with status VolumeBackupInitializeComplete") +} + +func (bm *backupManager) waitVolumeSnapshotsComplete(backupMembers []*volumeBackupMember) error { for _, backupMember := range backupMembers { if pingcapv1alpha1.IsVolumeBackupInitializeFailed(backupMember.backup) || pingcapv1alpha1.IsVolumeBackupFailed(backupMember.backup) || @@ -330,6 +415,13 @@ func (bm *backupManager) waitVolumeBackupComplete(ctx context.Context, volumeBac return bm.setVolumeBackupComplete(&volumeBackup.Status, backupMembers) } +func (bm *backupManager) setVolumeBackupSnapshotCreated(volumeBackupStatus *v1alpha1.VolumeBackupStatus) { + v1alpha1.UpdateVolumeBackupCondition(volumeBackupStatus, &v1alpha1.VolumeBackupCondition{ + Type: v1alpha1.VolumeBackupSnapshotsCreated, + Status: corev1.ConditionTrue, + }) +} + func (bm *backupManager) setVolumeBackupComplete(volumeBackupStatus *v1alpha1.VolumeBackupStatus, backupMembers []*volumeBackupMember) error { volumeBackupStatus.TimeCompleted = metav1.Now() volumeBackupStatus.TimeTaken = volumeBackupStatus.TimeCompleted.Sub(volumeBackupStatus.TimeStarted.Time).Round(time.Second).String() diff --git a/pkg/fedvolumebackup/backup/backup_test.go b/pkg/fedvolumebackup/backup/backup_test.go index bccce4605c..037cd76652 100644 --- a/pkg/fedvolumebackup/backup/backup_test.go +++ b/pkg/fedvolumebackup/backup/backup_test.go @@ -104,6 +104,16 @@ func (h *helper) assertRunExecute(ctx context.Context, volumeBackup *v1alpha1.Vo h.g.Expect(backupMember3.Spec.FederalVolumeBackupPhase).To(gomega.Equal(pingcapv1alpha1.FederalVolumeBackupExecute)) } +func (h *helper) assertRunResumeGcSchedule(ctx context.Context) { + backupMember1, err := h.dataPlaneClient1.PingcapV1alpha1().Backups(fakeTcNamespace1).Get(ctx, h.backupMemberName1, metav1.GetOptions{}) + h.g.Expect(err).To(gomega.BeNil()) + h.g.Expect(backupMember1.Spec.ResumeGcSchedule).To(gomega.Equal(true)) +} + +func (h *helper) assertControlPlaneSnapshotsCreated(volumeBackup 
*v1alpha1.VolumeBackup) { + h.g.Expect(v1alpha1.IsVolumeBackupSnapshotsCreated(volumeBackup)).To(gomega.BeTrue()) +} + func (h *helper) assertRunTeardown(ctx context.Context, volumeBackup *v1alpha1.VolumeBackup, initializeFailed bool) { backupMember1, err := h.dataPlaneClient1.PingcapV1alpha1().Backups(fakeTcNamespace1).Get(ctx, h.backupMemberName1, metav1.GetOptions{}) h.g.Expect(err).To(gomega.BeNil()) @@ -146,6 +156,46 @@ func (h *helper) setDataPlaneInitialized(ctx context.Context) { h.g.Expect(err).To(gomega.BeNil()) } +func (h *helper) setDataPlaneSnapshotCreated(ctx context.Context) { + backupMember1, err := h.dataPlaneClient1.PingcapV1alpha1().Backups(fakeTcNamespace1).Get(ctx, h.backupMemberName1, metav1.GetOptions{}) + h.g.Expect(err).To(gomega.BeNil()) + pingcapv1alpha1.UpdateBackupCondition(&backupMember1.Status, &pingcapv1alpha1.BackupCondition{ + Status: corev1.ConditionTrue, + Type: pingcapv1alpha1.VolumeBackupSnapshotsCreated, + }) + _, err = h.dataPlaneClient1.PingcapV1alpha1().Backups(fakeTcNamespace1).UpdateStatus(ctx, backupMember1, metav1.UpdateOptions{}) + h.g.Expect(err).To(gomega.BeNil()) + + backupMember2, err := h.dataPlaneClient2.PingcapV1alpha1().Backups(fakeTcNamespace2).Get(ctx, h.backupMemberName2, metav1.GetOptions{}) + h.g.Expect(err).To(gomega.BeNil()) + pingcapv1alpha1.UpdateBackupCondition(&backupMember2.Status, &pingcapv1alpha1.BackupCondition{ + Status: corev1.ConditionTrue, + Type: pingcapv1alpha1.VolumeBackupSnapshotsCreated, + }) + _, err = h.dataPlaneClient2.PingcapV1alpha1().Backups(fakeTcNamespace2).UpdateStatus(ctx, backupMember2, metav1.UpdateOptions{}) + h.g.Expect(err).To(gomega.BeNil()) + + backupMember3, err := h.dataPlaneClient3.PingcapV1alpha1().Backups(fakeTcNamespace3).Get(ctx, h.backupMemberName3, metav1.GetOptions{}) + h.g.Expect(err).To(gomega.BeNil()) + pingcapv1alpha1.UpdateBackupCondition(&backupMember3.Status, &pingcapv1alpha1.BackupCondition{ + Status: corev1.ConditionTrue, + Type: pingcapv1alpha1.VolumeBackupSnapshotsCreated, + }) + _, err = h.dataPlaneClient3.PingcapV1alpha1().Backups(fakeTcNamespace3).UpdateStatus(ctx, backupMember3, metav1.UpdateOptions{}) + h.g.Expect(err).To(gomega.BeNil()) +} + +func (h *helper) setDataPlaneInitializeComplete(ctx context.Context) { + backupMember1, err := h.dataPlaneClient1.PingcapV1alpha1().Backups(fakeTcNamespace1).Get(ctx, h.backupMemberName1, metav1.GetOptions{}) + h.g.Expect(err).To(gomega.BeNil()) + pingcapv1alpha1.UpdateBackupCondition(&backupMember1.Status, &pingcapv1alpha1.BackupCondition{ + Status: corev1.ConditionTrue, + Type: pingcapv1alpha1.VolumeBackupInitializeComplete, + }) + _, err = h.dataPlaneClient1.PingcapV1alpha1().Backups(fakeTcNamespace1).UpdateStatus(ctx, backupMember1, metav1.UpdateOptions{}) + h.g.Expect(err).To(gomega.BeNil()) +} + func (h *helper) setDataPlaneVolumeComplete(ctx context.Context) { backupMember1, err := h.dataPlaneClient1.PingcapV1alpha1().Backups(fakeTcNamespace1).Get(ctx, h.backupMemberName1, metav1.GetOptions{}) h.g.Expect(err).To(gomega.BeNil()) @@ -264,6 +314,26 @@ func TestVolumeBackup(t *testing.T) { h.g.Expect(err).To(gomega.BeNil()) h.assertRunExecute(ctx, volumeBackup) + // wait snapshots created + err = h.bm.Sync(volumeBackup) + h.g.Expect(err).To(gomega.HaveOccurred()) + + // snapshots created, resume gc and scheduler + h.setDataPlaneSnapshotCreated(ctx) + err = h.bm.Sync(volumeBackup) + h.g.Expect(err).To(gomega.BeNil()) + h.assertRunResumeGcSchedule(ctx) + + // wait initialize complete + err = h.bm.Sync(volumeBackup) + 
h.g.Expect(err).To(gomega.HaveOccurred()) + + // initialize complete + h.setDataPlaneInitializeComplete(ctx) + err = h.bm.Sync(volumeBackup) + h.g.Expect(err).To(gomega.HaveOccurred()) + h.assertControlPlaneSnapshotsCreated(volumeBackup) + // wait volume complete err = h.bm.Sync(volumeBackup) h.g.Expect(err).To(gomega.HaveOccurred())
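
For reference, below is a minimal standalone sketch of the subprocess-handling pattern this patch introduces in cmd/backup-manager: the br process is started with exec.Command rather than exec.CommandContext, a goroutine forwards SIGTERM once the context is done (mirroring backupUtil.GracefullyShutDownSubProcess), and the caller waits for the child to exit so it can resume GC and the PD scheduler before shutting down. The `sleep 60` child, the 5-second timeout, and the helper name are placeholders for illustration and are not taken from this change.

// graceful_shutdown_sketch.go - illustrative only, not part of the patch.
package main

import (
	"context"
	"log"
	"os/exec"
	"syscall"
	"time"
)

// gracefullyShutDown blocks until ctx is done, then sends SIGTERM so the child
// can clean up (e.g. resume GC and the PD scheduler) instead of being killed.
func gracefullyShutDown(ctx context.Context, cmd *exec.Cmd) {
	<-ctx.Done()
	if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
		log.Printf("send SIGTERM to sub process error: %s", err)
		return
	}
	log.Print("sent SIGTERM to sub process")
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	// exec.Command (not exec.CommandContext) so a cancelled context does not
	// SIGKILL the child; termination is delegated to gracefullyShutDown.
	cmd := exec.Command("sleep", "60") // placeholder subprocess
	if err := cmd.Start(); err != nil {
		log.Fatalf("start sub process error: %s", err)
	}
	go gracefullyShutDown(ctx, cmd)

	// The caller waits for the child to exit on its own or after SIGTERM.
	if err := cmd.Wait(); err != nil {
		log.Printf("sub process exited with error: %s", err)
	}
}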