Skip to content

Commit

Permalink
ebs br: resume gc and scheduler when volume snapshots created (#5288) (
Browse files Browse the repository at this point in the history
…#5298)

Co-authored-by: WangLe1321 <[email protected]>
Co-authored-by: csuzhangxc <[email protected]>
  • Loading branch information
3 people authored Sep 25, 2023
1 parent 6c6c99c commit 3d41f93
Show file tree
Hide file tree
Showing 19 changed files with 400 additions and 10 deletions.
17 changes: 15 additions & 2 deletions cmd/backup-manager/app/backup/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ func (bo *Options) brCommandRunWithLogCallback(ctx context.Context, fullArgs []s
}
klog.Infof("Running br command with args: %v", fullArgs)
bin := filepath.Join(util.BRBinPath, "br")
cmd := exec.CommandContext(ctx, bin, fullArgs...)
cmd := exec.Command(bin, fullArgs...)

stdOut, err := cmd.StdoutPipe()
if err != nil {
Expand All @@ -278,7 +278,17 @@ func (bo *Options) brCommandRunWithLogCallback(ctx context.Context, fullArgs []s
if err != nil {
return fmt.Errorf("cluster %s, execute br command failed, args: %s, err: %v", bo, fullArgs, err)
}

// only the initialization command of volume snapshot backup is shut down gracefully,
// because it must resume GC and PD scheduler immediately
if bo.Mode == string(v1alpha1.BackupModeVolumeSnapshot) && bo.Initialize {
go backupUtil.GracefullyShutDownSubProcess(ctx, cmd)
}

var errMsg string
stdErrCh := make(chan []byte, 1)
go backupUtil.ReadAllStdErrToChannel(stdErr, stdErrCh)

reader := bufio.NewReader(stdOut)
for {
line, err := reader.ReadString('\n')
Expand All @@ -291,10 +301,13 @@ func (bo *Options) brCommandRunWithLogCallback(ctx context.Context, fullArgs []s

klog.Info(strings.Replace(line, "\n", "", -1))
if err != nil {
if err != io.EOF {
klog.Errorf("read stdout error: %s", err.Error())
}
break
}
}
tmpErr, _ := io.ReadAll(stdErr)
tmpErr := <-stdErrCh
if len(tmpErr) > 0 {
klog.Info(string(tmpErr))
errMsg += string(tmpErr)
Expand Down
12 changes: 9 additions & 3 deletions cmd/backup-manager/app/restore/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ func (ro *Options) restoreData(
fullArgs = append(fullArgs, args...)
klog.Infof("Running br command with args: %v", fullArgs)
bin := path.Join(util.BRBinPath, "br")
cmd := exec.CommandContext(ctx, bin, fullArgs...)
cmd := exec.Command(bin, fullArgs...)

stdOut, err := cmd.StdoutPipe()
if err != nil {
Expand Down Expand Up @@ -151,6 +151,9 @@ func (ro *Options) restoreData(
}()
}

stdErrCh := make(chan []byte, 1)
go backupUtil.ReadAllStdErrToChannel(stdErr, stdErrCh)

var errMsg string
reader := bufio.NewReader(stdOut)
for {
Expand All @@ -164,11 +167,14 @@ func (ro *Options) restoreData(
ro.updateResolvedTSForCSB(line, restore, progressStep, statusUpdater)
}
klog.Info(strings.Replace(line, "\n", "", -1))
if err != nil || io.EOF == err {
if err != nil {
if err != io.EOF {
klog.Errorf("read stdout error: %s", err.Error())
}
break
}
}
tmpErr, _ := io.ReadAll(stdErr)
tmpErr := <-stdErrCh
if len(tmpErr) > 0 {
klog.Info(string(tmpErr))
errMsg += string(tmpErr)
Expand Down
24 changes: 24 additions & 0 deletions cmd/backup-manager/app/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ package util
import (
"context"
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
"os/signal"
"path"
"path/filepath"
Expand Down Expand Up @@ -510,6 +512,28 @@ func ParseRestoreProgress(line string) (step, progress string) {
return
}

// ReadAllStdErrToChannel reads stdErr until EOF (or a read error) and delivers
// everything that was read as a single message on errMsgCh, then closes the
// channel. A read error is only logged; whatever bytes were read before the
// error are still sent.
func ReadAllStdErrToChannel(stdErr io.Reader, errMsgCh chan []byte) {
	// Deferred so the close always happens after the single send below.
	defer close(errMsgCh)

	output, readErr := io.ReadAll(stdErr)
	if readErr != nil {
		klog.Errorf("read stderr error: %s", readErr.Error())
	}
	errMsgCh <- output
}

// GracefullyShutDownSubProcess blocks until ctx is done and then sends SIGTERM
// to the process of cmd, asking it to shut down gracefully. It only delivers the
// signal and does not wait for the process to exit; the caller should wait for
// the process of cmd to shut down.
//
// It is meant to be called after cmd has been started. If cmd is nil or has not
// been started (cmd.Process is nil) there is nothing to signal, so this is
// logged and the function returns instead of panicking.
func GracefullyShutDownSubProcess(ctx context.Context, cmd *exec.Cmd) {
	<-ctx.Done()
	klog.Errorf("context done, err: %s. start to shut down sub process gracefully", ctx.Err().Error())

	// exec.Cmd.Process is only populated by a successful Start.
	if cmd == nil || cmd.Process == nil {
		klog.Errorf("sub process is not started, skip sending SIGTERM")
		return
	}

	if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
		klog.Errorf("send SIGTERM to sub process error: %s", err.Error())
		return
	}
	klog.Infof("send SIGTERM to sub process successfully")
}

const (
e2eBackupEnv string = "E2E_TEST_ENV"
e2eExtendBackupTime string = "Extend_BACKUP_TIME"
Expand Down
24 changes: 24 additions & 0 deletions docs/api-references/docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,18 @@ FederalVolumeBackupPhase
</tr>
<tr>
<td>
<code>resumeGcSchedule</code></br>
<em>
bool
</em>
</td>
<td>
<em>(Optional)</em>
<p>ResumeGcSchedule indicates whether to resume GC and PD scheduler for EBS volume snapshot backup</p>
</td>
</tr>
<tr>
<td>
<code>dumpling</code></br>
<em>
<a href="#dumplingconfig">
Expand Down Expand Up @@ -4100,6 +4112,18 @@ FederalVolumeBackupPhase
</tr>
<tr>
<td>
<code>resumeGcSchedule</code></br>
<em>
bool
</em>
</td>
<td>
<em>(Optional)</em>
<p>ResumeGcSchedule indicates whether to resume GC and PD scheduler for EBS volume snapshot backup</p>
</td>
</tr>
<tr>
<td>
<code>dumpling</code></br>
<em>
<a href="#dumplingconfig">
Expand Down
15 changes: 14 additions & 1 deletion images/tidb-backup-manager/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,25 @@ else
EXEC_COMMAND="/usr/local/bin/shush exec --"
fi

# terminate_subprocesses forwards SIGTERM to the sub process whose PID is
# passed as $1 and then waits for that process to exit.
terminate_subprocesses() {
    echo "get SIGTERM, send it to sub process $1"
    # Quote the PID so an empty or malformed value is never word-split or
    # globbed, and so `wait` never falls back to waiting for every child.
    kill -TERM "$1"
    wait "$1"
}

# exec command
case "$1" in
backup)
shift 1
echo "$BACKUP_BIN backup $@"
$EXEC_COMMAND $BACKUP_BIN backup "$@"
$EXEC_COMMAND $BACKUP_BIN backup "$@" &

# save the PID of the sub process
pid=$!
# Trap the SIGTERM signal and forward it to the main process
trap 'terminate_subprocesses $pid' SIGTERM
# Wait for the sub process to complete
wait $pid
;;
export)
shift 1
Expand Down
6 changes: 6 additions & 0 deletions manifests/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1338,6 +1338,8 @@ spec:
x-kubernetes-int-or-string: true
type: object
type: object
resumeGcSchedule:
type: boolean
s3:
properties:
acl:
Expand Down Expand Up @@ -2860,6 +2862,8 @@ spec:
x-kubernetes-int-or-string: true
type: object
type: object
resumeGcSchedule:
type: boolean
s3:
properties:
acl:
Expand Down Expand Up @@ -4186,6 +4190,8 @@ spec:
x-kubernetes-int-or-string: true
type: object
type: object
resumeGcSchedule:
type: boolean
s3:
properties:
acl:
Expand Down
2 changes: 2 additions & 0 deletions manifests/crd/v1/pingcap.com_backups.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1338,6 +1338,8 @@ spec:
x-kubernetes-int-or-string: true
type: object
type: object
resumeGcSchedule:
type: boolean
s3:
properties:
acl:
Expand Down
4 changes: 4 additions & 0 deletions manifests/crd/v1/pingcap.com_backupschedules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1313,6 +1313,8 @@ spec:
x-kubernetes-int-or-string: true
type: object
type: object
resumeGcSchedule:
type: boolean
s3:
properties:
acl:
Expand Down Expand Up @@ -2639,6 +2641,8 @@ spec:
x-kubernetes-int-or-string: true
type: object
type: object
resumeGcSchedule:
type: boolean
s3:
properties:
acl:
Expand Down
2 changes: 2 additions & 0 deletions manifests/crd/v1beta1/pingcap.com_backups.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1332,6 +1332,8 @@ spec:
x-kubernetes-int-or-string: true
type: object
type: object
resumeGcSchedule:
type: boolean
s3:
properties:
acl:
Expand Down
4 changes: 4 additions & 0 deletions manifests/crd/v1beta1/pingcap.com_backupschedules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1307,6 +1307,8 @@ spec:
x-kubernetes-int-or-string: true
type: object
type: object
resumeGcSchedule:
type: boolean
s3:
properties:
acl:
Expand Down Expand Up @@ -2627,6 +2629,8 @@ spec:
x-kubernetes-int-or-string: true
type: object
type: object
resumeGcSchedule:
type: boolean
s3:
properties:
acl:
Expand Down
6 changes: 6 additions & 0 deletions manifests/crd_v1beta1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1332,6 +1332,8 @@ spec:
x-kubernetes-int-or-string: true
type: object
type: object
resumeGcSchedule:
type: boolean
s3:
properties:
acl:
Expand Down Expand Up @@ -2850,6 +2852,8 @@ spec:
x-kubernetes-int-or-string: true
type: object
type: object
resumeGcSchedule:
type: boolean
s3:
properties:
acl:
Expand Down Expand Up @@ -4170,6 +4174,8 @@ spec:
x-kubernetes-int-or-string: true
type: object
type: object
resumeGcSchedule:
type: boolean
s3:
properties:
acl:
Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/federation/pingcap/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ const (
VolumeBackupInvalid VolumeBackupConditionType = "Invalid"
// VolumeBackupRunning means the VolumeBackup is running
VolumeBackupRunning VolumeBackupConditionType = "Running"
// VolumeBackupSnapshotsCreated means all the volume snapshots have been created, and we have safely resumed GC and PD scheduler
VolumeBackupSnapshotsCreated VolumeBackupConditionType = "SnapshotsCreated"
// VolumeBackupComplete means all the backups in data plane are complete and the VolumeBackup is complete
VolumeBackupComplete VolumeBackupConditionType = "Complete"
// VolumeBackupFailed means one of backup in data plane is failed and the VolumeBackup is failed
Expand Down
6 changes: 6 additions & 0 deletions pkg/apis/federation/pingcap/v1alpha1/volume_backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,12 @@ func IsVolumeBackupRunning(volumeBackup *VolumeBackup) bool {
return condition != nil && condition.Status == corev1.ConditionTrue
}

// IsVolumeBackupSnapshotsCreated reports whether the VolumeBackup carries a
// VolumeBackupSnapshotsCreated condition whose status is True.
func IsVolumeBackupSnapshotsCreated(volumeBackup *VolumeBackup) bool {
	_, cond := GetVolumeBackupCondition(&volumeBackup.Status, VolumeBackupSnapshotsCreated)
	if cond == nil {
		// the condition has never been recorded
		return false
	}
	return cond.Status == corev1.ConditionTrue
}

// IsVolumeBackupComplete returns true if VolumeBackup is complete
func IsVolumeBackupComplete(volumeBackup *VolumeBackup) bool {
_, condition := GetVolumeBackupCondition(&volumeBackup.Status, VolumeBackupComplete)
Expand Down
16 changes: 16 additions & 0 deletions pkg/apis/pingcap/v1alpha1/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,22 @@ func IsVolumeBackupInitializeFailed(backup *Backup) bool {
return condition != nil && condition.Status == corev1.ConditionTrue
}

// IsVolumeBackupSnapshotsCreated returns true if the backup is in volume snapshot
// mode and its VolumeBackupSnapshotsCreated condition is True
func IsVolumeBackupSnapshotsCreated(backup *Backup) bool {
	// the condition is only meaningful for volume snapshot backups
	if backup.Spec.Mode == BackupModeVolumeSnapshot {
		if _, cond := GetBackupCondition(&backup.Status, VolumeBackupSnapshotsCreated); cond != nil {
			return cond.Status == corev1.ConditionTrue
		}
	}
	return false
}

// IsVolumeBackupInitializeComplete returns true if the backup is in volume snapshot
// mode and its VolumeBackupInitializeComplete condition is True
func IsVolumeBackupInitializeComplete(backup *Backup) bool {
	isVolumeSnapshot := backup.Spec.Mode == BackupModeVolumeSnapshot
	if !isVolumeSnapshot {
		return false
	}

	_, cond := GetBackupCondition(&backup.Status, VolumeBackupInitializeComplete)
	if cond == nil {
		// the condition has never been recorded
		return false
	}
	return cond.Status == corev1.ConditionTrue
}

// IsVolumeBackupComplete returns true if volume backup is complete
func IsVolumeBackupComplete(backup *Backup) bool {
if backup.Spec.Mode != BackupModeVolumeSnapshot {
Expand Down
7 changes: 7 additions & 0 deletions pkg/apis/pingcap/v1alpha1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 8 additions & 1 deletion pkg/apis/pingcap/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -1966,6 +1966,9 @@ type BackupSpec struct {
// FederalVolumeBackupPhase indicates which phase to execute in federal volume backup
// +optional
FederalVolumeBackupPhase FederalVolumeBackupPhase `json:"federalVolumeBackupPhase,omitempty"`
// ResumeGcSchedule indicates whether to resume GC and PD scheduler for EBS volume snapshot backup
// +optional
ResumeGcSchedule bool `json:"resumeGcSchedule,omitempty"`
// DumplingConfig is the configs for dumpling
Dumpling *DumplingConfig `json:"dumpling,omitempty"`
// Base tolerations of backup Pods, components may add more tolerations upon this respectively
Expand Down Expand Up @@ -2122,10 +2125,14 @@ const (
BackupStopped BackupConditionType = "Stopped"
// BackupRestart means the backup was restarted, now just support snapshot backup
BackupRestart BackupConditionType = "Restart"
// VolumeBackupInitialized means the volume backup has stopped GC and PD schedule
// VolumeBackupInitialized means the volume backup has stopped GC and PD scheduler
VolumeBackupInitialized BackupConditionType = "VolumeBackupInitialized"
// VolumeBackupInitializeFailed means the volume backup initialize job failed
VolumeBackupInitializeFailed BackupConditionType = "VolumeBackupInitializeFailed"
// VolumeBackupSnapshotsCreated means the local volume snapshots have been created, and they won't be changed
VolumeBackupSnapshotsCreated BackupConditionType = "VolumeBackupSnapshotsCreated"
// VolumeBackupInitializeComplete means the volume backup has safely resumed GC and PD scheduler
VolumeBackupInitializeComplete BackupConditionType = "VolumeBackupInitializeComplete"
// VolumeBackupComplete means the volume backup has taken volume snapshots successfully
VolumeBackupComplete BackupConditionType = "VolumeBackupComplete"
// VolumeBackupFailed means the volume backup take volume snapshots failed
Expand Down
Loading

0 comments on commit 3d41f93

Please sign in to comment.