From 367fcd3a4190dfa2dfc428ca68ff41a803aef6c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Leszczy=C5=84ski?= <2000michal@wp.pl>
Date: Wed, 20 Mar 2024 16:45:46 +0100
Subject: [PATCH] feat(backup): disable tablet migration on snapshot

Fixes #3759
---
 pkg/scyllaclient/client_scylla.go     |  9 +++++++++
 pkg/service/backup/worker_snapshot.go | 19 +++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/pkg/scyllaclient/client_scylla.go b/pkg/scyllaclient/client_scylla.go
index 2eaadaacf7..da2623121b 100644
--- a/pkg/scyllaclient/client_scylla.go
+++ b/pkg/scyllaclient/client_scylla.go
@@ -1049,6 +1049,15 @@ func (c *Client) ViewBuildStatus(ctx context.Context, keyspace, view string) (Vi
 	return minStatus, nil
 }
 
+// ControlTabletLoadBalancing disables or enables tablet load balancing in the cluster.
+func (c *Client) ControlTabletLoadBalancing(ctx context.Context, enabled bool) error {
+	_, err := c.scyllaOps.StorageServiceTabletsBalancingPost(&operations.StorageServiceTabletsBalancingPostParams{
+		Context: ctx,
+		Enabled: enabled,
+	})
+	return err
+}
+
 // ToCanonicalIP replaces ":0:0" in IPv6 addresses with "::"
 // ToCanonicalIP("192.168.0.1") -> "192.168.0.1"
 // ToCanonicalIP("100:200:0:0:0:0:0:1") -> "100:200::1".
diff --git a/pkg/service/backup/worker_snapshot.go b/pkg/service/backup/worker_snapshot.go
index 875dc41aa3..87b164baf9 100644
--- a/pkg/service/backup/worker_snapshot.go
+++ b/pkg/service/backup/worker_snapshot.go
@@ -4,12 +4,31 @@ package backup
 
 import (
 	"context"
+	stdErrors "errors"
 
 	"github.com/pkg/errors"
+	"github.com/scylladb/scylla-manager/v3/pkg/scyllaclient"
 	. "github.com/scylladb/scylla-manager/v3/pkg/service/backup/backupspec"
 )
 
 func (w *worker) Snapshot(ctx context.Context, hosts []hostInfo, limits []DCLimit) (err error) {
+	snapshotTabletKs := false
+	ringDescriber := scyllaclient.NewRingDescriber(ctx, w.Client)
+	for _, u := range w.Units {
+		snapshotTabletKs = snapshotTabletKs || ringDescriber.IsTabletKeyspace(u.Keyspace)
+	}
+	// Disable tablet migration for the snapshot stage.
+	// Otherwise a tablet could "escape" every snapshot by migrating
+	// from a host that has not been snapshotted yet to one that already has.
+	if snapshotTabletKs {
+		defer func() {
+			err = stdErrors.Join(err, w.Client.ControlTabletLoadBalancing(context.Background(), true))
+		}()
+		if err := w.Client.ControlTabletLoadBalancing(ctx, false); err != nil {
+			return errors.Wrapf(err, "disable tablet load balancing")
+		}
+	}
+
 	f := func(h hostInfo) error {
 		w.Logger.Info(ctx, "Taking snapshots on host", "host", h.IP)
 		err := w.snapshotHost(ctx, h)