From 463297be6a621183386105525e0aa33a5fc09ebd Mon Sep 17 00:00:00 2001 From: okJiang <819421878@qq.com> Date: Mon, 24 Jun 2024 16:52:21 +0800 Subject: [PATCH] scheduler: skip evict-leader-scheduler when setting schedule deny label (#8303) ref tikv/pd#7300, close tikv/pd#7853 - add a real cluster test to test `skip evict-leader-scheduler when setting schedule deny label` - add `DeleteStoreLabel` API and `DeleteScheduler` API Signed-off-by: okJiang <819421878@qq.com> --- .gitignore | 2 + .../schedulers/scheduler_controller.go | 38 ++++++++++--------- server/cluster/cluster.go | 3 ++ 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index 748d24872b6..fb9f0424418 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,5 @@ coverage.xml coverage *.txt go.work* +embedded_assets_handler.go +*.log diff --git a/pkg/schedule/schedulers/scheduler_controller.go b/pkg/schedule/schedulers/scheduler_controller.go index 25a2c8b2afe..d6f5826771c 100644 --- a/pkg/schedule/schedulers/scheduler_controller.go +++ b/pkg/schedule/schedulers/scheduler_controller.go @@ -452,6 +452,8 @@ func (s *ScheduleController) Stop() { // Schedule tries to create some operators. func (s *ScheduleController) Schedule(diagnosable bool) []*operator.Operator { + _, isEvictLeaderScheduler := s.Scheduler.(*evictLeaderScheduler) +retry: for i := 0; i < maxScheduleRetries; i++ { // no need to retry if schedule should stop to speed exit select { @@ -466,29 +468,29 @@ func (s *ScheduleController) Schedule(diagnosable bool) []*operator.Operator { if diagnosable { s.diagnosticRecorder.SetResultFromPlans(ops, plans) } - foundDisabled := false + if len(ops) == 0 { + continue + } + // If we have schedule, reset interval to the minimal interval. + s.nextInterval = s.Scheduler.GetMinInterval() for _, op := range ops { - if labelMgr := s.cluster.GetRegionLabeler(); labelMgr != nil { - region := s.cluster.GetRegion(op.RegionID()) - if region == nil { - continue - } - if labelMgr.ScheduleDisabled(region) { - denySchedulersByLabelerCounter.Inc() - foundDisabled = true - break - } + region := s.cluster.GetRegion(op.RegionID()) + if region == nil { + continue retry } - } - if len(ops) > 0 { - // If we have schedule, reset interval to the minimal interval. - s.nextInterval = s.Scheduler.GetMinInterval() - // try regenerating operators - if foundDisabled { + labelMgr := s.cluster.GetRegionLabeler() + if labelMgr == nil { continue } - return ops + + // If the evict-leader-scheduler is disabled, it will obstruct the restart operation of tikv by the operator. + // Refer: https://docs.pingcap.com/tidb-in-kubernetes/stable/restart-a-tidb-cluster#perform-a-graceful-restart-to-a-single-tikv-pod + if labelMgr.ScheduleDisabled(region) && !isEvictLeaderScheduler { + denySchedulersByLabelerCounter.Inc() + continue retry + } } + return ops } s.nextInterval = s.Scheduler.GetNextInterval(s.nextInterval) return nil diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 043c0996acc..7cf696e11c0 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -1364,6 +1364,9 @@ func (c *RaftCluster) DeleteStoreLabel(storeID uint64, labelKey string) error { if store == nil { return errs.ErrInvalidStoreID.FastGenByArgs(storeID) } + if len(store.GetLabels()) == 0 { + return errors.Errorf("the label key %s does not exist", labelKey) + } newStore := typeutil.DeepClone(store.GetMeta(), core.StoreFactory) labels := make([]*metapb.StoreLabel, 0, len(newStore.GetLabels())-1) for _, label := range newStore.GetLabels() {