diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index 1e502a542199..d106578b5fe5 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -568,3 +568,10 @@ func (c *Cluster) processRegionHeartbeat(region *core.RegionInfo) error { func (c *Cluster) IsPrepared() bool { return c.coordinator.GetPrepareChecker().IsPrepared() } + +// IsSchedulingHalted returns whether the scheduling is halted. +// Currently, the microservice scheduling is halted when: +// - The `HaltScheduling` persist option is set to true. +func (c *Cluster) IsSchedulingHalted() bool { + return c.persistConfig.IsSchedulingHalted() +} diff --git a/pkg/mcs/scheduling/server/config/config.go b/pkg/mcs/scheduling/server/config/config.go index d462a9a58b53..72e46cb341fd 100644 --- a/pkg/mcs/scheduling/server/config/config.go +++ b/pkg/mcs/scheduling/server/config/config.go @@ -603,6 +603,10 @@ func (o *PersistConfig) SetSplitMergeInterval(splitMergeInterval time.Duration) o.SetScheduleConfig(v) } +// SetSchedulingAllowanceStatus sets the scheduling allowance status to help distinguish the source of the halt. +// TODO: support this metrics for the scheduling service in the future. +func (*PersistConfig) SetSchedulingAllowanceStatus(bool, string) {} + // SetHaltScheduling set HaltScheduling. func (o *PersistConfig) SetHaltScheduling(halt bool, source string) { v := o.GetScheduleConfig().Clone() diff --git a/pkg/schedule/config/config_provider.go b/pkg/schedule/config/config_provider.go index 20c7f0dc2cf1..90e489f86f3e 100644 --- a/pkg/schedule/config/config_provider.go +++ b/pkg/schedule/config/config_provider.go @@ -46,7 +46,7 @@ func IsSchedulerRegistered(name string) bool { type SchedulerConfigProvider interface { SharedConfigProvider - IsSchedulingHalted() bool + SetSchedulingAllowanceStatus(bool, string) GetStoresLimit() map[uint64]StoreLimitConfig IsSchedulerDisabled(string) bool diff --git a/pkg/schedule/coordinator.go b/pkg/schedule/coordinator.go index fcdd8c9a32c8..7d4b839385ab 100644 --- a/pkg/schedule/coordinator.go +++ b/pkg/schedule/coordinator.go @@ -178,7 +178,7 @@ func (c *Coordinator) PatrolRegions() { log.Info("patrol regions has been stopped") return } - if c.isSchedulingHalted() { + if c.cluster.IsSchedulingHalted() { continue } @@ -207,10 +207,6 @@ func (c *Coordinator) PatrolRegions() { } } -func (c *Coordinator) isSchedulingHalted() bool { - return c.cluster.GetSchedulerConfig().IsSchedulingHalted() -} - func (c *Coordinator) checkRegions(startKey []byte) (key []byte, regions []*core.RegionInfo) { regions = c.cluster.ScanRegions(startKey, nil, patrolScanRegionLimit) if len(regions) == 0 { diff --git a/pkg/schedule/core/cluster_informer.go b/pkg/schedule/core/cluster_informer.go index 63dacd0c30dd..b97459d26ea6 100644 --- a/pkg/schedule/core/cluster_informer.go +++ b/pkg/schedule/core/cluster_informer.go @@ -43,6 +43,7 @@ type SchedulerCluster interface { GetSchedulerConfig() sc.SchedulerConfigProvider GetRegionLabeler() *labeler.RegionLabeler GetStoreConfig() sc.StoreConfigProvider + IsSchedulingHalted() bool } // CheckerCluster is an aggregate interface that wraps multiple interfaces diff --git a/pkg/schedule/schedulers/scheduler_controller.go b/pkg/schedule/schedulers/scheduler_controller.go index 25a2c8b2afe4..28bb5c96c030 100644 --- a/pkg/schedule/schedulers/scheduler_controller.go +++ b/pkg/schedule/schedulers/scheduler_controller.go @@ -114,7 +114,7 @@ func (c *Controller) CollectSchedulerMetrics() { var allowScheduler float64 // If the scheduler is not allowed to schedule, it will disappear in Grafana panel. // See issue #1341. - if !s.IsPaused() && !c.isSchedulingHalted() { + if !s.IsPaused() && !c.cluster.IsSchedulingHalted() { allowScheduler = 1 } schedulerStatusGauge.WithLabelValues(s.Scheduler.GetName(), "allow").Set(allowScheduler) @@ -130,10 +130,6 @@ func (c *Controller) CollectSchedulerMetrics() { ruleStatusGauge.WithLabelValues("group_count").Set(float64(groupCnt)) } -func (c *Controller) isSchedulingHalted() bool { - return c.cluster.GetSchedulerConfig().IsSchedulingHalted() -} - // ResetSchedulerMetrics resets metrics of all schedulers. func ResetSchedulerMetrics() { schedulerStatusGauge.Reset() @@ -518,7 +514,7 @@ func (s *ScheduleController) AllowSchedule(diagnosable bool) bool { } return false } - if s.isSchedulingHalted() { + if s.cluster.IsSchedulingHalted() { if diagnosable { s.diagnosticRecorder.SetResultFromStatus(Halted) } @@ -533,10 +529,6 @@ func (s *ScheduleController) AllowSchedule(diagnosable bool) bool { return true } -func (s *ScheduleController) isSchedulingHalted() bool { - return s.cluster.GetSchedulerConfig().IsSchedulingHalted() -} - // IsPaused returns if a scheduler is paused. func (s *ScheduleController) IsPaused() bool { delayUntil := atomic.LoadInt64(&s.delayUntil) diff --git a/pkg/unsaferecovery/unsafe_recovery_controller.go b/pkg/unsaferecovery/unsafe_recovery_controller.go index aa45ba6a2bdc..3ce299ca6e01 100644 --- a/pkg/unsaferecovery/unsafe_recovery_controller.go +++ b/pkg/unsaferecovery/unsafe_recovery_controller.go @@ -492,12 +492,11 @@ func (u *Controller) GetStage() stage { } func (u *Controller) changeStage(stage stage) { - u.stage = stage - // Halt and resume the scheduling once the running state changed. - running := isRunning(stage) - if opt := u.cluster.GetSchedulerConfig(); opt.IsSchedulingHalted() != running { - opt.SetHaltScheduling(running, "online-unsafe-recovery") + // If the running stage changes, update the scheduling allowance status to add or remove "online-unsafe-recovery" halt. + if running := isRunning(stage); running != isRunning(u.stage) { + u.cluster.GetSchedulerConfig().SetSchedulingAllowanceStatus(running, "online-unsafe-recovery") } + u.stage = stage var output StageOutput output.Time = time.Now().Format("2006-01-02 15:04:05.000") diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index d6454f48ee36..b15b03f125ee 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -920,6 +920,14 @@ func (c *RaftCluster) RemoveSuspectRegion(id uint64) { c.coordinator.GetCheckerController().RemoveSuspectRegion(id) } +// IsSchedulingHalted returns whether the scheduling is halted. +// Currently, the PD scheduling is halted when: +// - The `HaltScheduling` persist option is set to true. +// - Online unsafe recovery is running. +func (c *RaftCluster) IsSchedulingHalted() bool { + return c.opt.IsSchedulingHalted() || c.unsafeRecoveryController.IsRunning() +} + // GetUnsafeRecoveryController returns the unsafe recovery controller. func (c *RaftCluster) GetUnsafeRecoveryController() *unsaferecovery.Controller { return c.unsafeRecoveryController diff --git a/server/cluster/cluster_worker.go b/server/cluster/cluster_worker.go index c1da97363b53..781b3079703b 100644 --- a/server/cluster/cluster_worker.go +++ b/server/cluster/cluster_worker.go @@ -43,7 +43,7 @@ func (c *RaftCluster) HandleRegionHeartbeat(region *core.RegionInfo) error { // HandleAskSplit handles the split request. func (c *RaftCluster) HandleAskSplit(request *pdpb.AskSplitRequest) (*pdpb.AskSplitResponse, error) { - if c.isSchedulingHalted() { + if c.IsSchedulingHalted() { return nil, errs.ErrSchedulingIsHalted.FastGenByArgs() } if !c.opt.IsTikvRegionSplitEnabled() { @@ -86,10 +86,6 @@ func (c *RaftCluster) HandleAskSplit(request *pdpb.AskSplitRequest) (*pdpb.AskSp return split, nil } -func (c *RaftCluster) isSchedulingHalted() bool { - return c.opt.IsSchedulingHalted() -} - // ValidRequestRegion is used to decide if the region is valid. func (c *RaftCluster) ValidRequestRegion(reqRegion *metapb.Region) error { startKey := reqRegion.GetStartKey() @@ -109,7 +105,7 @@ func (c *RaftCluster) ValidRequestRegion(reqRegion *metapb.Region) error { // HandleAskBatchSplit handles the batch split request. func (c *RaftCluster) HandleAskBatchSplit(request *pdpb.AskBatchSplitRequest) (*pdpb.AskBatchSplitResponse, error) { - if c.isSchedulingHalted() { + if c.IsSchedulingHalted() { return nil, errs.ErrSchedulingIsHalted.FastGenByArgs() } if !c.opt.IsTikvRegionSplitEnabled() { diff --git a/server/config/persist_options.go b/server/config/persist_options.go index c0a0ebf5c47a..032e3736a98d 100644 --- a/server/config/persist_options.go +++ b/server/config/persist_options.go @@ -973,11 +973,8 @@ func (o *PersistOptions) SetAllStoresLimitTTL(ctx context.Context, client *clien var haltSchedulingStatus = schedulingAllowanceStatusGauge.WithLabelValues("halt-scheduling") -// SetHaltScheduling set HaltScheduling. -func (o *PersistOptions) SetHaltScheduling(halt bool, source string) { - v := o.GetScheduleConfig().Clone() - v.HaltScheduling = halt - o.SetScheduleConfig(v) +// SetSchedulingAllowanceStatus sets the scheduling allowance status to help distinguish the source of the halt. +func (*PersistOptions) SetSchedulingAllowanceStatus(halt bool, source string) { if halt { haltSchedulingStatus.Set(1) schedulingAllowanceStatusGauge.WithLabelValues(source).Set(1) @@ -987,6 +984,14 @@ func (o *PersistOptions) SetHaltScheduling(halt bool, source string) { } } +// SetHaltScheduling set HaltScheduling. +func (o *PersistOptions) SetHaltScheduling(halt bool, source string) { + v := o.GetScheduleConfig().Clone() + v.HaltScheduling = halt + o.SetScheduleConfig(v) + o.SetSchedulingAllowanceStatus(halt, source) +} + // IsSchedulingHalted returns if PD scheduling is halted. func (o *PersistOptions) IsSchedulingHalted() bool { if o == nil { diff --git a/server/server.go b/server/server.go index 1d2c4516abef..5fc9a64ed289 100644 --- a/server/server.go +++ b/server/server.go @@ -1016,6 +1016,8 @@ func (s *Server) SetScheduleConfig(cfg sc.ScheduleConfig) error { errs.ZapError(err)) return err } + // Update the scheduling halt status at the same time. + s.persistOptions.SetSchedulingAllowanceStatus(cfg.HaltScheduling, "manually") log.Info("schedule config is updated", zap.Reflect("new", cfg), zap.Reflect("old", old)) return nil }