From bf02fb5515267af20378b3d6db3c63181ddc0db5 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Thu, 13 Jun 2024 10:56:59 +0800 Subject: [PATCH] pkg: remove old duplicated task (#8234) ref tikv/pd#7897 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/core/region.go | 10 +++--- pkg/mcs/scheduling/server/cluster.go | 22 ++++++------ pkg/ratelimit/runner.go | 51 ++++++++++++++++++---------- pkg/ratelimit/runner_test.go | 34 ++++++++++++++++--- server/cluster/cluster.go | 25 +++++++------- 5 files changed, 91 insertions(+), 51 deletions(-) diff --git a/pkg/core/region.go b/pkg/core/region.go index 19c1d0d4794..5d7880823e8 100644 --- a/pkg/core/region.go +++ b/pkg/core/region.go @@ -16,7 +16,6 @@ package core import ( "bytes" - "context" "encoding/hex" "fmt" "math" @@ -751,21 +750,22 @@ func GenerateRegionGuideFunc(enableLog bool) RegionGuideFunc { logRunner := ctx.LogRunner // print log asynchronously debug, info := d, i + regionID := region.GetID() if logRunner != nil { debug = func(msg string, fields ...zap.Field) { logRunner.RunTask( - ctx.Context, + regionID, "DebugLog", - func(_ context.Context) { + func() { d(msg, fields...) }, ) } info = func(msg string, fields ...zap.Field) { logRunner.RunTask( - ctx.Context, + regionID, "InfoLog", - func(_ context.Context) { + func() { i(msg, fields...) }, ) diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index caaafe42c87..5fac3e1604a 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -607,15 +607,15 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c // Save to storage if meta is updated, except for flashback. // Save to cache if meta or leader is updated, or contains any down/pending peer. _, saveCache, _, retained := core.GenerateRegionGuideFunc(true)(ctx, region, origin) - + regionID := region.GetID() if !saveCache { // Due to some config changes need to update the region stats as well, // so we do some extra checks here. if hasRegionStats && c.regionStats.RegionStatsNeedUpdate(region) { ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.ObserveRegionStatsAsync, - func(_ context.Context) { + func() { if c.regionStats.RegionStatsNeedUpdate(region) { cluster.Collect(c, region, hasRegionStats) } @@ -625,9 +625,9 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c // region is not updated to the subtree. 
if origin.GetRef() < 2 { ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.UpdateSubTree, - func(_ context.Context) { + func() { c.CheckAndPutSubTree(region) }, ratelimit.WithRetained(true), @@ -649,18 +649,18 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c return err } ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.UpdateSubTree, - func(_ context.Context) { + func() { c.CheckAndPutSubTree(region) }, ratelimit.WithRetained(retained), ) tracer.OnUpdateSubTreeFinished() ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.HandleOverlaps, - func(_ context.Context) { + func() { cluster.HandleOverlaps(c, overlaps) }, ) @@ -668,9 +668,9 @@ func (c *Cluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *c tracer.OnSaveCacheFinished() // handle region stats ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.CollectRegionStatsAsync, - func(_ context.Context) { + func() { cluster.Collect(c, region, hasRegionStats) }, ) diff --git a/pkg/ratelimit/runner.go b/pkg/ratelimit/runner.go index 17a45067f3d..2d88e36106e 100644 --- a/pkg/ratelimit/runner.go +++ b/pkg/ratelimit/runner.go @@ -42,16 +42,16 @@ const ( // Runner is the interface for running tasks. type Runner interface { - RunTask(ctx context.Context, name string, f func(context.Context), opts ...TaskOption) error + RunTask(id uint64, name string, f func(), opts ...TaskOption) error Start() Stop() } // Task is a task to be run. type Task struct { - ctx context.Context + id uint64 submittedAt time.Time - f func(context.Context) + f func() name string // retained indicates whether the task should be dropped if the task queue exceeds maxPendingDuration. retained bool @@ -60,17 +60,22 @@ type Task struct { // ErrMaxWaitingTasksExceeded is returned when the number of waiting tasks exceeds the maximum. var ErrMaxWaitingTasksExceeded = errors.New("max waiting tasks exceeded") -// ConcurrentRunner is a simple task runner that limits the number of concurrent tasks. 
+type taskID struct { + id uint64 + name string +} + type ConcurrentRunner struct { name string limiter *ConcurrencyLimiter maxPendingDuration time.Duration taskChan chan *Task - pendingTasks []*Task pendingMu sync.Mutex stopChan chan struct{} wg sync.WaitGroup - pendingTaskCount map[string]int64 + pendingTaskCount map[string]int + pendingTasks []*Task + existTasks map[taskID]*Task maxWaitingDuration prometheus.Gauge } @@ -82,7 +87,8 @@ func NewConcurrentRunner(name string, limiter *ConcurrencyLimiter, maxPendingDur maxPendingDuration: maxPendingDuration, taskChan: make(chan *Task), pendingTasks: make([]*Task, 0, initialCapacity), - pendingTaskCount: make(map[string]int64), + pendingTaskCount: make(map[string]int), + existTasks: make(map[taskID]*Task), maxWaitingDuration: RunnerTaskMaxWaitingDuration.WithLabelValues(name), } return s @@ -101,6 +107,7 @@ func (cr *ConcurrentRunner) Start() { cr.stopChan = make(chan struct{}) cr.wg.Add(1) ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() go func() { defer cr.wg.Done() for { @@ -139,7 +146,7 @@ func (cr *ConcurrentRunner) Start() { func (cr *ConcurrentRunner) run(task *Task, token *TaskToken) { start := time.Now() - task.f(task.ctx) + task.f() if token != nil { cr.limiter.ReleaseToken(token) cr.processPendingTasks() @@ -157,6 +164,7 @@ func (cr *ConcurrentRunner) processPendingTasks() { case cr.taskChan <- task: cr.pendingTasks = cr.pendingTasks[1:] cr.pendingTaskCount[task.name]-- + delete(cr.existTasks, taskID{id: task.id, name: task.name}) default: } return @@ -170,11 +178,12 @@ func (cr *ConcurrentRunner) Stop() { } // RunTask runs the task asynchronously. -func (cr *ConcurrentRunner) RunTask(ctx context.Context, name string, f func(context.Context), opts ...TaskOption) error { +func (cr *ConcurrentRunner) RunTask(id uint64, name string, f func(), opts ...TaskOption) error { task := &Task{ - ctx: ctx, - name: name, - f: f, + id: id, + name: name, + f: f, + submittedAt: time.Now(), } for _, opt := range opts { opt(task) @@ -187,7 +196,15 @@ func (cr *ConcurrentRunner) RunTask(ctx context.Context, name string, f func(con }() pendingTaskNum := len(cr.pendingTasks) + tid := taskID{task.id, task.name} if pendingTaskNum > 0 { + // Here we use a map to find the task with the same ID. + // Then replace the old task with the new one. + if t, ok := cr.existTasks[tid]; ok { + t.f = f + t.submittedAt = time.Now() + return nil + } if !task.retained { maxWait := time.Since(cr.pendingTasks[0].submittedAt) if maxWait > cr.maxPendingDuration { @@ -195,15 +212,13 @@ func (cr *ConcurrentRunner) RunTask(ctx context.Context, name string, f func(con return ErrMaxWaitingTasksExceeded } } - // We use the max task number to limit the memory usage. - // It occupies around 1.5GB memory when there is 20000000 pending task. - if len(cr.pendingTasks) > maxPendingTaskNum { + if pendingTaskNum > maxPendingTaskNum { RunnerFailedTasks.WithLabelValues(cr.name, task.name).Inc() return ErrMaxWaitingTasksExceeded } } - task.submittedAt = time.Now() cr.pendingTasks = append(cr.pendingTasks, task) + cr.existTasks[tid] = task cr.pendingTaskCount[task.name]++ return nil } @@ -217,8 +232,8 @@ func NewSyncRunner() *SyncRunner { } // RunTask runs the task synchronously. 
-func (*SyncRunner) RunTask(ctx context.Context, _ string, f func(context.Context), _ ...TaskOption) error { - f(ctx) +func (*SyncRunner) RunTask(_ uint64, _ string, f func(), _ ...TaskOption) error { + f() return nil } diff --git a/pkg/ratelimit/runner_test.go b/pkg/ratelimit/runner_test.go index 0241536686b..0335a78bcbe 100644 --- a/pkg/ratelimit/runner_test.go +++ b/pkg/ratelimit/runner_test.go @@ -15,7 +15,6 @@ package ratelimit import ( - "context" "sync" "testing" "time" @@ -34,9 +33,9 @@ func TestConcurrentRunner(t *testing.T) { time.Sleep(50 * time.Millisecond) wg.Add(1) err := runner.RunTask( - context.Background(), + uint64(i), "test1", - func(context.Context) { + func() { defer wg.Done() time.Sleep(100 * time.Millisecond) }, @@ -54,9 +53,9 @@ func TestConcurrentRunner(t *testing.T) { for i := 0; i < 10; i++ { wg.Add(1) err := runner.RunTask( - context.Background(), + uint64(i), "test2", - func(context.Context) { + func() { defer wg.Done() time.Sleep(100 * time.Millisecond) }, @@ -74,4 +73,29 @@ func TestConcurrentRunner(t *testing.T) { } wg.Wait() }) + + t.Run("DuplicatedTask", func(t *testing.T) { + runner := NewConcurrentRunner("test", NewConcurrencyLimiter(1), time.Minute) + runner.Start() + defer runner.Stop() + for i := 1; i < 11; i++ { + regionID := uint64(i) + if i == 10 { + regionID = 4 + } + err := runner.RunTask( + regionID, + "test3", + func() { + time.Sleep(time.Second) + }, + ) + require.NoError(t, err) + time.Sleep(1 * time.Millisecond) + } + + updatedSubmitted := runner.pendingTasks[1].submittedAt + lastSubmitted := runner.pendingTasks[len(runner.pendingTasks)-1].submittedAt + require.Greater(t, updatedSubmitted, lastSubmitted) + }) } diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 70d6b46b980..be0ba39b899 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -1038,6 +1038,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio // Save to cache if meta or leader is updated, or contains any down/pending peer. saveKV, saveCache, needSync, retained := regionGuide(ctx, region, origin) tracer.OnRegionGuideFinished() + regionID := region.GetID() if !saveKV && !saveCache { // Due to some config changes need to update the region stats as well, // so we do some extra checks here. @@ -1046,9 +1047,9 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio // We need to think of a better way to reduce this part of the cost in the future. if hasRegionStats && c.regionStats.RegionStatsNeedUpdate(region) { ctx.MiscRunner.RunTask( - ctx.Context, + regionID, ratelimit.ObserveRegionStatsAsync, - func(_ context.Context) { + func() { if c.regionStats.RegionStatsNeedUpdate(region) { cluster.Collect(c, region, hasRegionStats) } @@ -1058,9 +1059,9 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio // region is not updated to the subtree. 
if origin.GetRef() < 2 { ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.UpdateSubTree, - func(_ context.Context) { + func() { c.CheckAndPutSubTree(region) }, ratelimit.WithRetained(true), @@ -1086,9 +1087,9 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio return err } ctx.TaskRunner.RunTask( - ctx, + regionID, ratelimit.UpdateSubTree, - func(_ context.Context) { + func() { c.CheckAndPutSubTree(region) }, ratelimit.WithRetained(retained), @@ -1097,9 +1098,9 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio if !c.IsServiceIndependent(mcsutils.SchedulingServiceName) { ctx.MiscRunner.RunTask( - ctx.Context, + regionID, ratelimit.HandleOverlaps, - func(_ context.Context) { + func() { cluster.HandleOverlaps(c, overlaps) }, ) @@ -1110,9 +1111,9 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio tracer.OnSaveCacheFinished() // handle region stats ctx.MiscRunner.RunTask( - ctx.Context, + regionID, ratelimit.CollectRegionStatsAsync, - func(_ context.Context) { + func() { // TODO: Due to the accuracy requirements of the API "/regions/check/xxx", // region stats needs to be collected in API mode. // We need to think of a better way to reduce this part of the cost in the future. @@ -1124,9 +1125,9 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio if c.storage != nil { if saveKV { ctx.MiscRunner.RunTask( - ctx.Context, + regionID, ratelimit.SaveRegionToKV, - func(_ context.Context) { + func() { // If there are concurrent heartbeats from the same region, the last write will win even if // writes to storage in the critical area. So don't use mutex to protect it. // Not successfully saved to storage is not fatal, it only leads to longer warm-up
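
For context, here is a minimal sketch (not part of the patch) of how the new id-keyed RunTask API behaves, mirroring the timing trick used in TestConcurrentRunner/DuplicatedTask above. The import path and the names ratelimit.NewConcurrentRunner, ratelimit.NewConcurrencyLimiter, and RunTask are taken from the diff; the main package, task name, and sleeps below are illustrative assumptions. With a single concurrency token, the first task holds the token, the second waits for one inside the worker loop, and later submissions sit in the pending queue, where a resubmission with the same (id, name) replaces the queued closure instead of adding a duplicate.

package main

import (
	"fmt"
	"time"

	"github.com/tikv/pd/pkg/ratelimit"
)

func main() {
	// A single token: the first task holds it, the second waits for a token
	// inside the runner's worker loop, and anything submitted after that
	// stays in the pending queue, where deduplication by (id, name) applies.
	runner := ratelimit.NewConcurrentRunner("example", ratelimit.NewConcurrencyLimiter(1), time.Minute)
	runner.Start()
	defer runner.Stop()

	// Fill the runner: regions 1 and 2 occupy the token and the worker loop,
	// region 3 lands in the pending queue.
	for i := 1; i <= 3; i++ {
		id := uint64(i)
		_ = runner.RunTask(id, "UpdateSubTree", func() {
			time.Sleep(500 * time.Millisecond)
			fmt.Println("ran first submission for region", id)
		})
		time.Sleep(time.Millisecond) // give the runner a chance to dispatch, as the test does
	}

	// A second heartbeat for region 3 with the same task name: the pending
	// task's closure and submittedAt are overwritten and no duplicate is
	// queued, so the first closure for region 3 should never run.
	_ = runner.RunTask(3, "UpdateSubTree", func() {
		fmt.Println("ran replacement for region 3")
	})

	time.Sleep(2 * time.Second)
}

Under these assumptions only the replacement closure for region 3 runs, which is the point of keying tasks by region ID in processRegionHeartbeat: repeated heartbeats for the same region collapse into one pending task per task name while the runner is backed up, instead of piling up stale duplicates.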