From 963c08f16cea40544ebe23aa11ab827abe35c86c Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 11 Sep 2023 10:12:09 +0800 Subject: [PATCH] owner(ticdc): fix changefeed backoff (#9687) (#9698) close pingcap/tiflow#9697 --- cdc/owner/feed_state_manager.go | 50 ++----- cdc/owner/feed_state_manager_test.go | 210 ++++++++++++++++++++++++++- 2 files changed, 214 insertions(+), 46 deletions(-) diff --git a/cdc/owner/feed_state_manager.go b/cdc/owner/feed_state_manager.go index 514f2514de7..30c62e86aff 100644 --- a/cdc/owner/feed_state_manager.go +++ b/cdc/owner/feed_state_manager.go @@ -39,11 +39,6 @@ const ( defaultBackoffMaxElapsedTime = 30 * time.Minute defaultBackoffRandomizationFactor = 0.1 defaultBackoffMultiplier = 2.0 - - // If all states recorded in window are 'normal', it can be assumed that the changefeed - // is running steady. And then if we enter a state other than normal at next tick, - // the backoff must be reset. - defaultStateWindowSize = 512 ) // feedStateManager manages the ReactorState of a changefeed @@ -59,7 +54,7 @@ type feedStateManager struct { shouldBeRemoved bool adminJobQueue []*model.AdminJob - stateHistory [defaultStateWindowSize]model.FeedState + isRetrying bool lastErrorRetryTime time.Time // time of last error for a changefeed lastErrorRetryCheckpointTs model.Ts // checkpoint ts of last retry lastWarningReportCheckpointTs model.Ts // checkpoint ts of last warning report @@ -93,26 +88,6 @@ func newFeedStateManager(up *upstream.Upstream) *feedStateManager { return f } -// isChangefeedStable check if there are states other than 'normal' in this sliding window. -func (m *feedStateManager) isChangefeedStable() bool { - for _, val := range m.stateHistory { - if val != model.StateNormal { - return false - } - } - - return true -} - -// shiftStateWindow shift the sliding window -func (m *feedStateManager) shiftStateWindow(state model.FeedState) { - for i := 0; i < defaultStateWindowSize-1; i++ { - m.stateHistory[i] = m.stateHistory[i+1] - } - - m.stateHistory[defaultStateWindowSize-1] = state -} - func (m *feedStateManager) Tick( state *orchestrator.ChangefeedReactorState, resolvedTs model.Ts, @@ -128,7 +103,6 @@ func (m *feedStateManager) Tick( } } - m.shiftStateWindow(state.Info.State) m.checkAndInitLastRetryCheckpointTs(state.Status) m.state = state @@ -167,13 +141,13 @@ func (m *feedStateManager) Tick( // NextBackOff() will return -1 once the MaxElapsedTime has elapsed, // set the changefeed to failed state. if m.backoffInterval == m.errBackoff.Stop { - log.Warn("The changefeed won't be restarted "+ - "as it has been experiencing failures for "+ + log.Error("The changefeed won't be restarted as it has been experiencing failures for "+ "an extended duration", - zap.Duration( - "maxElapsedTime", - m.errBackoff.MaxElapsedTime, - ), + zap.Duration("maxElapsedTime", m.errBackoff.MaxElapsedTime), + zap.String("namespace", m.state.ID.Namespace), + zap.String("changefeed", m.state.ID.ID), + zap.Time("lastRetryTime", m.lastErrorRetryTime), + zap.Uint64("lastRetryCheckpointTs", m.lastErrorRetryCheckpointTs), ) m.shouldBeRunning = false m.patchState(model.StateFailed) @@ -563,13 +537,10 @@ func (m *feedStateManager) handleError(errs ...*model.RunningError) { }) } - // If we enter into an abnormal state 'error' for this changefeed now - // but haven't seen abnormal states in a sliding window (512 ticks), - // it can be assumed that this changefeed meets a sudden change from a stable condition. 
- // So we can reset the exponential backoff and re-backoff from the InitialInterval. - // TODO: this detection policy should be added into unit test. - if m.isChangefeedStable() { + // The errBackoff needs to be reset before the first retry. + if !m.isRetrying { m.resetErrRetry() + m.isRetrying = true } } @@ -646,6 +617,7 @@ func (m *feedStateManager) checkAndChangeState() { zap.Uint64("checkpointTs", m.state.Status.CheckpointTs), zap.Uint64("lastRetryCheckpointTs", m.lastErrorRetryCheckpointTs)) m.patchState(model.StateNormal) + m.isRetrying = false } } diff --git a/cdc/owner/feed_state_manager_test.go b/cdc/owner/feed_state_manager_test.go index e91a8d8bc02..ed738e04b55 100644 --- a/cdc/owner/feed_state_manager_test.go +++ b/cdc/owner/feed_state_manager_test.go @@ -26,6 +26,7 @@ import ( cerror "github.com/pingcap/tiflow/pkg/errors" "github.com/pingcap/tiflow/pkg/etcd" "github.com/pingcap/tiflow/pkg/orchestrator" + "github.com/pingcap/tiflow/pkg/pdutil" "github.com/pingcap/tiflow/pkg/upstream" "github.com/stretchr/testify/require" pd "github.com/tikv/pd/client" @@ -46,19 +47,18 @@ func (p *mockPD) GetTS(_ context.Context) (int64, int64, error) { // newFeedStateManager4Test creates feedStateManager for test func newFeedStateManager4Test( - initialIntervalInMs time.Duration, - maxIntervalInMs time.Duration, - maxElapsedTimeInMs time.Duration, + initialIntervalInMs, maxIntervalInMs, maxElapsedTimeInMs int, multiplier float64, ) *feedStateManager { f := new(feedStateManager) f.upstream = new(upstream.Upstream) f.upstream.PDClient = &mockPD{} + f.upstream.PDClock = pdutil.NewClock4Test() f.errBackoff = backoff.NewExponentialBackOff() - f.errBackoff.InitialInterval = initialIntervalInMs * time.Millisecond - f.errBackoff.MaxInterval = maxIntervalInMs * time.Millisecond - f.errBackoff.MaxElapsedTime = maxElapsedTimeInMs * time.Millisecond + f.errBackoff.InitialInterval = time.Duration(initialIntervalInMs) * time.Millisecond + f.errBackoff.MaxInterval = time.Duration(maxIntervalInMs) * time.Millisecond + f.errBackoff.MaxElapsedTime = time.Duration(maxElapsedTimeInMs) * time.Millisecond f.errBackoff.Multiplier = multiplier f.errBackoff.RandomizationFactor = 0 @@ -690,7 +690,7 @@ func TestBackoffNeverStops(t *testing.T) { func TestUpdateChangefeedEpoch(t *testing.T) { ctx := cdcContext.NewBackendContext4Test(true) // Set a long backoff time - manager := newFeedStateManager4Test(time.Hour, time.Hour, 0, 1.0) + manager := newFeedStateManager4Test(int(time.Hour), int(time.Hour), 0, 1.0) state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID, ctx.ChangefeedVars().ID) tester := orchestrator.NewReactorStateTester(t, state, nil) @@ -739,3 +739,199 @@ func TestUpdateChangefeedEpoch(t *testing.T) { } } } + +func TestHandleWarning(t *testing.T) { + ctx := cdcContext.NewBackendContext4Test(true) + manager := newFeedStateManager4Test(200, 1600, 0, 2.0) + state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID, + ctx.ChangefeedVars().ID) + tester := orchestrator.NewReactorStateTester(t, state, nil) + state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) { + require.Nil(t, info) + return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil + }) + state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) { + require.Nil(t, status) + return &model.ChangeFeedStatus{ + CheckpointTs: 200, + }, true, nil + }) + + tester.MustApplyPatches() + manager.Tick(state, 0) + 
tester.MustApplyPatches()
+	require.Equal(t, model.StateNormal, state.Info.State)
+	require.True(t, manager.ShouldRunning())
+
+	// 1. test when a warning occurs, the changefeed state will be changed to warning
+	// and it will still keep running
+	state.PatchTaskPosition(ctx.GlobalVars().CaptureInfo.ID,
+		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
+			return &model.TaskPosition{Warning: &model.RunningError{
+				Addr:    ctx.GlobalVars().CaptureInfo.AdvertiseAddr,
+				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
+				Message: "fake error for test",
+			}}, true, nil
+		})
+	tester.MustApplyPatches()
+	manager.Tick(state, 0)
+	// some patches will be generated when the manager.Tick is called
+	// so we need to apply the patches before we check the state
+	tester.MustApplyPatches()
+	require.Equal(t, model.StateWarning, state.Info.State)
+	require.True(t, manager.ShouldRunning())
+
+	// 2. test when the changefeed is in warning state, and the checkpointTs is not progressing,
+	// the changefeed state will remain warning
+	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
+		require.NotNil(t, status)
+		return &model.ChangeFeedStatus{
+			CheckpointTs: 200,
+		}, true, nil
+	})
+	tester.MustApplyPatches()
+	manager.Tick(state, 0)
+	tester.MustApplyPatches()
+	require.Equal(t, model.StateWarning, state.Info.State)
+	require.True(t, manager.ShouldRunning())
+
+	// 3. test when the changefeed is in warning state, and the checkpointTs is progressing,
+	// the changefeed state will be changed back to normal
+	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
+		require.NotNil(t, status)
+		return &model.ChangeFeedStatus{
+			CheckpointTs: 201,
+		}, true, nil
+	})
+	tester.MustApplyPatches()
+	manager.Tick(state, 0)
+	tester.MustApplyPatches()
+	require.Equal(t, model.StateNormal, state.Info.State)
+	require.True(t, manager.ShouldRunning())
+
+	// 4. test when the changefeed is in warning state and the checkpointTs is not progressing
+	// for defaultBackoffMaxElapsedTime, the changefeed state will be changed to failed
+	// and it will stop running
+	state.PatchTaskPosition(ctx.GlobalVars().CaptureInfo.ID,
+		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
+			return &model.TaskPosition{Warning: &model.RunningError{
+				Addr:    ctx.GlobalVars().CaptureInfo.AdvertiseAddr,
+				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
+				Message: "fake error for test",
+			}}, true, nil
+		})
+	tester.MustApplyPatches()
+	manager.Tick(state, 0)
+	// some patches will be generated when the manager.Tick is called
+	// so we need to apply the patches before we check the state
+	tester.MustApplyPatches()
+	require.Equal(t, model.StateWarning, state.Info.State)
+	require.True(t, manager.ShouldRunning())
+
+	state.PatchTaskPosition(ctx.GlobalVars().CaptureInfo.ID,
+		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
+			return &model.TaskPosition{Warning: &model.RunningError{
+				Addr:    ctx.GlobalVars().CaptureInfo.AdvertiseAddr,
+				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
+				Message: "fake error for test",
+			}}, true, nil
+		})
+	tester.MustApplyPatches()
+	// mock that the checkpointTs has not progressed for defaultBackoffMaxElapsedTime
+	manager.checkpointTsAdvanced = manager.
+		checkpointTsAdvanced.Add(-(defaultBackoffMaxElapsedTime + 1))
+	// resolvedTs = 202 > checkpointTs = 201
+	manager.Tick(state, 202)
+	// some patches will be generated when the manager.Tick is called
+	// so we need to apply the patches before we check the state
+	tester.MustApplyPatches()
+	require.Equal(t, model.StateFailed, state.Info.State)
+	require.False(t, manager.ShouldRunning())
+}
+
+func TestErrorAfterWarning(t *testing.T) {
+	t.Parallel()
+
+	maxElapsedTimeInMs := 2000
+	ctx := cdcContext.NewBackendContext4Test(true)
+	manager := newFeedStateManager4Test(200, 1600, maxElapsedTimeInMs, 2.0)
+	state := orchestrator.NewChangefeedReactorState(etcd.DefaultCDCClusterID,
+		ctx.ChangefeedVars().ID)
+	tester := orchestrator.NewReactorStateTester(t, state, nil)
+	state.PatchInfo(func(info *model.ChangeFeedInfo) (*model.ChangeFeedInfo, bool, error) {
+		require.Nil(t, info)
+		return &model.ChangeFeedInfo{SinkURI: "123", Config: &config.ReplicaConfig{}}, true, nil
+	})
+	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
+		require.Nil(t, status)
+		return &model.ChangeFeedStatus{
+			CheckpointTs: 200,
+		}, true, nil
+	})
+
+	tester.MustApplyPatches()
+	manager.Tick(state, 0)
+	tester.MustApplyPatches()
+	require.Equal(t, model.StateNormal, state.Info.State)
+	require.True(t, manager.ShouldRunning())
+
+	// 1. test when a warning occurs, the changefeed state will be changed to warning
+	// and it will still keep running
+	state.PatchTaskPosition(ctx.GlobalVars().CaptureInfo.ID,
+		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
+			return &model.TaskPosition{Warning: &model.RunningError{
+				Addr:    ctx.GlobalVars().CaptureInfo.AdvertiseAddr,
+				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
+				Message: "fake error for test",
+			}}, true, nil
+		})
+	tester.MustApplyPatches()
+	manager.Tick(state, 0)
+	// some patches will be generated when the manager.Tick is called
+	// so we need to apply the patches before we check the state
+	tester.MustApplyPatches()
+	require.Equal(t, model.StateWarning, state.Info.State)
+	require.True(t, manager.ShouldRunning())
+
+	// 2. test when the changefeed is in warning state, and the checkpointTs is not progressing,
+	// the changefeed state will remain warning
+	state.PatchStatus(func(status *model.ChangeFeedStatus) (*model.ChangeFeedStatus, bool, error) {
+		require.NotNil(t, status)
+		return &model.ChangeFeedStatus{
+			CheckpointTs: 200,
+		}, true, nil
+	})
+	tester.MustApplyPatches()
+	manager.Tick(state, 0)
+	tester.MustApplyPatches()
+	require.Equal(t, model.StateWarning, state.Info.State)
+	require.True(t, manager.ShouldRunning())
+
+	// 3. Sleep for maxElapsedTimeInMs to let the backoff time out. When an error occurs after a warning,
+	// the backoff will be reset, and the changefeed state will be changed to warning and it will still
+	// keep running.
+	time.Sleep(time.Millisecond * time.Duration(maxElapsedTimeInMs))
+	state.PatchTaskPosition(ctx.GlobalVars().CaptureInfo.ID,
+		func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
+			return &model.TaskPosition{Error: &model.RunningError{
+				Addr:    ctx.GlobalVars().CaptureInfo.AdvertiseAddr,
+				Code:    "[CDC:ErrSinkManagerRunError]", // it is a fake error
+				Message: "fake error for test",
+			}}, true, nil
+		})
+	tester.MustApplyPatches()
+
+	manager.Tick(state, 0)
+	// some patches will be generated when the manager.Tick is called
+	// so we need to apply the patches before we check the state
+	tester.MustApplyPatches()
+	require.Equal(t, model.StatePending, state.Info.State)
+	require.False(t, manager.ShouldRunning())
+	manager.Tick(state, 0)
+
+	// some patches will be generated when the manager.Tick is called
+	// so we need to apply the patches before we check the state
+	tester.MustApplyPatches()
+	require.Equal(t, model.StateWarning, state.Info.State)
+	require.True(t, manager.ShouldRunning())
+}
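
Note: the core of this fix is in handleError. The old code reset the exponential backoff only when all of the last 512 recorded states were 'normal' (the sliding-window check); the patch replaces that with an isRetrying flag, so the backoff is reset exactly once, before the first retry of a failure streak, and the flag is cleared again in checkAndChangeState when the changefeed returns to normal. The following minimal, self-contained sketch (not TiCDC code; retryGate, onError, onRecovered, and the interval values are illustrative) shows the same gating pattern on top of the cenkalti/backoff/v4 API that feedStateManager already uses:

package main

import (
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
)

// retryGate reproduces the gating pattern from the patch: reset the
// exponential backoff only at the start of a failure streak.
type retryGate struct {
	errBackoff *backoff.ExponentialBackOff
	isRetrying bool
}

func newRetryGate() *retryGate {
	b := backoff.NewExponentialBackOff()
	b.InitialInterval = 100 * time.Millisecond // illustrative values,
	b.MaxInterval = 2 * time.Second            // not the feedStateManager defaults
	b.MaxElapsedTime = 10 * time.Second
	b.Multiplier = 2.0
	b.RandomizationFactor = 0
	return &retryGate{errBackoff: b}
}

// onError is analogous to the patched handleError: the backoff is reset only
// before the first retry; later errors in the same streak keep growing it.
func (g *retryGate) onError() (next time.Duration, giveUp bool) {
	if !g.isRetrying {
		g.errBackoff.Reset()
		g.isRetrying = true
	}
	next = g.errBackoff.NextBackOff()
	// NextBackOff returns backoff.Stop (-1) once MaxElapsedTime has elapsed,
	// which is when the real manager moves the changefeed to StateFailed.
	return next, next == backoff.Stop
}

// onRecovered is analogous to checkAndChangeState patching StateNormal:
// the streak is over, so the next error starts from InitialInterval again.
func (g *retryGate) onRecovered() { g.isRetrying = false }

func main() {
	g := newRetryGate()
	for i := 1; i <= 4; i++ {
		d, stop := g.onError()
		fmt.Printf("error %d: next retry in %v, give up: %v\n", i, d, stop)
	}
	g.onRecovered()
	d, _ := g.onError()
	fmt.Printf("after recovery: next retry in %v (back to InitialInterval)\n", d)
}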