redo(ticdc): add meta flush interval configuration (#9959) (#9971) #10000

Merged
33 changes: 18 additions & 15 deletions cdc/api/v2/model.go
@@ -255,11 +255,12 @@ func (c *ReplicaConfig) toInternalReplicaConfigWithOriginConfig(
}
if c.Consistent != nil {
res.Consistent = &config.ConsistentConfig{
Level: c.Consistent.Level,
MaxLogSize: c.Consistent.MaxLogSize,
FlushIntervalInMs: c.Consistent.FlushIntervalInMs,
Storage: c.Consistent.Storage,
UseFileBackend: c.Consistent.UseFileBackend,
Level: c.Consistent.Level,
MaxLogSize: c.Consistent.MaxLogSize,
FlushIntervalInMs: c.Consistent.FlushIntervalInMs,
MetaFlushIntervalInMs: c.Consistent.MetaFlushIntervalInMs,
Storage: c.Consistent.Storage,
UseFileBackend: c.Consistent.UseFileBackend,
}
}
if c.Sink != nil {
@@ -455,11 +456,12 @@ func ToAPIReplicaConfig(c *config.ReplicaConfig) *ReplicaConfig {
}
if cloned.Consistent != nil {
res.Consistent = &ConsistentConfig{
Level: cloned.Consistent.Level,
MaxLogSize: cloned.Consistent.MaxLogSize,
FlushIntervalInMs: cloned.Consistent.FlushIntervalInMs,
Storage: cloned.Consistent.Storage,
UseFileBackend: cloned.Consistent.UseFileBackend,
Level: cloned.Consistent.Level,
MaxLogSize: cloned.Consistent.MaxLogSize,
FlushIntervalInMs: cloned.Consistent.FlushIntervalInMs,
MetaFlushIntervalInMs: cloned.Consistent.MetaFlushIntervalInMs,
Storage: cloned.Consistent.Storage,
UseFileBackend: cloned.Consistent.UseFileBackend,
}
}
if cloned.Mounter != nil {
@@ -627,11 +629,12 @@ type ColumnSelector struct {
// ConsistentConfig represents replication consistency config for a changefeed
// This is a duplicate of config.ConsistentConfig
type ConsistentConfig struct {
Level string `json:"level"`
MaxLogSize int64 `json:"max_log_size"`
FlushIntervalInMs int64 `json:"flush_interval"`
Storage string `json:"storage"`
UseFileBackend bool `json:"use_file_backend"`
Level string `json:"level,omitempty"`
MaxLogSize int64 `json:"max_log_size"`
FlushIntervalInMs int64 `json:"flush_interval"`
MetaFlushIntervalInMs int64 `json:"meta_flush_interval"`
Storage string `json:"storage,omitempty"`
UseFileBackend bool `json:"use_file_backend"`
}

// ChangefeedSchedulerConfig is per changefeed scheduler settings.
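For context, here is a minimal, self-contained sketch of what the new field looks like on the wire. The struct is a local copy of the ConsistentConfig shown above (copied only so the example compiles on its own), and the interval and storage values are illustrative placeholders rather than values taken from this PR; the real defaults live in the redo package (redo.DefaultFlushIntervalInMs, redo.DefaultMetaFlushIntervalInMs).

package main

import (
	"encoding/json"
	"fmt"
)

// Local copy of the v2 API struct above, kept here only so the sketch is
// self-contained. The JSON tags match the diff in cdc/api/v2/model.go.
type ConsistentConfig struct {
	Level                 string `json:"level,omitempty"`
	MaxLogSize            int64  `json:"max_log_size"`
	FlushIntervalInMs     int64  `json:"flush_interval"`
	MetaFlushIntervalInMs int64  `json:"meta_flush_interval"`
	Storage               string `json:"storage,omitempty"`
	UseFileBackend        bool   `json:"use_file_backend"`
}

func main() {
	// Placeholder values; in TiCDC the defaults come from the redo package.
	cfg := ConsistentConfig{
		Level:                 "eventual",
		MaxLogSize:            64,
		FlushIntervalInMs:     2000,
		MetaFlushIntervalInMs: 200,
		Storage:               "s3://redo-bucket/changefeed-1", // illustrative URI
		UseFileBackend:        false,
	}
	out, _ := json.MarshalIndent(cfg, "", "  ")
	fmt.Println(string(out))
}

Marshalling prints a payload whose consistent section now carries "meta_flush_interval" next to "flush_interval", matching the JSON tags added above.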
11 changes: 6 additions & 5 deletions cdc/api/v2/model_test.go
@@ -56,11 +56,12 @@ var defaultAPIConfig = &ReplicaConfig{
AdvanceTimeoutInSec: config.DefaultAdvanceTimeoutInSec,
},
Consistent: &ConsistentConfig{
Level: "none",
MaxLogSize: 64,
FlushIntervalInMs: redo.DefaultFlushIntervalInMs,
Storage: "",
UseFileBackend: false,
Level: "none",
MaxLogSize: 64,
FlushIntervalInMs: redo.DefaultFlushIntervalInMs,
MetaFlushIntervalInMs: redo.DefaultMetaFlushIntervalInMs,
Storage: "",
UseFileBackend: false,
},
}

7 changes: 4 additions & 3 deletions cdc/owner/changefeed_test.go
@@ -504,9 +504,10 @@ func TestRemoveChangefeed(t *testing.T) {
info := ctx.ChangefeedVars().Info
dir := t.TempDir()
info.Config.Consistent = &config.ConsistentConfig{
Level: "eventual",
Storage: filepath.Join("nfs://", dir),
FlushIntervalInMs: redo.DefaultFlushIntervalInMs,
Level: "eventual",
Storage: filepath.Join("nfs://", dir),
FlushIntervalInMs: redo.DefaultFlushIntervalInMs,
MetaFlushIntervalInMs: redo.DefaultMetaFlushIntervalInMs,
}
ctx = cdcContext.WithChangefeedVars(ctx, &cdcContext.ChangefeedVars{
ID: ctx.ChangefeedVars().ID,
111 changes: 65 additions & 46 deletions cdc/owner/feed_state_manager.go
@@ -63,11 +63,11 @@ type feedStateManager struct

// resolvedTs and initCheckpointTs is for checking whether resolved timestamp
// has been advanced or not.
resolvedTs model.Ts
initCheckpointTs model.Ts

resolvedTs model.Ts
checkpointTs model.Ts
checkpointTsAdvanced time.Time
lastCheckpointTs model.Ts

changefeedErrorStuckDuration time.Duration
}

// newFeedStateManager creates feedStateManager and initialize the exponential backoff
@@ -83,34 +83,61 @@ func newFeedStateManager(up *upstream.Upstream) *feedStateManager {
// backoff will stop once the defaultBackoffMaxElapsedTime has elapsed.
f.errBackoff.MaxElapsedTime = defaultBackoffMaxElapsedTime

f.resetErrRetry()
f.changefeedErrorStuckDuration = defaultBackoffMaxElapsedTime

f.resetErrRetry()
f.isRetrying = false
return f
}

func (m *feedStateManager) shouldRetry() bool {
// changefeed should not retry within [m.lastErrorRetryTime, m.lastErrorRetryTime + m.backoffInterval).
return time.Since(m.lastErrorRetryTime) >= m.backoffInterval
}

func (m *feedStateManager) shouldFailWhenRetry() bool {
// retry the changefeed
m.backoffInterval = m.errBackoff.NextBackOff()
// NextBackOff() will return -1 once the MaxElapsedTime has elapsed,
// set the changefeed to failed state.
if m.backoffInterval == m.errBackoff.Stop {
return true
}

m.lastErrorRetryTime = time.Now()
return false
}

// resetErrRetry reset the error retry related fields
func (m *feedStateManager) resetErrRetry() {
m.errBackoff.Reset()
m.backoffInterval = m.errBackoff.NextBackOff()
m.lastErrorRetryTime = time.Unix(0, 0)
}

func (m *feedStateManager) Tick(
state *orchestrator.ChangefeedReactorState,
resolvedTs model.Ts,
) (adminJobPending bool) {
m.checkAndInitLastRetryCheckpointTs(state.Status)

if state.Status != nil {
if m.lastCheckpointTs < state.Status.CheckpointTs {
m.lastCheckpointTs = state.Status.CheckpointTs
if m.checkpointTs < state.Status.CheckpointTs {
m.checkpointTs = state.Status.CheckpointTs
m.checkpointTsAdvanced = time.Now()
}
if m.state == nil || m.state.Status == nil {
// It's the first time `m.state.Status` gets filled.
m.initCheckpointTs = state.Status.CheckpointTs
if m.resolvedTs < resolvedTs {
m.resolvedTs = resolvedTs
}
if m.checkpointTs >= m.resolvedTs {
m.checkpointTsAdvanced = time.Now()
}
}

m.checkAndInitLastRetryCheckpointTs(state.Status)

m.state = state
m.resolvedTs = resolvedTs
m.shouldBeRunning = true
defer func() {
if !m.shouldBeRunning {
m.cleanUpTaskPositions()
m.cleanUp()
}
}()
if m.handleAdminJob() {
@@ -131,16 +158,12 @@ func (m *feedStateManager) Tick(
m.shouldBeRunning = false
return
case model.StatePending:
if time.Since(m.lastErrorRetryTime) < m.backoffInterval {
if !m.shouldRetry() {
m.shouldBeRunning = false
return
}
// retry the changefeed
oldBackoffInterval := m.backoffInterval
m.backoffInterval = m.errBackoff.NextBackOff()
// NextBackOff() will return -1 once the MaxElapsedTime has elapsed,
// set the changefeed to failed state.
if m.backoffInterval == m.errBackoff.Stop {

if m.shouldFailWhenRetry() {
log.Error("The changefeed won't be restarted as it has been experiencing failures for "+
"an extended duration",
zap.Duration("maxElapsedTime", m.errBackoff.MaxElapsedTime),
@@ -154,18 +177,17 @@
return
}

m.lastErrorRetryTime = time.Now()
// retry the changefeed
m.shouldBeRunning = true
if m.state.Status != nil {
m.lastErrorRetryCheckpointTs = m.state.Status.CheckpointTs
}
m.shouldBeRunning = true
m.patchState(model.StateWarning)
log.Info("changefeed retry backoff interval is elapsed,"+
"chengefeed will be restarted",
zap.String("namespace", m.state.ID.Namespace),
zap.String("changefeed", m.state.ID.ID),
zap.Time("lastErrorRetryTime", m.lastErrorRetryTime),
zap.Duration("lastRetryInterval", oldBackoffInterval),
zap.Duration("nextRetryInterval", m.backoffInterval))
case model.StateNormal, model.StateWarning:
m.checkAndChangeState()
@@ -274,7 +296,7 @@ func (m *feedStateManager) handleAdminJob() (jobsPending bool) {
m.shouldBeRunning = true
// when the changefeed is manually resumed, we must reset the backoff
m.resetErrRetry()
// The lastErrorTime also needs to be cleared before a fresh run.
m.isRetrying = false
jobsPending = true
m.patchState(model.StateNormal)

@@ -407,12 +429,15 @@ func (m *feedStateManager) patchState(feedState model.FeedState) {
})
}

func (m *feedStateManager) cleanUpTaskPositions() {
func (m *feedStateManager) cleanUp() {
for captureID := range m.state.TaskPositions {
m.state.PatchTaskPosition(captureID, func(position *model.TaskPosition) (*model.TaskPosition, bool, error) {
return nil, position != nil, nil
})
}
m.checkpointTs = 0
m.checkpointTsAdvanced = time.Time{}
m.resolvedTs = 0
}

func (m *feedStateManager) errorsReportedByProcessors() []*model.RunningError {
@@ -535,12 +560,12 @@ func (m *feedStateManager) handleError(errs ...*model.RunningError) {
info.Error = lastError
return info, true, nil
})
}

// The errBackoff needs to be reset before the first retry.
if !m.isRetrying {
m.resetErrRetry()
m.isRetrying = true
// The errBackoff needs to be reset before the first retry.
if !m.isRetrying {
m.resetErrRetry()
m.isRetrying = true
}
}
}

@@ -554,13 +579,14 @@ func (m *feedStateManager) handleWarning(errs ...*model.RunningError) {
currTime := m.upstream.PDClock.CurrentTime()
ckptTime := oracle.GetTimeFromTS(m.state.Status.CheckpointTs)
m.lastWarningReportCheckpointTs = m.state.Status.CheckpointTs
// Conditions:
// 1. checkpoint lag is large enough;
// 2. checkpoint hasn't been advanced for a long while;
// 3. the changefeed has been initialized.
if currTime.Sub(ckptTime) > defaultBackoffMaxElapsedTime &&
time.Since(m.checkpointTsAdvanced) > defaultBackoffMaxElapsedTime &&
m.resolvedTs > m.initCheckpointTs {

checkpointTsStuck := time.Since(m.checkpointTsAdvanced) > m.changefeedErrorStuckDuration
if checkpointTsStuck {
log.Info("changefeed retry on warning for a very long time and does not resume, "+
"it will be failed", zap.String("changefeed", m.state.ID.ID),
zap.Uint64("checkpointTs", m.state.Status.CheckpointTs),
zap.Duration("checkpointTime", currTime.Sub(ckptTime)),
)
code, _ := cerrors.RFCCode(cerrors.ErrChangefeedUnretryable)
m.handleError(&model.RunningError{
Time: lastError.Time,
Expand Down Expand Up @@ -592,13 +618,6 @@ func GenerateChangefeedEpoch(ctx context.Context, pdClient pd.Client) uint64 {
return oracle.ComposeTS(phyTs, logical)
}

// resetErrRetry reset the error retry related fields
func (m *feedStateManager) resetErrRetry() {
m.errBackoff.Reset()
m.backoffInterval = m.errBackoff.NextBackOff()
m.lastErrorRetryTime = time.Unix(0, 0)
}

// checkAndChangeState checks the state of the changefeed and change it if needed.
// if the state of the changefeed is warning and the changefeed's checkpointTs is
// greater than the lastRetryCheckpointTs, it will change the state to normal.
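To make the refactored retry flow above easier to follow, here is a stripped-down sketch of the gate that shouldRetry, shouldFailWhenRetry, and resetErrRetry implement. It uses github.com/cenkalti/backoff/v4, whose ExponentialBackOff exposes the NextBackOff, Reset, MaxElapsedTime, and Stop pieces that errBackoff relies on in the diff; whether that is the exact dependency and version used here is an assumption, and the retryGate type plus the interval values are purely illustrative.

package main

import (
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
)

// retryGate is a toy model of the retry bookkeeping refactored above; names
// and intervals are illustrative, not taken from this PR.
type retryGate struct {
	errBackoff         *backoff.ExponentialBackOff
	backoffInterval    time.Duration
	lastErrorRetryTime time.Time
}

func newRetryGate() *retryGate {
	g := &retryGate{errBackoff: backoff.NewExponentialBackOff()}
	g.errBackoff.InitialInterval = 100 * time.Millisecond // illustrative values
	g.errBackoff.MaxElapsedTime = time.Second
	g.reset()
	return g
}

// reset mirrors resetErrRetry: restart the backoff and allow an immediate retry.
func (g *retryGate) reset() {
	g.errBackoff.Reset()
	g.backoffInterval = g.errBackoff.NextBackOff()
	g.lastErrorRetryTime = time.Unix(0, 0)
}

// shouldRetry: no retry within [lastErrorRetryTime, lastErrorRetryTime+backoffInterval).
func (g *retryGate) shouldRetry() bool {
	return time.Since(g.lastErrorRetryTime) >= g.backoffInterval
}

// shouldFailWhenRetry: NextBackOff returns Stop (-1) once MaxElapsedTime has
// elapsed, which is the point where the changefeed would be marked failed.
func (g *retryGate) shouldFailWhenRetry() bool {
	g.backoffInterval = g.errBackoff.NextBackOff()
	if g.backoffInterval == backoff.Stop {
		return true
	}
	g.lastErrorRetryTime = time.Now()
	return false
}

func main() {
	g := newRetryGate()
	for i := 0; ; i++ {
		for !g.shouldRetry() {
			time.Sleep(10 * time.Millisecond)
		}
		if g.shouldFailWhenRetry() {
			fmt.Println("giving up after", i, "retries: MaxElapsedTime elapsed")
			return
		}
		fmt.Printf("retry %d, next backoff %v\n", i+1, g.backoffInterval)
	}
}

The behavior mirrored from the diff is that NextBackOff keeps returning growing intervals until MaxElapsedTime has elapsed, after which it returns Stop and the changefeed is marked failed instead of retried; resuming a changefeed calls resetErrRetry (and clears isRetrying) so the backoff starts fresh.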