
Commit 7514094

Merge branch 'release-7.5' into cherry-pick-8703-to-release-7.5
ti-chi-bot[bot] authored Dec 9, 2024
2 parents 5b7dafe + 81a0619 commit 7514094
Showing 7 changed files with 227 additions and 84 deletions.
pkg/mock/mockserver/mockserver.go (88 additions, 0 deletions)
@@ -0,0 +1,88 @@
// Copyright 2024 TiKV Project Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mockserver

import (
"context"

"github.com/pingcap/kvproto/pkg/pdpb"
"github.com/tikv/pd/pkg/core"
"github.com/tikv/pd/pkg/storage"
"github.com/tikv/pd/pkg/utils/grpcutil"
)

// MockServer is used to mock Server for test use.
type MockServer struct {
ctx context.Context
member, leader *pdpb.Member
storage storage.Storage
bc *core.BasicCluster
}

// NewMockServer creates a new MockServer.
func NewMockServer(ctx context.Context, member, leader *pdpb.Member, storage storage.Storage, bc *core.BasicCluster) *MockServer {
return &MockServer{
ctx: ctx,
member: member,
leader: leader,
storage: storage,
bc: bc,
}
}

// LoopContext returns the context of the server.
func (s *MockServer) LoopContext() context.Context {
return s.ctx
}

// ClusterID returns the cluster ID of the server.
func (*MockServer) ClusterID() uint64 {
return 1
}

// GetMemberInfo returns the member info of the server.
func (s *MockServer) GetMemberInfo() *pdpb.Member {
return s.member
}

// GetLeader returns the leader of the server.
func (s *MockServer) GetLeader() *pdpb.Member {
return s.leader
}

// GetStorage returns the storage of the server.
func (s *MockServer) GetStorage() storage.Storage {
return s.storage
}

// Name returns the name of the server.
func (*MockServer) Name() string {
return "mock-server"
}

// GetRegions returns the regions of the server.
func (s *MockServer) GetRegions() []*core.RegionInfo {
return s.bc.GetRegions()
}

// GetTLSConfig returns the TLS config of the server.
func (*MockServer) GetTLSConfig() *grpcutil.TLSConfig {
return &grpcutil.TLSConfig{}
}

// GetBasicCluster returns the basic cluster of the server.
func (s *MockServer) GetBasicCluster() *core.BasicCluster {
return s.bc
}
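
For reference, a minimal sketch of how this mock might be wired into a test. It is not part of the commit, and the in-memory storage and basic-cluster constructors used below (storage.NewStorageWithMemoryBackend, core.NewBasicCluster) are assumptions based on the surrounding PD packages.

// --- usage sketch, not part of the commit ---
package mockserver_test

import (
	"context"
	"testing"

	"github.com/pingcap/kvproto/pkg/pdpb"
	"github.com/tikv/pd/pkg/core"
	"github.com/tikv/pd/pkg/mock/mockserver"
	"github.com/tikv/pd/pkg/storage"
)

func TestMockServerSketch(t *testing.T) {
	ctx := context.Background()
	member := &pdpb.Member{Name: "pd-1", MemberId: 1}
	// A single-node setup can reuse the same member as the leader.
	s := mockserver.NewMockServer(
		ctx,
		member,
		member,
		storage.NewStorageWithMemoryBackend(), // assumed in-memory storage constructor
		core.NewBasicCluster(),                // assumed basic-cluster constructor
	)
	if s.ClusterID() != 1 {
		t.Fatalf("unexpected cluster ID: %d", s.ClusterID())
	}
	if s.Name() != "mock-server" {
		t.Fatalf("unexpected name: %s", s.Name())
	}
}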
pkg/replication/replication_mode.go (13 additions, 2 deletions)
@@ -212,6 +212,7 @@ const (
type drAutoSyncStatus struct {
State string `json:"state,omitempty"`
StateID uint64 `json:"state_id,omitempty"`
AsyncStartTime *time.Time `json:"async_start,omitempty"`
RecoverStartTime *time.Time `json:"recover_start,omitempty"`
TotalRegions int `json:"total_regions,omitempty"`
SyncedRegions int `json:"synced_regions,omitempty"`
@@ -262,7 +263,8 @@ func (m *ModeManager) drSwitchToAsyncWithLock(availableStores []uint64) error {
log.Warn("failed to switch to async state", zap.String("replicate-mode", modeDRAutoSync), errs.ZapError(err))
return err
}
dr := drAutoSyncStatus{State: drStateAsync, StateID: id, AvailableStores: availableStores}
now := time.Now()
dr := drAutoSyncStatus{State: drStateAsync, StateID: id, AvailableStores: availableStores, AsyncStartTime: &now}
if err := m.storage.SaveReplicationStatus(modeDRAutoSync, dr); err != nil {
log.Warn("failed to switch to async state", zap.String("replicate-mode", modeDRAutoSync), errs.ZapError(err))
return err
@@ -272,6 +274,15 @@ func (m *ModeManager) drSwitchToAsyncWithLock(availableStores []uint64) error {
return nil
}

func (m *ModeManager) drDurationSinceAsyncStart() time.Duration {
m.RLock()
defer m.RUnlock()
if m.drAutoSync.AsyncStartTime == nil {
return 0
}
return time.Since(*m.drAutoSync.AsyncStartTime)
}

func (m *ModeManager) drSwitchToSyncRecover() error {
m.Lock()
defer m.Unlock()
@@ -477,7 +488,7 @@ func (m *ModeManager) tickUpdateState() {
m.drSwitchToAsync(storeIDs[primaryUp])
}
case drStateAsync:
if canSync {
if canSync && m.drDurationSinceAsyncStart() > m.config.DRAutoSync.WaitRecoverTimeout.Duration {
m.drSwitchToSyncRecover()
break
}
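
With the change above, the async state now records when it was entered, and tickUpdateState only switches from async to sync_recover once the cluster has been in async mode for longer than WaitRecoverTimeout; when AsyncStartTime is unset, the helper reports a zero duration. Below is a stand-alone sketch of the same gating pattern, using simplified names rather than the actual PD types.

// --- stand-alone sketch of the timeout gate, not the actual PD code ---
package main

import (
	"fmt"
	"time"
)

type drStatus struct {
	state          string
	asyncStartTime *time.Time // nil if the async start was never recorded
}

func durationSinceAsyncStart(s *drStatus) time.Duration {
	if s.asyncStartTime == nil {
		return 0
	}
	return time.Since(*s.asyncStartTime)
}

func main() {
	start := time.Now().Add(-30 * time.Minute)
	s := &drStatus{state: "async", asyncStartTime: &start}
	waitRecoverTimeout := time.Hour // hypothetical WaitRecoverTimeout value

	canSync := true // all stores are reachable again
	if canSync && durationSinceAsyncStart(s) > waitRecoverTimeout {
		fmt.Println("switch to sync_recover")
	} else {
		fmt.Println("stay in async") // only 30m of the 1h wait has elapsed
	}
}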
pkg/replication/replication_mode_test.go (41 additions, 19 deletions)
@@ -16,6 +16,7 @@ package replication

import (
"context"
"encoding/json"
"errors"
"fmt"
"testing"
@@ -159,6 +160,20 @@ func newMockReplicator(ids []uint64) *mockFileReplicator {
}
}

func assertLastData(t *testing.T, data string, state string, stateID uint64, availableStores []uint64) {
type status struct {
State string `json:"state"`
StateID uint64 `json:"state_id"`
AvailableStores []uint64 `json:"available_stores"`
}
var s status
err := json.Unmarshal([]byte(data), &s)
require.NoError(t, err)
require.Equal(t, state, s.State)
require.Equal(t, stateID, s.StateID)
require.Equal(t, availableStores, s.AvailableStores)
}

func TestStateSwitch(t *testing.T) {
re := require.New(t)
ctx, cancel := context.WithCancel(context.Background())
@@ -190,7 +205,7 @@ func TestStateSwitch(t *testing.T) {
stateID := rep.drAutoSync.StateID
re.NotEqual(uint64(0), stateID)
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"sync","state_id":%d}`, stateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "sync", stateID, nil)
assertStateIDUpdate := func() {
re.NotEqual(stateID, rep.drAutoSync.StateID)
stateID = rep.drAutoSync.StateID
@@ -207,7 +222,7 @@ func TestStateSwitch(t *testing.T) {
re.Equal(drStateAsyncWait, rep.drGetState())
assertStateIDUpdate()
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2,3,4]}`, stateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "async_wait", stateID, []uint64{1, 2, 3, 4})

re.False(rep.GetReplicationStatus().GetDrAutoSync().GetPauseRegionSplit())
conf.DRAutoSync.PauseRegionSplit = true
@@ -218,7 +233,7 @@ func TestStateSwitch(t *testing.T) {
rep.tickUpdateState()
assertStateIDUpdate()
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async","state_id":%d,"available_stores":[1,2,3,4]}`, stateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "async", stateID, []uint64{1, 2, 3, 4})

// add new store in dr zone.
cluster.AddLabelsStore(5, 1, map[string]string{"zone": "zone2"})
@@ -268,18 +283,19 @@ func TestStateSwitch(t *testing.T) {
rep.tickUpdateState()
re.Equal(drStateAsyncWait, rep.drGetState())
assertStateIDUpdate()

rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2,3,4]}`, stateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "async_wait", stateID, []uint64{1, 2, 3, 4})
setStoreState(cluster, "down", "up", "up", "up", "down", "down")
rep.tickUpdateState()
assertStateIDUpdate()
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[2,3,4]}`, stateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "async_wait", stateID, []uint64{2, 3, 4})
setStoreState(cluster, "up", "down", "up", "up", "down", "down")
rep.tickUpdateState()
assertStateIDUpdate()
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,3,4]}`, stateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "async_wait", stateID, []uint64{1, 3, 4})

// async_wait -> async
rep.tickUpdateState()
@@ -291,26 +307,32 @@ func TestStateSwitch(t *testing.T) {
rep.tickUpdateState()
assertStateIDUpdate()
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async","state_id":%d,"available_stores":[1,3,4]}`, stateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "async", stateID, []uint64{1, 3, 4})

// async -> async
setStoreState(cluster, "up", "up", "up", "up", "down", "down")
rep.tickUpdateState()
// store 2 won't be available before it syncs status.
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async","state_id":%d,"available_stores":[1,3,4]}`, stateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "async", stateID, []uint64{1, 3, 4})
syncStoreStatus(1, 2, 3, 4)
rep.tickUpdateState()
assertStateIDUpdate()
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async","state_id":%d,"available_stores":[1,2,3,4]}`, stateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "async", stateID, []uint64{1, 2, 3, 4})

// async -> sync_recover
setStoreState(cluster, "up", "up", "up", "up", "up", "up")
rep.tickUpdateState()
re.Equal(drStateSyncRecover, rep.drGetState())
assertStateIDUpdate()

rep.drSwitchToAsync([]uint64{1, 2, 3, 4, 5})
rep.config.DRAutoSync.WaitRecoverTimeout = typeutil.NewDuration(time.Hour)
rep.tickUpdateState()
re.Equal(drStateAsync, rep.drGetState()) // wait recover timeout

rep.config.DRAutoSync.WaitRecoverTimeout = typeutil.NewDuration(0)
setStoreState(cluster, "down", "up", "up", "up", "up", "up")
rep.tickUpdateState()
re.Equal(drStateSyncRecover, rep.drGetState())
Expand Down Expand Up @@ -387,27 +409,27 @@ func TestReplicateState(t *testing.T) {
stateID := rep.drAutoSync.StateID
// replicate after initialized
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"sync","state_id":%d}`, stateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "sync", stateID, nil)

// replicate state to new members
replicator.memberIDs = append(replicator.memberIDs, 2, 3)
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"sync","state_id":%d}`, stateID), replicator.lastData[2])
re.Equal(fmt.Sprintf(`{"state":"sync","state_id":%d}`, stateID), replicator.lastData[3])
assertLastData(t, replicator.lastData[2], "sync", stateID, nil)
assertLastData(t, replicator.lastData[3], "sync", stateID, nil)

// inject error
replicator.errors[2] = errors.New("failed to persist")
rep.tickUpdateState() // switch async_wait since there is only one zone
newStateID := rep.drAutoSync.StateID
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2]}`, newStateID), replicator.lastData[1])
re.Equal(fmt.Sprintf(`{"state":"sync","state_id":%d}`, stateID), replicator.lastData[2])
re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2]}`, newStateID), replicator.lastData[3])
assertLastData(t, replicator.lastData[1], "async_wait", newStateID, []uint64{1, 2})
assertLastData(t, replicator.lastData[2], "sync", stateID, nil)
assertLastData(t, replicator.lastData[3], "async_wait", newStateID, []uint64{1, 2})

// clear error, replicate to node 2 next time
delete(replicator.errors, 2)
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2]}`, newStateID), replicator.lastData[2])
assertLastData(t, replicator.lastData[2], "async_wait", newStateID, []uint64{1, 2})
}

func TestAsynctimeout(t *testing.T) {
Expand Down Expand Up @@ -637,7 +659,7 @@ func TestComplexPlacementRules(t *testing.T) {
rep.tickUpdateState()
re.Equal(drStateAsyncWait, rep.drGetState())
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2,3,4,5,6]}`, rep.drAutoSync.StateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "async_wait", rep.drAutoSync.StateID, []uint64{1, 2, 3, 4, 5, 6})

// reset to sync
setStoreState(cluster, "up", "up", "up", "up", "up", "up", "up", "up", "up", "up")
@@ -698,7 +720,7 @@ func TestComplexPlacementRules2(t *testing.T) {
rep.tickUpdateState()
re.Equal(drStateAsyncWait, rep.drGetState())
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2,3,4]}`, rep.drAutoSync.StateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "async_wait", rep.drAutoSync.StateID, []uint64{1, 2, 3, 4})
}

func TestComplexPlacementRules3(t *testing.T) {
Expand Down Expand Up @@ -737,7 +759,7 @@ func TestComplexPlacementRules3(t *testing.T) {
rep.tickUpdateState()
re.Equal(drStateAsyncWait, rep.drGetState())
rep.tickReplicateStatus()
re.Equal(fmt.Sprintf(`{"state":"async_wait","state_id":%d,"available_stores":[1,2,3,4]}`, rep.drAutoSync.StateID), replicator.lastData[1])
assertLastData(t, replicator.lastData[1], "async_wait", rep.drAutoSync.StateID, []uint64{1, 2, 3, 4})
}

func genRegions(cluster *mockcluster.Cluster, stateID uint64, n int) []*core.RegionInfo {
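
A note on why the assertions above moved from exact string comparison to the parsed-JSON helper assertLastData: once the replicated status can carry additional fields such as async_start, a fmt.Sprintf-built template no longer matches the raw payload byte for byte, while unmarshalling and checking only the fields under test keeps working. A small stand-alone illustration follows; the payload values are made up.

// --- stand-alone illustration, not part of the commit ---
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Status roughly as the replicator might emit it once async_start is populated.
	data := `{"state":"async","state_id":5,"available_stores":[1,2,3,4],"async_start":"2024-12-09T00:00:00Z"}`

	// Old-style exact comparison breaks because of the extra field.
	old := fmt.Sprintf(`{"state":"async","state_id":%d,"available_stores":[1,2,3,4]}`, 5)
	fmt.Println("exact match:", data == old) // false

	// Parsing and comparing only the fields under test still passes.
	var s struct {
		State           string   `json:"state"`
		StateID         uint64   `json:"state_id"`
		AvailableStores []uint64 `json:"available_stores"`
	}
	if err := json.Unmarshal([]byte(data), &s); err != nil {
		panic(err)
	}
	fmt.Println("parsed match:", s.State == "async" && s.StateID == 5 && len(s.AvailableStores) == 4)
}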
pkg/syncer/client.go (18 additions, 2 deletions)
@@ -90,6 +90,8 @@ func (s *RegionSyncer) StartSyncWithLeader(addr string) {
go func() {
defer logutil.LogPanic()
defer s.wg.Done()
timer := time.NewTimer(retryInterval)
defer timer.Stop()
// used to load region from kv storage to cache storage.
bc := s.server.GetBasicCluster()
regionStorage := s.server.GetStorage()
@@ -140,11 +142,18 @@ func (s *RegionSyncer) StartSyncWithLeader(addr string) {
}
}
log.Error("server failed to establish sync stream with leader", zap.String("server", s.server.Name()), zap.String("leader", s.server.GetLeader().GetName()), errs.ZapError(err))
if !timer.Stop() {
select {
case <-timer.C: // try to drain from the channel
default:
}
}
timer.Reset(retryInterval)
select {
case <-ctx.Done():
log.Info("stop synchronizing with leader due to context canceled")
return
case <-time.After(retryInterval):
case <-timer.C:
}
continue
}
@@ -157,11 +166,18 @@ func (s *RegionSyncer) StartSyncWithLeader(addr string) {
if err = stream.CloseSend(); err != nil {
log.Error("failed to terminate client stream", errs.ZapError(errs.ErrGRPCCloseSend, err))
}
if !timer.Stop() {
select {
case <-timer.C: // try to drain from the channel
default:
}
}
timer.Reset(retryInterval)
select {
case <-ctx.Done():
log.Info("stop synchronizing with leader due to context canceled")
return
case <-time.After(retryInterval):
case <-timer.C:
}
break
}
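
The syncer change above replaces the per-iteration time.After(retryInterval) with a single time.Timer that is stopped, drained, and reset before each wait, so the retry loop no longer allocates a fresh timer on every attempt. Below is a simplified, stand-alone sketch of the same reusable-timer retry pattern; the names are illustrative and not the PD code.

// --- stand-alone sketch of the reusable-timer retry pattern, not the PD code ---
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

func retryLoop(ctx context.Context, retryInterval time.Duration, attempt func() error) {
	timer := time.NewTimer(retryInterval)
	defer timer.Stop()
	for {
		if err := attempt(); err == nil {
			return
		}
		// Safely reuse the timer: stop it, drain the channel if it already
		// fired, then reset it for the next wait.
		if !timer.Stop() {
			select {
			case <-timer.C:
			default:
			}
		}
		timer.Reset(retryInterval)
		select {
		case <-ctx.Done():
			fmt.Println("stop retrying: context canceled")
			return
		case <-timer.C:
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 350*time.Millisecond)
	defer cancel()
	attempts := 0
	retryLoop(ctx, 100*time.Millisecond, func() error {
		attempts++
		return errors.New("not connected yet")
	})
	fmt.Println("attempts made:", attempts)
}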
(Diffs for the remaining 3 changed files are not shown.)
