From 5b3d0172b0db653a97257373fac9ce9d9699c6f5 Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Thu, 21 Sep 2023 14:24:13 +0800 Subject: [PATCH] *: fix sync isolation level to default placement rule (#7122) close tikv/pd#7121 Signed-off-by: Ryan Leung Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/mcs/scheduling/server/cluster.go | 2 +- pkg/mock/mockcluster/mockcluster.go | 2 +- pkg/schedule/checker/rule_checker_test.go | 35 ++++++++++++++++++ pkg/schedule/placement/rule_manager.go | 5 ++- pkg/schedule/placement/rule_manager_test.go | 6 ++-- pkg/statistics/region_collection_test.go | 4 +-- server/api/operator_test.go | 4 ++- server/cluster/cluster.go | 2 +- server/cluster/cluster_test.go | 10 +++--- server/config/persist_options.go | 7 ++++ server/server.go | 11 +++--- tests/pdctl/config/config_test.go | 40 ++++++++++++++++++--- 12 files changed, 104 insertions(+), 24 deletions(-) diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index 81c82d73d33..b2986f722df 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -75,7 +75,7 @@ func NewCluster(parentCtx context.Context, persistConfig *config.PersistConfig, checkMembershipCh: checkMembershipCh, } c.coordinator = schedule.NewCoordinator(ctx, c, hbStreams) - err = c.ruleManager.Initialize(persistConfig.GetMaxReplicas(), persistConfig.GetLocationLabels()) + err = c.ruleManager.Initialize(persistConfig.GetMaxReplicas(), persistConfig.GetLocationLabels(), persistConfig.GetIsolationLevel()) if err != nil { cancel() return nil, err diff --git a/pkg/mock/mockcluster/mockcluster.go b/pkg/mock/mockcluster/mockcluster.go index 1ed7ab4eb9f..01282b40534 100644 --- a/pkg/mock/mockcluster/mockcluster.go +++ b/pkg/mock/mockcluster/mockcluster.go @@ -213,7 +213,7 @@ func (mc *Cluster) AllocPeer(storeID uint64) (*metapb.Peer, error) { func (mc *Cluster) initRuleManager() { if mc.RuleManager == nil { mc.RuleManager = placement.NewRuleManager(mc.GetStorage(), mc, mc.GetSharedConfig()) - mc.RuleManager.Initialize(int(mc.GetReplicationConfig().MaxReplicas), mc.GetReplicationConfig().LocationLabels) + mc.RuleManager.Initialize(int(mc.GetReplicationConfig().MaxReplicas), mc.GetReplicationConfig().LocationLabels, mc.GetReplicationConfig().IsolationLevel) } } diff --git a/pkg/schedule/checker/rule_checker_test.go b/pkg/schedule/checker/rule_checker_test.go index cbd7624f3b1..ad140e91606 100644 --- a/pkg/schedule/checker/rule_checker_test.go +++ b/pkg/schedule/checker/rule_checker_test.go @@ -112,6 +112,41 @@ func (suite *ruleCheckerTestSuite) TestAddRulePeerWithIsolationLevel() { suite.Equal(uint64(4), op.Step(0).(operator.AddLearner).ToStore) } +func (suite *ruleCheckerTestSuite) TestReplaceDownPeerWithIsolationLevel() { + suite.cluster.SetMaxStoreDownTime(100 * time.Millisecond) + suite.cluster.AddLabelsStore(1, 1, map[string]string{"zone": "z1", "host": "h1"}) + suite.cluster.AddLabelsStore(2, 1, map[string]string{"zone": "z1", "host": "h2"}) + suite.cluster.AddLabelsStore(3, 1, map[string]string{"zone": "z2", "host": "h3"}) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"zone": "z2", "host": "h4"}) + suite.cluster.AddLabelsStore(5, 1, map[string]string{"zone": "z3", "host": "h5"}) + suite.cluster.AddLabelsStore(6, 1, map[string]string{"zone": "z3", "host": "h6"}) + suite.cluster.AddLeaderRegionWithRange(1, "", "", 1, 3, 5) + suite.ruleManager.DeleteRule("pd", "default") + suite.ruleManager.SetRule(&placement.Rule{ + GroupID: "pd", + ID: "test", + Index: 100, + Override: true, + Role: placement.Voter, + Count: 3, + LocationLabels: []string{"zone", "host"}, + IsolationLevel: "zone", + }) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.Nil(op) + region := suite.cluster.GetRegion(1) + downPeer := []*pdpb.PeerStats{ + {Peer: region.GetStorePeer(5), DownSeconds: 6000}, + } + region = region.Clone(core.WithDownPeers(downPeer)) + suite.cluster.PutRegion(region) + suite.cluster.SetStoreDown(5) + suite.cluster.SetStoreDown(6) + time.Sleep(200 * time.Millisecond) + op = suite.rc.Check(suite.cluster.GetRegion(1)) + suite.Nil(op) +} + func (suite *ruleCheckerTestSuite) TestFixPeer() { suite.cluster.AddLeaderStore(1, 1) suite.cluster.AddLeaderStore(2, 1) diff --git a/pkg/schedule/placement/rule_manager.go b/pkg/schedule/placement/rule_manager.go index 3bd272a00ac..909c0fa1078 100644 --- a/pkg/schedule/placement/rule_manager.go +++ b/pkg/schedule/placement/rule_manager.go @@ -66,7 +66,7 @@ func NewRuleManager(storage endpoint.RuleStorage, storeSetInformer core.StoreSet // Initialize loads rules from storage. If Placement Rules feature is never enabled, it creates default rule that is // compatible with previous configuration. -func (m *RuleManager) Initialize(maxReplica int, locationLabels []string) error { +func (m *RuleManager) Initialize(maxReplica int, locationLabels []string, isolationLevel string) error { m.Lock() defer m.Unlock() if m.initialized { @@ -93,6 +93,7 @@ func (m *RuleManager) Initialize(maxReplica int, locationLabels []string) error Role: Voter, Count: maxReplica - witnessCount, LocationLabels: locationLabels, + IsolationLevel: isolationLevel, }, { GroupID: "pd", @@ -101,6 +102,7 @@ func (m *RuleManager) Initialize(maxReplica int, locationLabels []string) error Count: witnessCount, IsWitness: true, LocationLabels: locationLabels, + IsolationLevel: isolationLevel, }, }..., ) @@ -111,6 +113,7 @@ func (m *RuleManager) Initialize(maxReplica int, locationLabels []string) error Role: Voter, Count: maxReplica, LocationLabels: locationLabels, + IsolationLevel: isolationLevel, }) } for _, defaultRule := range defaultRules { diff --git a/pkg/schedule/placement/rule_manager_test.go b/pkg/schedule/placement/rule_manager_test.go index e5be8d74cd2..a6454337aa8 100644 --- a/pkg/schedule/placement/rule_manager_test.go +++ b/pkg/schedule/placement/rule_manager_test.go @@ -34,7 +34,7 @@ func newTestManager(t *testing.T, enableWitness bool) (endpoint.RuleStorage, *Ru var err error manager := NewRuleManager(store, nil, mockconfig.NewTestOptions()) manager.conf.SetEnableWitness(enableWitness) - err = manager.Initialize(3, []string{"zone", "rack", "host"}) + err = manager.Initialize(3, []string{"zone", "rack", "host"}, "") re.NoError(err) return store, manager } @@ -157,7 +157,7 @@ func TestSaveLoad(t *testing.T) { } m2 := NewRuleManager(store, nil, nil) - err := m2.Initialize(3, []string{"no", "labels"}) + err := m2.Initialize(3, []string{"no", "labels"}, "") re.NoError(err) re.Len(m2.GetAllRules(), 3) re.Equal(rules[0].String(), m2.GetRule("pd", "default").String()) @@ -173,7 +173,7 @@ func TestSetAfterGet(t *testing.T) { manager.SetRule(rule) m2 := NewRuleManager(store, nil, nil) - err := m2.Initialize(100, []string{}) + err := m2.Initialize(100, []string{}, "") re.NoError(err) rule = m2.GetRule("pd", "default") re.Equal(1, rule.Count) diff --git a/pkg/statistics/region_collection_test.go b/pkg/statistics/region_collection_test.go index 232fb8b73d8..2706ffeb043 100644 --- a/pkg/statistics/region_collection_test.go +++ b/pkg/statistics/region_collection_test.go @@ -30,7 +30,7 @@ func TestRegionStatistics(t *testing.T) { re := require.New(t) store := storage.NewStorageWithMemoryBackend() manager := placement.NewRuleManager(store, nil, nil) - err := manager.Initialize(3, []string{"zone", "rack", "host"}) + err := manager.Initialize(3, []string{"zone", "rack", "host"}, "") re.NoError(err) opt := mockconfig.NewTestOptions() opt.SetPlacementRuleEnabled(false) @@ -120,7 +120,7 @@ func TestRegionStatisticsWithPlacementRule(t *testing.T) { re := require.New(t) store := storage.NewStorageWithMemoryBackend() manager := placement.NewRuleManager(store, nil, nil) - err := manager.Initialize(3, []string{"zone", "rack", "host"}) + err := manager.Initialize(3, []string{"zone", "rack", "host"}, "") re.NoError(err) opt := mockconfig.NewTestOptions() opt.SetPlacementRuleEnabled(true) diff --git a/server/api/operator_test.go b/server/api/operator_test.go index ddb605c7d87..ee849552f09 100644 --- a/server/api/operator_test.go +++ b/server/api/operator_test.go @@ -383,7 +383,9 @@ func (suite *transferRegionOperatorTestSuite) TestTransferRegionWithPlacementRul if testCase.placementRuleEnable { err := suite.svr.GetRaftCluster().GetRuleManager().Initialize( suite.svr.GetRaftCluster().GetOpts().GetMaxReplicas(), - suite.svr.GetRaftCluster().GetOpts().GetLocationLabels()) + suite.svr.GetRaftCluster().GetOpts().GetLocationLabels(), + suite.svr.GetRaftCluster().GetOpts().GetIsolationLevel(), + ) suite.NoError(err) } if len(testCase.rules) > 0 { diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 771fb03ac20..d42dbb21ed1 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -301,7 +301,7 @@ func (c *RaftCluster) Start(s Server) error { c.ruleManager = placement.NewRuleManager(c.storage, c, c.GetOpts()) if c.opt.IsPlacementRulesEnabled() { - err = c.ruleManager.Initialize(c.opt.GetMaxReplicas(), c.opt.GetLocationLabels()) + err = c.ruleManager.Initialize(c.opt.GetMaxReplicas(), c.opt.GetLocationLabels(), c.opt.GetIsolationLevel()) if err != nil { return err } diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index c9d4d0f8f61..aa826e34406 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -243,7 +243,7 @@ func TestSetOfflineStore(t *testing.T) { cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) cluster.ruleManager = placement.NewRuleManager(storage.NewStorageWithMemoryBackend(), cluster, cluster.GetOpts()) if opt.IsPlacementRulesEnabled() { - err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels()) + err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels(), opt.GetIsolationLevel()) if err != nil { panic(err) } @@ -440,7 +440,7 @@ func TestUpStore(t *testing.T) { cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) cluster.ruleManager = placement.NewRuleManager(storage.NewStorageWithMemoryBackend(), cluster, cluster.GetOpts()) if opt.IsPlacementRulesEnabled() { - err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels()) + err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels(), opt.GetIsolationLevel()) if err != nil { panic(err) } @@ -543,7 +543,7 @@ func TestDeleteStoreUpdatesClusterVersion(t *testing.T) { cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) cluster.ruleManager = placement.NewRuleManager(storage.NewStorageWithMemoryBackend(), cluster, cluster.GetOpts()) if opt.IsPlacementRulesEnabled() { - err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels()) + err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels(), opt.GetIsolationLevel()) if err != nil { panic(err) } @@ -1270,7 +1270,7 @@ func TestOfflineAndMerge(t *testing.T) { cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) cluster.ruleManager = placement.NewRuleManager(storage.NewStorageWithMemoryBackend(), cluster, cluster.GetOpts()) if opt.IsPlacementRulesEnabled() { - err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels()) + err := cluster.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels(), opt.GetIsolationLevel()) if err != nil { panic(err) } @@ -2129,7 +2129,7 @@ func newTestRaftCluster( rc.InitCluster(id, opt, s, basicCluster, nil) rc.ruleManager = placement.NewRuleManager(storage.NewStorageWithMemoryBackend(), rc, opt) if opt.IsPlacementRulesEnabled() { - err := rc.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels()) + err := rc.ruleManager.Initialize(opt.GetMaxReplicas(), opt.GetLocationLabels(), opt.GetIsolationLevel()) if err != nil { panic(err) } diff --git a/server/config/persist_options.go b/server/config/persist_options.go index 3f1c4d4a24e..14fdbf653aa 100644 --- a/server/config/persist_options.go +++ b/server/config/persist_options.go @@ -330,6 +330,13 @@ func (o *PersistOptions) SetEnableWitness(enable bool) { o.SetScheduleConfig(v) } +// SetMaxStoreDownTime to set the max store down time. It's only used to test. +func (o *PersistOptions) SetMaxStoreDownTime(time time.Duration) { + v := o.GetScheduleConfig().Clone() + v.MaxStoreDownTime = typeutil.NewDuration(time) + o.SetScheduleConfig(v) +} + // SetMaxMergeRegionSize sets the max merge region size. func (o *PersistOptions) SetMaxMergeRegionSize(maxMergeRegionSize uint64) { v := o.GetScheduleConfig().Clone() diff --git a/server/server.go b/server/server.go index 03e036a968e..2fb66387d7a 100644 --- a/server/server.go +++ b/server/server.go @@ -1030,7 +1030,7 @@ func (s *Server) SetReplicationConfig(cfg sc.ReplicationConfig) error { } if cfg.EnablePlacementRules { // initialize rule manager. - if err := rc.GetRuleManager().Initialize(int(cfg.MaxReplicas), cfg.LocationLabels); err != nil { + if err := rc.GetRuleManager().Initialize(int(cfg.MaxReplicas), cfg.LocationLabels, cfg.IsolationLevel); err != nil { return err } } else { @@ -1053,19 +1053,19 @@ func (s *Server) SetReplicationConfig(cfg sc.ReplicationConfig) error { defaultRule := rc.GetRuleManager().GetRule("pd", "default") CheckInDefaultRule := func() error { - // replication config won't work when placement rule is enabled and exceeds one default rule + // replication config won't work when placement rule is enabled and exceeds one default rule if !(defaultRule != nil && len(defaultRule.StartKey) == 0 && len(defaultRule.EndKey) == 0) { - return errors.New("cannot update MaxReplicas or LocationLabels when placement rules feature is enabled and not only default rule exists, please update rule instead") + return errors.New("cannot update MaxReplicas, LocationLabels or IsolationLevel when placement rules feature is enabled and not only default rule exists, please update rule instead") } - if !(defaultRule.Count == int(old.MaxReplicas) && typeutil.AreStringSlicesEqual(defaultRule.LocationLabels, []string(old.LocationLabels))) { + if !(defaultRule.Count == int(old.MaxReplicas) && typeutil.AreStringSlicesEqual(defaultRule.LocationLabels, []string(old.LocationLabels)) && defaultRule.IsolationLevel == old.IsolationLevel) { return errors.New("cannot to update replication config, the default rules do not consistent with replication config, please update rule instead") } return nil } - if !(cfg.MaxReplicas == old.MaxReplicas && typeutil.AreStringSlicesEqual(cfg.LocationLabels, old.LocationLabels)) { + if !(cfg.MaxReplicas == old.MaxReplicas && typeutil.AreStringSlicesEqual(cfg.LocationLabels, old.LocationLabels) && cfg.IsolationLevel == old.IsolationLevel) { if err := CheckInDefaultRule(); err != nil { return err } @@ -1076,6 +1076,7 @@ func (s *Server) SetReplicationConfig(cfg sc.ReplicationConfig) error { if rule != nil { rule.Count = int(cfg.MaxReplicas) rule.LocationLabels = cfg.LocationLabels + rule.IsolationLevel = cfg.IsolationLevel rc := s.GetRaftCluster() if rc == nil { return errs.ErrNotBootstrapped.GenWithStackByArgs() diff --git a/tests/pdctl/config/config_test.go b/tests/pdctl/config/config_test.go index 3d0146589d5..f43a964b50c 100644 --- a/tests/pdctl/config/config_test.go +++ b/tests/pdctl/config/config_test.go @@ -683,7 +683,7 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.Equal(expect, replicationCfg.MaxReplicas) } - checkLocaltionLabels := func(expect int) { + checkLocationLabels := func(expect int) { args := []string{"-u", pdAddr, "config", "show", "replication"} output, err := pdctl.ExecuteCommand(cmd, args...) re.NoError(err) @@ -692,6 +692,15 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.Len(replicationCfg.LocationLabels, expect) } + checkIsolationLevel := func(expect string) { + args := []string{"-u", pdAddr, "config", "show", "replication"} + output, err := pdctl.ExecuteCommand(cmd, args...) + re.NoError(err) + replicationCfg := sc.ReplicationConfig{} + re.NoError(json.Unmarshal(output, &replicationCfg)) + re.Equal(replicationCfg.IsolationLevel, expect) + } + checkRuleCount := func(expect int) { args := []string{"-u", pdAddr, "config", "placement-rules", "show", "--group", "pd", "--id", "default"} output, err := pdctl.ExecuteCommand(cmd, args...) @@ -710,6 +719,15 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.Len(rule.LocationLabels, expect) } + checkRuleIsolationLevel := func(expect string) { + args := []string{"-u", pdAddr, "config", "placement-rules", "show", "--group", "pd", "--id", "default"} + output, err := pdctl.ExecuteCommand(cmd, args...) + re.NoError(err) + rule := placement.Rule{} + re.NoError(json.Unmarshal(output, &rule)) + re.Equal(rule.IsolationLevel, expect) + } + // update successfully when placement rules is not enabled. output, err := pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "set", "max-replicas", "2") re.NoError(err) @@ -718,8 +736,13 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { output, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "set", "location-labels", "zone,host") re.NoError(err) re.Contains(string(output), "Success!") - checkLocaltionLabels(2) + output, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "set", "isolation-level", "zone") + re.NoError(err) + re.Contains(string(output), "Success!") + checkLocationLabels(2) checkRuleLocationLabels(2) + checkIsolationLevel("zone") + checkRuleIsolationLevel("zone") // update successfully when only one default rule exists. output, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "placement-rules", "enable") @@ -732,11 +755,18 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { checkMaxReplicas(3) checkRuleCount(3) + // We need to change isolation first because we will validate + // if the location label contains the isolation level when setting location labels. + output, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "set", "isolation-level", "host") + re.NoError(err) + re.Contains(string(output), "Success!") output, err = pdctl.ExecuteCommand(cmd, "-u", pdAddr, "config", "set", "location-labels", "host") re.NoError(err) re.Contains(string(output), "Success!") - checkLocaltionLabels(1) + checkLocationLabels(1) checkRuleLocationLabels(1) + checkIsolationLevel("host") + checkRuleIsolationLevel("host") // update unsuccessfully when many rule exists. fname := t.TempDir() @@ -760,8 +790,10 @@ func TestUpdateDefaultReplicaConfig(t *testing.T) { re.NoError(err) checkMaxReplicas(4) checkRuleCount(4) - checkLocaltionLabels(1) + checkLocationLabels(1) checkRuleLocationLabels(1) + checkIsolationLevel("host") + checkRuleIsolationLevel("host") } func TestPDServerConfig(t *testing.T) {