diff --git a/OWNERS_ALIASES b/OWNERS_ALIASES
new file mode 100644
index 00000000000..516a466c91e
--- /dev/null
+++ b/OWNERS_ALIASES
@@ -0,0 +1,6 @@
+# Sort the members alphabetically.
+aliases:
+  sig-critical-approvers-config:
+    - easonn7
+    - kevin-xianliu
+    - niubell
diff --git a/client/client.go b/client/client.go
index 1865fd0866e..1c8ef3cafe8 100644
--- a/client/client.go
+++ b/client/client.go
@@ -1431,17 +1431,6 @@ func (c *client) scatterRegionsWithOptions(ctx context.Context, regionsID []uint
 	return resp, nil
 }
 
-// IsLeaderChange will determine whether there is a leader change.
-func IsLeaderChange(err error) bool {
-	if err == errs.ErrClientTSOStreamClosed {
-		return true
-	}
-	errMsg := err.Error()
-	return strings.Contains(errMsg, errs.NotLeaderErr) ||
-		strings.Contains(errMsg, errs.MismatchLeaderErr) ||
-		strings.Contains(errMsg, errs.NotServedErr)
-}
-
 const (
 	httpSchemePrefix  = "http://"
 	httpsSchemePrefix = "https://"
diff --git a/client/errs/errno.go b/client/errs/errno.go
index 50c136dd5f2..0dbcb4fe147 100644
--- a/client/errs/errno.go
+++ b/client/errs/errno.go
@@ -20,21 +20,20 @@ import (
 	"github.com/pingcap/errors"
)
 
+// Note: keep the same as the ones defined on the server side to ensure the client can use them correctly.
 const (
+	// NoLeaderErr indicates there is no leader in the cluster currently.
+	NoLeaderErr = "no leader"
 	// NotLeaderErr indicates the non-leader member received the requests which should be received by leader.
-	// Note: keep the same as the ones defined on the server side, because the client side checks if an error message
-	// contains this string to judge whether the leader is changed.
-	NotLeaderErr = "is not leader"
+	NotLeaderErr = "not leader"
 	// MismatchLeaderErr indicates the non-leader member received the requests which should be received by leader.
-	// Note: keep the same as the ones defined on the server side, because the client side checks if an error message
-	// contains this string to judge whether the leader is changed.
 	MismatchLeaderErr = "mismatch leader id"
 	// NotServedErr indicates an tso node/pod received the requests for the keyspace groups which are not served by it.
-	// Note: keep the same as the ones defined on the server side, because the client side checks if an error message
-	// contains this string to judge whether the leader is changed.
 	NotServedErr = "is not served"
 	// RetryTimeoutErr indicates the server is busy.
 	RetryTimeoutErr = "retry timeout"
+	// NotPrimaryErr indicates the non-primary member received a request that should be handled by the primary.
+	NotPrimaryErr = "not primary"
 )
 
 // client errors
diff --git a/client/errs/errs.go b/client/errs/errs.go
index 47f7c29a467..da333efda4c 100644
--- a/client/errs/errs.go
+++ b/client/errs/errs.go
@@ -15,11 +15,29 @@ package errs
 
 import (
+	"strings"
+
 	"github.com/pingcap/errors"
 	"go.uber.org/zap"
 	"go.uber.org/zap/zapcore"
 )
 
+// IsLeaderChange will determine whether there is a leader/primary change.
+func IsLeaderChange(err error) bool {
+	if err == nil {
+		return false
+	}
+	if err == ErrClientTSOStreamClosed {
+		return true
+	}
+	errMsg := err.Error()
+	return strings.Contains(errMsg, NoLeaderErr) ||
+		strings.Contains(errMsg, NotLeaderErr) ||
+		strings.Contains(errMsg, MismatchLeaderErr) ||
+		strings.Contains(errMsg, NotServedErr) ||
+		strings.Contains(errMsg, NotPrimaryErr)
+}
+
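Moving `IsLeaderChange` from the top-level client into the shared `errs` package lets the gRPC, TSO, and HTTP code paths reuse a single definition; the move also adds a `nil` guard and the new `no leader`/`not primary` patterns. A small illustrative sketch of the matching behavior (the error message below is fabricated for the example):

```go
package main

import (
	"errors"
	"fmt"

	clierrs "github.com/tikv/pd/client/errs"
)

func main() {
	// Any error whose message contains "no leader", "not leader",
	// "mismatch leader id", "is not served" or "not primary" is
	// treated as a leader/primary change.
	err := errors.New("rpc error: code = Unavailable desc = not leader")
	fmt.Println(clierrs.IsLeaderChange(err)) // true
	fmt.Println(clierrs.IsLeaderChange(nil)) // false, thanks to the new nil guard
}
```

 // ZapError is used to make the log output easier.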
 func ZapError(err error, causeError ...error) zap.Field {
 	if err == nil {
diff --git a/client/http/client.go b/client/http/client.go
index 7b34193c2a4..123ca616422 100644
--- a/client/http/client.go
+++ b/client/http/client.go
@@ -120,10 +120,25 @@ func (ci *clientInner) requestWithRetry(
 	headerOpts ...HeaderOption,
 ) error {
 	var (
+		serverURL  string
+		isLeader   bool
 		statusCode int
 		err        error
+		logFields  = append(reqInfo.logFields(), zap.String("source", ci.source))
 	)
 	execFunc := func() error {
+		defer func() {
+			// A 503 status code indicates a possible PD leader/follower change, and an error
+			// message carrying leader/primary change information indicates a possible PD
+			// leader/primary change, so schedule a member change check in both cases.
+			if statusCode == http.StatusServiceUnavailable || errs.IsLeaderChange(err) {
+				ci.sd.ScheduleCheckMemberChanged()
+			}
+			log.Debug("[pd] http request finished", append(logFields,
+				zap.String("server-url", serverURL),
+				zap.Bool("is-leader", isLeader),
+				zap.Int("status-code", statusCode),
+				zap.Error(err))...)
+		}()
 		// It will try to send the request to the PD leader first and then try to send the request to the other PD followers.
 		clients := ci.sd.GetAllServiceClients()
 		if len(clients) == 0 {
@@ -131,17 +146,21 @@ func (ci *clientInner) requestWithRetry(
 		}
 		skipNum := 0
 		for _, cli := range clients {
-			url := cli.GetURL()
-			if reqInfo.targetURL != "" && reqInfo.targetURL != url {
+			serverURL = cli.GetURL()
+			isLeader = cli.IsConnectedToLeader()
+			if len(reqInfo.targetURL) > 0 && reqInfo.targetURL != serverURL {
 				skipNum++
 				continue
 			}
-			statusCode, err = ci.doRequest(ctx, url, reqInfo, headerOpts...)
+			statusCode, err = ci.doRequest(ctx, serverURL, reqInfo, headerOpts...)
 			if err == nil || noNeedRetry(statusCode) {
 				return err
 			}
-			log.Debug("[pd] request url failed",
-				zap.String("source", ci.source), zap.Bool("is-leader", cli.IsConnectedToLeader()), zap.String("url", url), zap.Error(err))
+			log.Debug("[pd] http request url failed", append(logFields,
+				zap.String("server-url", serverURL),
+				zap.Bool("is-leader", isLeader),
+				zap.Int("status-code", statusCode),
+				zap.Error(err))...)
 		}
 		if skipNum == len(clients) {
 			return errs.ErrClientNoTargetMember
@@ -169,26 +188,21 @@ func noNeedRetry(statusCode int) bool {
 
 func (ci *clientInner) doRequest(
 	ctx context.Context,
-	url string, reqInfo *requestInfo,
+	serverURL string, reqInfo *requestInfo,
 	headerOpts ...HeaderOption,
 ) (int, error) {
 	var (
-		source      = ci.source
 		callerID    = reqInfo.callerID
 		name        = reqInfo.name
 		method      = reqInfo.method
 		body        = reqInfo.body
 		res         = reqInfo.res
 		respHandler = reqInfo.respHandler
+		url         = reqInfo.getURL(serverURL)
+		logFields   = append(reqInfo.logFields(),
+			zap.String("source", ci.source),
+			zap.String("url", url))
 	)
-	url = reqInfo.getURL(url)
-	logFields := []zap.Field{
-		zap.String("source", source),
-		zap.String("name", name),
-		zap.String("url", url),
-		zap.String("method", method),
-		zap.String("caller-id", callerID),
-	}
 	log.Debug("[pd] request the http url", logFields...)
 	req, err := http.NewRequestWithContext(ctx, method, url, bytes.NewBuffer(body))
 	if err != nil {
@@ -229,11 +243,14 @@ func (ci *clientInner) doRequest(
 	if readErr != nil {
 		logFields = append(logFields, zap.NamedError("read-body-error", err))
 	} else {
+		// The API server returns a JSON body containing the detailed error message
+		// when the status code is not `http.StatusOK` (200).
+ bs = bytes.TrimSpace(bs) logFields = append(logFields, zap.ByteString("body", bs)) } log.Error("[pd] request failed with a non-200 status", logFields...) - return resp.StatusCode, errors.Errorf("request pd http api failed with status: '%s'", resp.Status) + return resp.StatusCode, errors.Errorf("request pd http api failed with status: '%s', body: '%s'", resp.Status, bs) } if res == nil { diff --git a/client/http/interface.go b/client/http/interface.go index 11c24beaefd..3684e19b1f5 100644 --- a/client/http/interface.go +++ b/client/http/interface.go @@ -49,6 +49,7 @@ type Client interface { GetRegionStatusByKeyRange(context.Context, *KeyRange, bool) (*RegionStats, error) GetStores(context.Context) (*StoresInfo, error) GetStore(context.Context, uint64) (*StoreInfo, error) + DeleteStore(context.Context, uint64) error SetStoreLabels(context.Context, int64, map[string]string) error GetHealthStatus(context.Context) ([]Health, error) /* Config-related interfaces */ @@ -440,6 +441,14 @@ func (c *client) GetStore(ctx context.Context, storeID uint64) (*StoreInfo, erro return &store, nil } +// DeleteStore deletes the store by ID. +func (c *client) DeleteStore(ctx context.Context, storeID uint64) error { + return c.request(ctx, newRequestInfo(). + WithName(deleteStoreName). + WithURI(StoreByID(storeID)). + WithMethod(http.MethodDelete)) +} + // GetClusterVersion gets the cluster version. func (c *client) GetClusterVersion(ctx context.Context) (string, error) { var version string diff --git a/client/http/request_info.go b/client/http/request_info.go index 202eab1150f..40bd0368250 100644 --- a/client/http/request_info.go +++ b/client/http/request_info.go @@ -18,6 +18,7 @@ import ( "fmt" "github.com/tikv/pd/client/retry" + "go.uber.org/zap" ) // The following constants are the names of the requests. 
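The new `DeleteStore` method above completes the store lifecycle operations exposed by the HTTP client. A minimal usage sketch (the address, caller name, and store ID are illustrative placeholders, not part of this change):

```go
package main

import (
	"context"
	"log"

	pdHttp "github.com/tikv/pd/client/http"
)

func main() {
	ctx := context.Background()
	cli := pdHttp.NewClient("example-caller", []string{"http://127.0.0.1:2379"})
	defer cli.Close()

	// DeleteStore issues `DELETE /pd/api/v1/store/{id}`. Deletion is
	// asynchronous: PD first moves the store to Offline and only
	// tombstones it after its regions have been migrated away.
	if err := cli.DeleteStore(ctx, 2); err != nil {
		log.Fatal(err)
	}
	store, err := cli.GetStore(ctx, 2)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("store 2 state after deletion: %v", store.Store.State)
}
```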
@@ -38,6 +39,7 @@ const ( getRegionStatusByKeyRangeName = "GetRegionStatusByKeyRange" getStoresName = "GetStores" getStoreName = "GetStore" + deleteStoreName = "DeleteStore" setStoreLabelsName = "SetStoreLabels" getHealthStatusName = "GetHealthStatus" getConfigName = "GetConfig" @@ -157,3 +159,13 @@ func (ri *requestInfo) WithTargetURL(targetURL string) *requestInfo { func (ri *requestInfo) getURL(addr string) string { return fmt.Sprintf("%s%s", addr, ri.uri) } + +func (ri *requestInfo) logFields() []zap.Field { + return []zap.Field{ + zap.String("caller-id", ri.callerID), + zap.String("name", ri.name), + zap.String("uri", ri.uri), + zap.String("method", ri.method), + zap.String("target-url", ri.targetURL), + } +} diff --git a/client/pd_service_discovery_test.go b/client/pd_service_discovery_test.go index f4cde0e1911..44171873b1a 100644 --- a/client/pd_service_discovery_test.go +++ b/client/pd_service_discovery_test.go @@ -29,6 +29,7 @@ import ( "github.com/pingcap/kvproto/pkg/pdpb" "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" + "github.com/tikv/pd/client/errs" "github.com/tikv/pd/client/grpcutil" "github.com/tikv/pd/client/testutil" "google.golang.org/grpc" @@ -205,7 +206,7 @@ func (suite *serviceClientTestSuite) TestServiceClient() { re.NotNil(leaderConn) _, err := pb.NewGreeterClient(followerConn).SayHello(suite.ctx, &pb.HelloRequest{Name: "pd"}) - re.ErrorContains(err, "not leader") + re.ErrorContains(err, errs.NotLeaderErr) resp, err := pb.NewGreeterClient(leaderConn).SayHello(suite.ctx, &pb.HelloRequest{Name: "pd"}) re.NoError(err) re.Equal("Hello pd", resp.GetMessage()) diff --git a/client/resource_group/controller/OWNERS b/client/resource_group/controller/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/client/resource_group/controller/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/client/resource_manager_client.go b/client/resource_manager_client.go index 872b241cfe7..98b123c0823 100644 --- a/client/resource_manager_client.go +++ b/client/resource_manager_client.go @@ -16,7 +16,6 @@ package pd import ( "context" - "strings" "time" "github.com/gogo/protobuf/proto" @@ -35,10 +34,6 @@ const ( modify actionType = 1 groupSettingsPathPrefix = "resource_group/settings" controllerConfigPathPrefix = "resource_group/controller" - // errNotPrimary is returned when the requested server is not primary. - errNotPrimary = "not primary" - // errNotLeader is returned when the requested server is not pd leader. - errNotLeader = "not leader" ) // GroupSettingsPathPrefixBytes is used to watch or get resource groups. @@ -83,7 +78,7 @@ func (c *client) resourceManagerClient() (rmpb.ResourceManagerClient, error) { // gRPCErrorHandler is used to handle the gRPC error returned by the resource manager service. 
func (c *client) gRPCErrorHandler(err error) { - if strings.Contains(err.Error(), errNotPrimary) || strings.Contains(err.Error(), errNotLeader) { + if errs.IsLeaderChange(err) { c.pdSvcDiscovery.ScheduleCheckMemberChanged() } } diff --git a/client/tlsutil/OWNERS b/client/tlsutil/OWNERS new file mode 100644 index 00000000000..211db06feee --- /dev/null +++ b/client/tlsutil/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|tlsconfig\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/client/tso_dispatcher.go b/client/tso_dispatcher.go index d5b52ad6039..0919fd84744 100644 --- a/client/tso_dispatcher.go +++ b/client/tso_dispatcher.go @@ -303,7 +303,7 @@ tsoBatchLoop: cancel() stream = nil // Because ScheduleCheckMemberChanged is asynchronous, if the leader changes, we better call `updateMember` ASAP. - if IsLeaderChange(err) { + if errs.IsLeaderChange(err) { if err := bo.Exec(ctx, svcDiscovery.CheckMemberChanged); err != nil { select { case <-ctx.Done(): diff --git a/conf/OWNERS b/conf/OWNERS new file mode 100644 index 00000000000..1a435c49089 --- /dev/null +++ b/conf/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.toml)$": + approvers: + - sig-critical-approvers-config diff --git a/errors.toml b/errors.toml index 64101000478..a61c23a6fbd 100644 --- a/errors.toml +++ b/errors.toml @@ -16,11 +16,21 @@ error = ''' redirect failed ''' +["PD:apiutil:ErrRedirectNoLeader"] +error = ''' +redirect finds no leader +''' + ["PD:apiutil:ErrRedirectToNotLeader"] error = ''' redirect to not leader ''' +["PD:apiutil:ErrRedirectToNotPrimary"] +error = ''' +redirect to not primary +''' + ["PD:autoscaling:ErrEmptyMetricsResponse"] error = ''' metrics response from Prometheus is empty diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 69afb93f531..7965a341f6c 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -2096,7 +2096,7 @@ { "format": "dtdurations", "label": null, - "logBase": 1, + "logBase": 2, "max": null, "min": "0", "show": true diff --git a/pkg/core/storelimit/store_limit.go b/pkg/core/storelimit/store_limit.go index d2c79debb9e..e35ec773d80 100644 --- a/pkg/core/storelimit/store_limit.go +++ b/pkg/core/storelimit/store_limit.go @@ -129,15 +129,15 @@ func (l *StoreRateLimit) Reset(rate float64, typ Type) { // limit the operators of a store type limit struct { - syncutil.RWMutex - limiter *ratelimit.RateLimiter - ratePerSec float64 + limiter *ratelimit.RateLimiter + ratePerSecMutex syncutil.RWMutex + ratePerSec float64 } // Reset resets the rate limit. func (l *limit) Reset(ratePerSec float64) { - l.Lock() - defer l.Unlock() + l.ratePerSecMutex.Lock() + defer l.ratePerSecMutex.Unlock() if l.ratePerSec == ratePerSec { return } @@ -159,8 +159,8 @@ func (l *limit) Reset(ratePerSec float64) { // Available returns the number of available tokens // It returns true if the rate per second is zero. func (l *limit) Available(n int64) bool { - l.RLock() - defer l.RUnlock() + l.ratePerSecMutex.RLock() + defer l.ratePerSecMutex.RUnlock() if l.ratePerSec == 0 { return true } @@ -170,8 +170,8 @@ func (l *limit) Available(n int64) bool { // Take takes count tokens from the bucket without blocking. 
func (l *limit) Take(count int64) bool { - l.RLock() - defer l.RUnlock() + l.ratePerSecMutex.RLock() + defer l.ratePerSecMutex.RUnlock() if l.ratePerSec == 0 { return true } @@ -179,7 +179,7 @@ func (l *limit) Take(count int64) bool { } func (l *limit) GetRatePerSec() float64 { - l.RLock() - defer l.RUnlock() + l.ratePerSecMutex.RLock() + defer l.ratePerSecMutex.RUnlock() return l.ratePerSec } diff --git a/pkg/encryption/OWNERS b/pkg/encryption/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/pkg/encryption/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/errs/errno.go b/pkg/errs/errno.go index 8c3e914531b..1f56a821032 100644 --- a/pkg/errs/errno.go +++ b/pkg/errs/errno.go @@ -195,10 +195,11 @@ var ( // apiutil errors var ( - ErrRedirect = errors.Normalize("redirect failed", errors.RFCCodeText("PD:apiutil:ErrRedirect")) - ErrOptionNotExist = errors.Normalize("the option %s does not exist", errors.RFCCodeText("PD:apiutil:ErrOptionNotExist")) - // ErrRedirectToNotLeader is the error message for redirect to not leader. - ErrRedirectToNotLeader = errors.Normalize("redirect to not leader", errors.RFCCodeText("PD:apiutil:ErrRedirectToNotLeader")) + ErrRedirect = errors.Normalize("redirect failed", errors.RFCCodeText("PD:apiutil:ErrRedirect")) + ErrOptionNotExist = errors.Normalize("the option %s does not exist", errors.RFCCodeText("PD:apiutil:ErrOptionNotExist")) + ErrRedirectNoLeader = errors.Normalize("redirect finds no leader", errors.RFCCodeText("PD:apiutil:ErrRedirectNoLeader")) + ErrRedirectToNotLeader = errors.Normalize("redirect to not leader", errors.RFCCodeText("PD:apiutil:ErrRedirectToNotLeader")) + ErrRedirectToNotPrimary = errors.Normalize("redirect to not primary", errors.RFCCodeText("PD:apiutil:ErrRedirectToNotPrimary")) ) // grpcutil errors diff --git a/pkg/mcs/resourcemanager/server/OWNERS b/pkg/mcs/resourcemanager/server/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/pkg/mcs/resourcemanager/server/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/mcs/resourcemanager/server/manager.go b/pkg/mcs/resourcemanager/server/manager.go index ef402b8cbf9..418d188823f 100644 --- a/pkg/mcs/resourcemanager/server/manager.go +++ b/pkg/mcs/resourcemanager/server/manager.go @@ -129,7 +129,9 @@ func (m *Manager) Init(ctx context.Context) error { return err } // Load resource group meta info from storage. 
+ m.Lock() m.groups = make(map[string]*ResourceGroup) + m.Unlock() handler := func(k, v string) { group := &rmpb.ResourceGroup{} if err := proto.Unmarshal([]byte(v), group); err != nil { diff --git a/pkg/mcs/scheduling/server/config/OWNERS b/pkg/mcs/scheduling/server/config/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/pkg/mcs/scheduling/server/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/mcs/tso/server/OWNERS b/pkg/mcs/tso/server/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/pkg/mcs/tso/server/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/schedule/config/OWNERS b/pkg/schedule/config/OWNERS new file mode 100644 index 00000000000..ce5d15ddc19 --- /dev/null +++ b/pkg/schedule/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|(config|store_config)\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/schedule/coordinator.go b/pkg/schedule/coordinator.go old mode 100755 new mode 100644 index 793a8b04aa9..249e8e8e569 --- a/pkg/schedule/coordinator.go +++ b/pkg/schedule/coordinator.go @@ -292,7 +292,7 @@ func (c *Coordinator) PatrolRegions() { c.patrolRegionContext.updateScanLimit(c.cluster) } failpoint.Inject("break-patrol", func() { - time.Sleep(3 * time.Second) // ensure the regions are handled by the workers + time.Sleep(100 * time.Millisecond) // ensure the regions are handled by the workers failpoint.Return() }) case <-c.ctx.Done(): diff --git a/pkg/schedule/operator/operator.go b/pkg/schedule/operator/operator.go index de197c4fba4..4d57d4fc6c7 100644 --- a/pkg/schedule/operator/operator.go +++ b/pkg/schedule/operator/operator.go @@ -376,10 +376,11 @@ func (o *Operator) Check(region *core.RegionInfo) OpStep { defer func() { _ = o.CheckTimeout() }() for step := atomic.LoadInt32(&o.currentStep); int(step) < len(o.steps); step++ { if o.steps[int(step)].IsFinish(region) { - if atomic.CompareAndSwapInt64(&(o.stepsTime[step]), 0, time.Now().UnixNano()) { + current := time.Now() + if atomic.CompareAndSwapInt64(&(o.stepsTime[step]), 0, current.UnixNano()) { startTime, _ := o.getCurrentTimeAndStep() operatorStepDuration.WithLabelValues(reflect.TypeOf(o.steps[int(step)]).Name()). - Observe(time.Unix(0, o.stepsTime[step]).Sub(startTime).Seconds()) + Observe(current.Sub(startTime).Seconds()) } atomic.StoreInt32(&o.currentStep, step+1) } else { diff --git a/pkg/schedule/operator/operator_controller_test.go b/pkg/schedule/operator/operator_controller_test.go index d3c50667fe0..2b16516c4c7 100644 --- a/pkg/schedule/operator/operator_controller_test.go +++ b/pkg/schedule/operator/operator_controller_test.go @@ -955,3 +955,40 @@ func (suite *operatorControllerTestSuite) TestInvalidStoreId() { // Although store 3 does not exist in PD, PD can also send op to TiKV. 
re.Equal(pdpb.OperatorStatus_RUNNING, oc.GetOperatorStatus(1).Status) } + +func TestConcurrentAddOperatorAndSetStoreLimit(t *testing.T) { + re := require.New(t) + opt := mockconfig.NewTestOptions() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + tc := mockcluster.NewCluster(ctx, opt) + stream := hbstream.NewTestHeartbeatStreams(ctx, tc.ID, tc, false /* no need to run */) + oc := NewController(ctx, tc.GetBasicCluster(), tc.GetSharedConfig(), stream) + + regionNum := 1000 + limit := 1600.0 + storeID := uint64(2) + for i := 1; i < 4; i++ { + tc.AddRegionStore(uint64(i), regionNum) + tc.SetStoreLimit(uint64(i), storelimit.AddPeer, limit) + } + for i := 1; i <= regionNum; i++ { + tc.AddLeaderRegion(uint64(i), 1, 3, 4) + } + + // Add operator and set store limit concurrently + var wg sync.WaitGroup + for i := 1; i < 10; i++ { + wg.Add(1) + go func(i uint64) { + defer wg.Done() + for j := 1; j < 10; j++ { + regionID := uint64(j) + i*100 + op := NewTestOperator(regionID, tc.GetRegion(regionID).GetRegionEpoch(), OpRegion, AddPeer{ToStore: storeID, PeerID: regionID}) + re.True(oc.AddOperator(op)) + tc.SetStoreLimit(storeID, storelimit.AddPeer, limit-float64(j)) // every goroutine set a different limit + } + }(uint64(i)) + } + wg.Wait() +} diff --git a/pkg/schedule/operator/operator_test.go b/pkg/schedule/operator/operator_test.go index 693f5c17475..1f44d813f1e 100644 --- a/pkg/schedule/operator/operator_test.go +++ b/pkg/schedule/operator/operator_test.go @@ -17,6 +17,7 @@ package operator import ( "context" "encoding/json" + "sync" "sync/atomic" "testing" "time" @@ -570,3 +571,27 @@ func (suite *operatorTestSuite) TestToJSONObject() { obj = op.ToJSONObject() suite.Equal(TIMEOUT, obj.Status) } + +func TestOperatorCheckConcurrently(t *testing.T) { + re := require.New(t) + region := newTestRegion(1, 1, [2]uint64{1, 1}, [2]uint64{2, 2}) + // addPeer1, transferLeader1, removePeer3 + steps := []OpStep{ + AddPeer{ToStore: 1, PeerID: 1}, + TransferLeader{FromStore: 3, ToStore: 1}, + RemovePeer{FromStore: 3}, + } + op := NewTestOperator(1, &metapb.RegionEpoch{}, OpAdmin|OpLeader|OpRegion, steps...) + re.Equal(constant.Urgent, op.GetPriorityLevel()) + checkSteps(re, op, steps) + op.Start() + var wg sync.WaitGroup + for i := 0; i < 10; i++ { + wg.Add(1) + go func() { + defer wg.Done() + re.Nil(op.Check(region)) + }() + } + wg.Wait() +} diff --git a/pkg/schedule/schedulers/OWNERS b/pkg/schedule/schedulers/OWNERS new file mode 100644 index 00000000000..ae96e4f1f42 --- /dev/null +++ b/pkg/schedule/schedulers/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|hot_region_config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/utils/apiutil/multiservicesapi/middleware.go b/pkg/utils/apiutil/multiservicesapi/middleware.go index ed34ecc6afb..4343adcc981 100644 --- a/pkg/utils/apiutil/multiservicesapi/middleware.go +++ b/pkg/utils/apiutil/multiservicesapi/middleware.go @@ -48,8 +48,8 @@ func ServiceRedirector() gin.HandlerFunc { // Prevent more than one redirection. 
if name := c.Request.Header.Get(ServiceRedirectorHeader); len(name) != 0 { - log.Error("redirect but server is not primary", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirect)) - c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirect.FastGenByArgs().Error()) + log.Error("redirect but server is not primary", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirectToNotPrimary)) + c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirectToNotPrimary.FastGenByArgs().Error()) return } diff --git a/pkg/utils/apiutil/serverapi/middleware.go b/pkg/utils/apiutil/serverapi/middleware.go index 1cd3d5b53d6..0718702b5a5 100755 --- a/pkg/utils/apiutil/serverapi/middleware.go +++ b/pkg/utils/apiutil/serverapi/middleware.go @@ -210,7 +210,7 @@ func (h *redirector) ServeHTTP(w http.ResponseWriter, r *http.Request, next http leader := h.waitForLeader(r) // The leader has not been elected yet. if leader == nil { - http.Error(w, "no leader", http.StatusServiceUnavailable) + http.Error(w, errs.ErrRedirectNoLeader.FastGenByArgs().Error(), http.StatusServiceUnavailable) return } // If the leader is the current server now, we can handle the request directly. @@ -222,7 +222,7 @@ func (h *redirector) ServeHTTP(w http.ResponseWriter, r *http.Request, next http r.Header.Set(apiutil.PDRedirectorHeader, h.s.Name()) } else { // Prevent more than one redirection among PD/API servers. - log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", h.s.Name()), errs.ZapError(errs.ErrRedirect)) + log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", h.s.Name()), errs.ZapError(errs.ErrRedirectToNotLeader)) http.Error(w, errs.ErrRedirectToNotLeader.FastGenByArgs().Error(), http.StatusInternalServerError) return } diff --git a/server/apiv2/middlewares/redirector.go b/server/apiv2/middlewares/redirector.go index 37c06de1585..9c2c4081175 100644 --- a/server/apiv2/middlewares/redirector.go +++ b/server/apiv2/middlewares/redirector.go @@ -43,8 +43,8 @@ func Redirector() gin.HandlerFunc { // Prevent more than one redirection. 
 		if name := c.Request.Header.Get(apiutil.PDRedirectorHeader); len(name) != 0 {
-			log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirect))
-			c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirect.FastGenByArgs().Error())
+			log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirectToNotLeader))
+			c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirectToNotLeader.FastGenByArgs().Error())
 			return
 		}
diff --git a/server/config/OWNERS b/server/config/OWNERS
new file mode 100644
index 00000000000..179de4843e6
--- /dev/null
+++ b/server/config/OWNERS
@@ -0,0 +1,7 @@
+# See the OWNERS docs at https://go.k8s.io/owners
+options:
+  no_parent_owners: true
+filters:
+  "(OWNERS|(config|service_middleware_config)\\.go)$":
+    approvers:
+      - sig-critical-approvers-config
diff --git a/server/join/join.go b/server/join/join.go
index d1711063313..1319dc08d07 100644
--- a/server/join/join.go
+++ b/server/join/join.go
@@ -136,7 +136,11 @@ func PrepareJoinCluster(cfg *config.Config) error {
 	existed := false
 	for _, m := range listResp.Members {
 		if len(m.Name) == 0 {
-			return errors.New("there is a member that has not joined successfully")
+			log.Error("there is an abnormally joined member in the current member list",
+				zap.Uint64("id", m.ID),
+				zap.Strings("peer-urls", m.PeerURLs),
+				zap.Strings("client-urls", m.ClientURLs))
+			return errors.Errorf("there is a member %d that has not joined successfully", m.ID)
 		}
 		if m.Name == cfg.Name {
 			existed = true
@@ -184,7 +188,11 @@ func PrepareJoinCluster(cfg *config.Config) error {
 			listSucc = true
 		}
 		if len(n) == 0 {
-			return errors.New("there is a member that has not joined successfully")
+			log.Error("there is an abnormally joined member in the current member list",
+				zap.Uint64("id", memb.ID),
+				zap.Strings("peer-urls", memb.PeerURLs),
+				zap.Strings("client-urls", memb.ClientURLs))
+			return errors.Errorf("there is a member %d that has not joined successfully", memb.ID)
 		}
 		for _, m := range memb.PeerURLs {
 			pds = append(pds, fmt.Sprintf("%s=%s", n, m))
diff --git a/tests/integrations/client/client_test.go b/tests/integrations/client/client_test.go
index dfe7a6980c7..65acd897726 100644
--- a/tests/integrations/client/client_test.go
+++ b/tests/integrations/client/client_test.go
@@ -40,6 +40,7 @@ import (
 	"github.com/stretchr/testify/require"
 	"github.com/stretchr/testify/suite"
 	pd "github.com/tikv/pd/client"
+	clierrs "github.com/tikv/pd/client/errs"
 	"github.com/tikv/pd/client/retry"
 	"github.com/tikv/pd/pkg/core"
 	"github.com/tikv/pd/pkg/errs"
@@ -528,7 +529,7 @@ func TestGlobalAndLocalTSO(t *testing.T) {
 	re.NotEmpty(cluster.WaitLeader())
 	_, _, err = cli.GetTS(ctx)
 	re.Error(err)
-	re.True(pd.IsLeaderChange(err))
+	re.True(clierrs.IsLeaderChange(err))
 	_, _, err = cli.GetTS(ctx)
 	re.NoError(err)
 	re.NoError(failpoint.Disable("github.com/tikv/pd/client/skipUpdateMember"))
diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go
index 9d7e0985940..f4a48dcd63e 100644
--- a/tests/integrations/client/http_client_test.go
+++ b/tests/integrations/client/http_client_test.go
@@ -26,6 +26,7 @@ import (
 	"time"
 
 	"github.com/pingcap/errors"
+	"github.com/pingcap/kvproto/pkg/metapb"
 	"github.com/prometheus/client_golang/prometheus"
 	dto "github.com/prometheus/client_model/go"
 	"github.com/stretchr/testify/require"
@@ -80,6 +81,15 @@ func (suite *httpClientTestSuite) SetupSuite() {
 	leaderServer := cluster.GetLeaderServer()
 	err = leaderServer.BootstrapCluster()
+	// Add 3 more stores to the cluster.
+	for i := 2; i <= 4; i++ {
+		tests.MustPutStore(re, cluster, &metapb.Store{
+			Id:            uint64(i),
+			State:         metapb.StoreState_Up,
+			NodeState:     metapb.NodeState_Serving,
+			LastHeartbeat: time.Now().UnixNano(),
+		})
+	}
 	re.NoError(err)
 	for _, region := range []*core.RegionInfo{
 		core.NewTestRegionInfo(10, 1, []byte("a1"), []byte("a2")),
@@ -165,29 +175,29 @@ func (suite *httpClientTestSuite) TestMeta() {
 	re.Empty(regionStats.StoreLeaderCount)
 	hotReadRegions, err := client.GetHotReadRegions(ctx)
 	re.NoError(err)
-	re.Len(hotReadRegions.AsPeer, 1)
-	re.Len(hotReadRegions.AsLeader, 1)
+	re.Len(hotReadRegions.AsPeer, 4)
+	re.Len(hotReadRegions.AsLeader, 4)
 	hotWriteRegions, err := client.GetHotWriteRegions(ctx)
 	re.NoError(err)
-	re.Len(hotWriteRegions.AsPeer, 1)
-	re.Len(hotWriteRegions.AsLeader, 1)
+	re.Len(hotWriteRegions.AsPeer, 4)
+	re.Len(hotWriteRegions.AsLeader, 4)
 	historyHorRegions, err := client.GetHistoryHotRegions(ctx, &pd.HistoryHotRegionsRequest{
 		StartTime: 0,
 		EndTime:   time.Now().AddDate(0, 0, 1).UnixNano() / int64(time.Millisecond),
 	})
 	re.NoError(err)
 	re.Empty(historyHorRegions.HistoryHotRegion)
-	store, err := client.GetStores(ctx)
+	stores, err := client.GetStores(ctx)
 	re.NoError(err)
-	re.Equal(1, store.Count)
-	re.Len(store.Stores, 1)
-	storeID := uint64(store.Stores[0].Store.ID) // TODO: why type is different?
+	re.Equal(4, stores.Count)
+	re.Len(stores.Stores, 4)
+	storeID := uint64(stores.Stores[0].Store.ID) // TODO: why type is different?
 	store2, err := client.GetStore(ctx, storeID)
 	re.NoError(err)
 	re.EqualValues(storeID, store2.Store.ID)
 	version, err := client.GetClusterVersion(ctx)
 	re.NoError(err)
-	re.Equal("0.0.0", version)
+	re.Equal("1.0.0", version)
 	rgs, _ := client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a"), []byte("a1")), 100)
 	re.Equal(int64(0), rgs.Count)
 	rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), 100)
@@ -196,6 +206,12 @@ func (suite *httpClientTestSuite) TestMeta() {
 	re.Equal(int64(1), rgs.Count)
 	rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte(""), []byte("")), 100)
 	re.Equal(int64(2), rgs.Count)
+	// Store 2 should be in the Offline state after being deleted.
+	err = client.DeleteStore(ctx, 2)
+	re.NoError(err)
+	store2, err = client.GetStore(ctx, 2)
+	re.NoError(err)
+	re.Equal(int64(metapb.StoreState_Offline), store2.Store.State)
 }
 
 func (suite *httpClientTestSuite) TestGetMinResolvedTSByStoresIDs() {
diff --git a/tests/integrations/mcs/tso/keyspace_group_manager_test.go b/tests/integrations/mcs/tso/keyspace_group_manager_test.go
index 25d9516bf63..6d861962d9b 100644
--- a/tests/integrations/mcs/tso/keyspace_group_manager_test.go
+++ b/tests/integrations/mcs/tso/keyspace_group_manager_test.go
@@ -28,6 +28,7 @@ import (
 	"github.com/stretchr/testify/require"
 	"github.com/stretchr/testify/suite"
 	pd "github.com/tikv/pd/client"
+	clierrs "github.com/tikv/pd/client/errs"
 	"github.com/tikv/pd/pkg/election"
 	"github.com/tikv/pd/pkg/errs"
 	mcsutils "github.com/tikv/pd/pkg/mcs/utils"
@@ -467,8 +468,8 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) dispatchClient(
 			errMsg := err.Error()
 			// Ignore the errors caused by the split and context cancellation.
if strings.Contains(errMsg, "context canceled") || - strings.Contains(errMsg, "not leader") || - strings.Contains(errMsg, "not served") || + strings.Contains(errMsg, clierrs.NotLeaderErr) || + strings.Contains(errMsg, clierrs.NotServedErr) || strings.Contains(errMsg, "ErrKeyspaceNotAssigned") || strings.Contains(errMsg, "ErrKeyspaceGroupIsMerging") { continue diff --git a/tests/integrations/realcluster/deploy.sh b/tests/integrations/realcluster/deploy.sh index d6cd0b27f72..8cce60e8ee6 100755 --- a/tests/integrations/realcluster/deploy.sh +++ b/tests/integrations/realcluster/deploy.sh @@ -15,10 +15,12 @@ curl --proto '=https' --tlsv1.2 -sSf https://tiup-mirrors.pingcap.com/install.sh $TIUP_BIN_DIR update playground cd ../../.. -if [ ! -d "bin" ] || [ ! -e "bin/tikv-server" ] && [ ! -e "bin/tidb-server" ] && [ ! -e "bin/pd-server" ] && [ ! -e "bin/tiflash" ]; then +if [ ! -d "bin" ] || [ ! -e "bin/tikv-server" ] && [ ! -e "bin/tidb-server" ] && [ ! -e "bin/tiflash" ]; then color-green "downloading binaries..." color-green "this may take a few minutes, you can also download them manually and put them in the bin directory." + make pd-server WITH_RACE=1 $TIUP_BIN_DIR playground nightly --kv 3 --tiflash 1 --db 1 --pd 3 --without-monitor --tag pd_test \ + --pd.binpath ./bin/pd-server \ > $CUR_PATH/playground.log 2>&1 & else color-green "using existing binaries..." diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go index 07bcf3ee2a1..9e70a52d11d 100644 --- a/tests/server/cluster/cluster_test.go +++ b/tests/server/cluster/cluster_test.go @@ -662,7 +662,7 @@ func TestNotLeader(t *testing.T) { grpcStatus, ok := status.FromError(err) re.True(ok) re.Equal(codes.Unavailable, grpcStatus.Code()) - re.Equal("not leader", grpcStatus.Message()) + re.ErrorContains(server.ErrNotLeader, grpcStatus.Message()) } func TestStoreVersionChange(t *testing.T) { diff --git a/tools/pd-ctl/pdctl/command/global.go b/tools/pd-ctl/pdctl/command/global.go index f7c04c3ca5c..b29e2b63278 100644 --- a/tools/pd-ctl/pdctl/command/global.go +++ b/tools/pd-ctl/pdctl/command/global.go @@ -55,23 +55,15 @@ var PDCli pd.Client func requirePDClient(cmd *cobra.Command, _ []string) error { var ( - caPath string - err error + tlsConfig *tls.Config + err error ) - caPath, err = cmd.Flags().GetString("cacert") - if err == nil && len(caPath) != 0 { - var certPath, keyPath string - certPath, err = cmd.Flags().GetString("cert") - if err != nil { - return err - } - keyPath, err = cmd.Flags().GetString("key") - if err != nil { - return err - } - return initNewPDClientWithTLS(cmd, caPath, certPath, keyPath) + tlsConfig, err = parseTLSConfig(cmd) + if err != nil { + return err } - return initNewPDClient(cmd) + + return initNewPDClient(cmd, pd.WithTLSConfig(tlsConfig)) } // shouldInitPDClient checks whether we should create a new PD client according to the cluster information. @@ -111,44 +103,36 @@ func initNewPDClient(cmd *cobra.Command, opts ...pd.ClientOption) error { return nil } -func initNewPDClientWithTLS(cmd *cobra.Command, caPath, certPath, keyPath string) error { - tlsConfig, err := initTLSConfig(caPath, certPath, keyPath) - if err != nil { - return err - } - initNewPDClient(cmd, pd.WithTLSConfig(tlsConfig)) - return nil -} - // TODO: replace dialClient with the PD HTTP client completely. 
var dialClient = &http.Client{ Transport: apiutil.NewCallerIDRoundTripper(http.DefaultTransport, PDControlCallerID), } -// RequireHTTPSClient creates a HTTPS client if the related flags are set -func RequireHTTPSClient(cmd *cobra.Command, _ []string) error { +func parseTLSConfig(cmd *cobra.Command) (*tls.Config, error) { caPath, err := cmd.Flags().GetString("cacert") - if err == nil && len(caPath) != 0 { - certPath, err := cmd.Flags().GetString("cert") - if err != nil { - return err - } - keyPath, err := cmd.Flags().GetString("key") - if err != nil { - return err - } - err = initHTTPSClient(caPath, certPath, keyPath) - if err != nil { - cmd.Println(err) - return err - } + if err != nil || len(caPath) == 0 { + return nil, err + } + certPath, err := cmd.Flags().GetString("cert") + if err != nil { + return nil, err + } + keyPath, err := cmd.Flags().GetString("key") + if err != nil { + return nil, err } - return nil -} - -func initHTTPSClient(caPath, certPath, keyPath string) error { tlsConfig, err := initTLSConfig(caPath, certPath, keyPath) if err != nil { + return nil, err + } + + return tlsConfig, nil +} + +// RequireHTTPSClient creates a HTTPS client if the related flags are set +func RequireHTTPSClient(cmd *cobra.Command, _ []string) error { + tlsConfig, err := parseTLSConfig(cmd) + if err != nil || tlsConfig == nil { return err } dialClient = &http.Client{ diff --git a/tools/pd-ctl/pdctl/command/global_test.go b/tools/pd-ctl/pdctl/command/global_test.go new file mode 100644 index 00000000000..86eb4366d04 --- /dev/null +++ b/tools/pd-ctl/pdctl/command/global_test.go @@ -0,0 +1,58 @@ +// Copyright 2024 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+package command + +import ( + "os" + "os/exec" + "testing" + + "github.com/spf13/cobra" + "github.com/stretchr/testify/require" +) + +func TestParseTLSConfig(t *testing.T) { + re := require.New(t) + + rootCmd := &cobra.Command{ + Use: "pd-ctl", + Short: "Placement Driver control", + SilenceErrors: true, + } + certPath := "../../tests/cert" + rootCmd.Flags().String("cacert", certPath+"/ca.pem", "path of file that contains list of trusted SSL CAs") + rootCmd.Flags().String("cert", certPath+"/client.pem", "path of file that contains X509 certificate in PEM format") + rootCmd.Flags().String("key", certPath+"/client-key.pem", "path of file that contains X509 key in PEM format") + + // generate certs + if err := os.Mkdir(certPath, 0755); err != nil { + t.Fatal(err) + } + certScript := "../../tests/cert_opt.sh" + if err := exec.Command(certScript, "generate", certPath).Run(); err != nil { + t.Fatal(err) + } + defer func() { + if err := exec.Command(certScript, "cleanup", certPath).Run(); err != nil { + t.Fatal(err) + } + if err := os.RemoveAll(certPath); err != nil { + t.Fatal(err) + } + }() + + tlsConfig, err := parseTLSConfig(rootCmd) + re.NoError(err) + re.NotNil(tlsConfig) +} diff --git a/tools/pd-ctl/pdctl/ctl.go b/tools/pd-ctl/pdctl/ctl.go index f8eaff5e76e..fbacd65dc53 100644 --- a/tools/pd-ctl/pdctl/ctl.go +++ b/tools/pd-ctl/pdctl/ctl.go @@ -30,6 +30,7 @@ import ( func init() { cobra.EnablePrefixMatching = true + cobra.EnableTraverseRunHooks = true } // GetRootCmd is exposed for integration tests. But it can be embedded into another suite, too. diff --git a/tools/pd-ctl/tests/health/health_test.go b/tools/pd-ctl/tests/health/health_test.go index 9150a56c91b..f1d3c7cfbf1 100644 --- a/tools/pd-ctl/tests/health/health_test.go +++ b/tools/pd-ctl/tests/health/health_test.go @@ -17,14 +17,21 @@ package health_test import ( "context" "encoding/json" + "os" + "os/exec" + "path/filepath" + "strings" "testing" "github.com/stretchr/testify/require" + "github.com/tikv/pd/pkg/utils/grpcutil" "github.com/tikv/pd/server/api" "github.com/tikv/pd/server/cluster" + "github.com/tikv/pd/server/config" pdTests "github.com/tikv/pd/tests" ctl "github.com/tikv/pd/tools/pd-ctl/pdctl" "github.com/tikv/pd/tools/pd-ctl/tests" + "go.etcd.io/etcd/pkg/transport" ) func TestHealth(t *testing.T) { @@ -68,3 +75,80 @@ func TestHealth(t *testing.T) { re.NoError(json.Unmarshal(output, &h)) re.Equal(healths, h) } + +func TestHealthTLS(t *testing.T) { + re := require.New(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + certPath := "../cert" + certScript := "../cert_opt.sh" + // generate certs + if err := os.Mkdir(certPath, 0755); err != nil { + t.Fatal(err) + } + if err := exec.Command(certScript, "generate", certPath).Run(); err != nil { + t.Fatal(err) + } + defer func() { + if err := exec.Command(certScript, "cleanup", certPath).Run(); err != nil { + t.Fatal(err) + } + if err := os.RemoveAll(certPath); err != nil { + t.Fatal(err) + } + }() + + tlsInfo := transport.TLSInfo{ + KeyFile: filepath.Join(certPath, "pd-server-key.pem"), + CertFile: filepath.Join(certPath, "pd-server.pem"), + TrustedCAFile: filepath.Join(certPath, "ca.pem"), + } + tc, err := pdTests.NewTestCluster(ctx, 1, func(conf *config.Config, _ string) { + conf.Security.TLSConfig = grpcutil.TLSConfig{ + KeyPath: tlsInfo.KeyFile, + CertPath: tlsInfo.CertFile, + CAPath: tlsInfo.TrustedCAFile, + } + conf.AdvertiseClientUrls = strings.ReplaceAll(conf.AdvertiseClientUrls, "http", "https") + conf.ClientUrls = 
strings.ReplaceAll(conf.ClientUrls, "http", "https") + conf.AdvertisePeerUrls = strings.ReplaceAll(conf.AdvertisePeerUrls, "http", "https") + conf.PeerUrls = strings.ReplaceAll(conf.PeerUrls, "http", "https") + conf.InitialCluster = strings.ReplaceAll(conf.InitialCluster, "http", "https") + }) + re.NoError(err) + defer tc.Destroy() + err = tc.RunInitialServers() + re.NoError(err) + tc.WaitLeader() + cmd := ctl.GetRootCmd() + + client := tc.GetEtcdClient() + members, err := cluster.GetMembers(client) + re.NoError(err) + healthMembers := cluster.CheckHealth(tc.GetHTTPClient(), members) + healths := []api.Health{} + for _, member := range members { + h := api.Health{ + Name: member.Name, + MemberID: member.MemberId, + ClientUrls: member.ClientUrls, + Health: false, + } + if _, ok := healthMembers[member.GetMemberId()]; ok { + h.Health = true + } + healths = append(healths, h) + } + + pdAddr := tc.GetConfig().GetClientURL() + pdAddr = strings.ReplaceAll(pdAddr, "http", "https") + args := []string{"-u", pdAddr, "health", + "--cacert=../cert/ca.pem", + "--cert=../cert/client.pem", + "--key=../cert/client-key.pem"} + output, err := tests.ExecuteCommand(cmd, args...) + re.NoError(err) + h := make([]api.Health, len(healths)) + re.NoError(json.Unmarshal(output, &h)) + re.Equal(healths, h) +} diff --git a/tools/pd-simulator/main.go b/tools/pd-simulator/main.go index 45b3ecd75c9..05763cc93b8 100644 --- a/tools/pd-simulator/main.go +++ b/tools/pd-simulator/main.go @@ -25,6 +25,7 @@ import ( "github.com/BurntSushi/toml" "github.com/pingcap/log" flag "github.com/spf13/pflag" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/statistics" "github.com/tikv/pd/pkg/utils/logutil" @@ -92,6 +93,7 @@ func main() { func run(simCase string, simConfig *sc.SimConfig) { if *pdAddr != "" { + simulator.PDHTTPClient = pdHttp.NewClient("pd-simulator", []string{*pdAddr}) simStart(*pdAddr, *statusAddress, simCase, simConfig) } else { local, clean := NewSingleServer(context.Background(), simConfig) @@ -105,6 +107,7 @@ func run(simCase string, simConfig *sc.SimConfig) { } time.Sleep(100 * time.Millisecond) } + simulator.PDHTTPClient = pdHttp.NewClient("pd-simulator", []string{local.GetAddr()}) simStart(local.GetAddr(), "", simCase, simConfig, clean) } } @@ -151,6 +154,8 @@ func simStart(pdAddr, statusAddress string, simCase string, simConfig *sc.SimCon tick := time.NewTicker(tickInterval) defer tick.Stop() sc := make(chan os.Signal, 1) + // halt scheduling + simulator.ChooseToHaltPDSchedule(true) signal.Notify(sc, syscall.SIGHUP, syscall.SIGINT, @@ -183,6 +188,9 @@ EXIT: analysis.GetTransferCounter().PrintResult() } + if simulator.PDHTTPClient != nil { + simulator.PDHTTPClient.Close() + } if simResult != "OK" { os.Exit(1) } diff --git a/tools/pd-simulator/simulator/cases/cases.go b/tools/pd-simulator/simulator/cases/cases.go index 00b5404669f..c4e2f999978 100644 --- a/tools/pd-simulator/simulator/cases/cases.go +++ b/tools/pd-simulator/simulator/cases/cases.go @@ -16,8 +16,8 @@ package cases import ( "github.com/pingcap/kvproto/pkg/metapb" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" - "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" @@ -57,7 +57,7 @@ type Case struct { TableNumber int Checker CheckerFunc // To check the schedule is finished. 
- Rules []*placement.Rule + Rules []*pdHttp.Rule Labels typeutil.StringSlice } diff --git a/tools/pd-simulator/simulator/cases/diagnose_rule.go b/tools/pd-simulator/simulator/cases/diagnose_rule.go index 5d34e051071..2cd11b9624a 100644 --- a/tools/pd-simulator/simulator/cases/diagnose_rule.go +++ b/tools/pd-simulator/simulator/cases/diagnose_rule.go @@ -19,6 +19,7 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" @@ -30,15 +31,15 @@ import ( func newRule1(_ *sc.SimConfig) *Case { var simCase Case - simCase.Rules = make([]*placement.Rule, 0) - simCase.Rules = append(simCase.Rules, &placement.Rule{ + simCase.Rules = make([]*pdHttp.Rule, 0) + simCase.Rules = append(simCase.Rules, &pdHttp.Rule{ GroupID: "test1", ID: "test1", StartKeyHex: "", EndKeyHex: "", - Role: placement.Learner, + Role: pdHttp.Learner, Count: 1, - LabelConstraints: []placement.LabelConstraint{ + LabelConstraints: []pdHttp.LabelConstraint{ { Key: "region", Op: "in", @@ -46,14 +47,14 @@ func newRule1(_ *sc.SimConfig) *Case { }, }, LocationLabels: []string{"host"}, - }, &placement.Rule{ + }, &pdHttp.Rule{ GroupID: placement.DefaultGroupID, ID: placement.DefaultRuleID, StartKeyHex: "", EndKeyHex: "", - Role: placement.Voter, + Role: pdHttp.Voter, Count: 5, - LabelConstraints: []placement.LabelConstraint{ + LabelConstraints: []pdHttp.LabelConstraint{ { Key: "region", Op: "in", @@ -130,16 +131,16 @@ func newRule1(_ *sc.SimConfig) *Case { func newRule2(_ *sc.SimConfig) *Case { var simCase Case - simCase.Rules = make([]*placement.Rule, 0) + simCase.Rules = make([]*pdHttp.Rule, 0) simCase.Rules = append(simCase.Rules, - &placement.Rule{ + &pdHttp.Rule{ GroupID: "test1", ID: "test1", StartKeyHex: "", EndKeyHex: "", - Role: placement.Leader, + Role: pdHttp.Leader, Count: 1, - LabelConstraints: []placement.LabelConstraint{ + LabelConstraints: []pdHttp.LabelConstraint{ { Key: "region", Op: "in", diff --git a/tools/pd-simulator/simulator/client.go b/tools/pd-simulator/simulator/client.go index 50ed57995df..0bbbebe4602 100644 --- a/tools/pd-simulator/simulator/client.go +++ b/tools/pd-simulator/simulator/client.go @@ -15,11 +15,8 @@ package simulator import ( - "bytes" "context" - "encoding/json" - "fmt" - "net/http" + "strconv" "strings" "sync" "time" @@ -27,8 +24,8 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" - "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" @@ -54,12 +51,12 @@ type Client interface { const ( pdTimeout = time.Second maxInitClusterRetries = 100 - httpPrefix = "pd/api/v1" ) var ( // errFailInitClusterID is returned when failed to load clusterID from all supplied PD addresses. 
errFailInitClusterID = errors.New("[pd] failed to get cluster id") + PDHTTPClient pdHttp.Client ) type client struct { @@ -67,7 +64,6 @@ type client struct { tag string clusterID uint64 clientConn *grpc.ClientConn - httpClient *http.Client reportRegionHeartbeatCh chan *core.RegionInfo receiveRegionHeartbeatCh chan *pdpb.RegionHeartbeatResponse @@ -88,7 +84,6 @@ func NewClient(pdAddr string, tag string) (Client, <-chan *pdpb.RegionHeartbeatR ctx: ctx, cancel: cancel, tag: tag, - httpClient: &http.Client{}, } cc, err := c.createConn() if err != nil { @@ -319,46 +314,27 @@ func (c *client) PutStore(ctx context.Context, store *metapb.Store) error { func (c *client) PutPDConfig(config *sc.PDConfig) error { if len(config.PlacementRules) > 0 { - path := fmt.Sprintf("%s/%s/config/rules/batch", c.url, httpPrefix) - ruleOps := make([]*placement.RuleOp, 0) + ruleOps := make([]*pdHttp.RuleOp, 0) for _, rule := range config.PlacementRules { - ruleOps = append(ruleOps, &placement.RuleOp{ + ruleOps = append(ruleOps, &pdHttp.RuleOp{ Rule: rule, - Action: placement.RuleOpAdd, + Action: pdHttp.RuleOpAdd, }) } - content, _ := json.Marshal(ruleOps) - req, err := http.NewRequest(http.MethodPost, path, bytes.NewBuffer(content)) - req.Header.Add("Content-Type", "application/json") + err := PDHTTPClient.SetPlacementRuleInBatch(c.ctx, ruleOps) if err != nil { return err } - res, err := c.httpClient.Do(req) - if err != nil { - return err - } - defer res.Body.Close() - simutil.Logger.Info("add placement rule success", zap.String("rules", string(content))) + simutil.Logger.Info("add placement rule success", zap.Any("rules", config.PlacementRules)) } if len(config.LocationLabels) > 0 { - path := fmt.Sprintf("%s/%s/config", c.url, httpPrefix) data := make(map[string]any) data["location-labels"] = config.LocationLabels - content, err := json.Marshal(data) - if err != nil { - return err - } - req, err := http.NewRequest(http.MethodPost, path, bytes.NewBuffer(content)) - req.Header.Add("Content-Type", "application/json") + err := PDHTTPClient.SetConfig(c.ctx, data) if err != nil { return err } - res, err := c.httpClient.Do(req) - if err != nil { - return err - } - defer res.Body.Close() - simutil.Logger.Info("add location labels success", zap.String("labels", string(content))) + simutil.Logger.Info("add location labels success", zap.Any("labels", config.LocationLabels)) } return nil } @@ -391,3 +367,9 @@ func (c *client) requestHeader() *pdpb.RequestHeader { ClusterId: c.clusterID, } } + +func ChooseToHaltPDSchedule(halt bool) { + PDHTTPClient.SetConfig(context.Background(), map[string]any{ + "schedule.halt-scheduling": strconv.FormatBool(halt), + }) +} diff --git a/tools/pd-simulator/simulator/config/config.go b/tools/pd-simulator/simulator/config/config.go index 01bf8199ab4..6598cf35c0f 100644 --- a/tools/pd-simulator/simulator/config/config.go +++ b/tools/pd-simulator/simulator/config/config.go @@ -21,8 +21,8 @@ import ( "github.com/BurntSushi/toml" "github.com/docker/go-units" + pdHttp "github.com/tikv/pd/client/http" sc "github.com/tikv/pd/pkg/schedule/config" - "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/configutil" "github.com/tikv/pd/pkg/utils/tempurl" "github.com/tikv/pd/pkg/utils/typeutil" @@ -133,6 +133,6 @@ func (sc *SimConfig) Speed() uint64 { // PDConfig saves some config which may be changed in PD. 
 type PDConfig struct {
-	PlacementRules []*placement.Rule
+	PlacementRules []*pdHttp.Rule
 	LocationLabels typeutil.StringSlice
 }
diff --git a/tools/pd-simulator/simulator/event.go b/tools/pd-simulator/simulator/event.go
index 8be8f89d759..20c75b58384 100644
--- a/tools/pd-simulator/simulator/event.go
+++ b/tools/pd-simulator/simulator/event.go
@@ -216,6 +216,12 @@ func (*DownNode) Run(raft *RaftEngine, _ int64) bool {
 		return false
 	}
 	delete(raft.conn.Nodes, node.Id)
+	// Delete the store from PD as well.
+	err := PDHTTPClient.DeleteStore(context.Background(), node.Id)
+	if err != nil {
+		simutil.Logger.Error("delete store failed", zap.Uint64("node-id", node.Id), zap.Error(err))
+		return false
+	}
 	node.Stop()
 
 	regions := raft.GetRegions()
diff --git a/tools/pd-simulator/simulator/node.go b/tools/pd-simulator/simulator/node.go
index 883b5d4474b..fe8dc74a944 100644
--- a/tools/pd-simulator/simulator/node.go
+++ b/tools/pd-simulator/simulator/node.go
@@ -72,6 +72,7 @@ func NewNode(s *cases.Store, pdAddr string, config *sc.SimConfig) (*Node, error)
 			StoreId:   s.ID,
 			Capacity:  uint64(config.RaftStore.Capacity),
 			StartTime: uint32(time.Now().Unix()),
+			Available: uint64(config.RaftStore.Capacity),
 		},
 	}
 	tag := fmt.Sprintf("store %d", s.ID)
@@ -171,6 +172,8 @@ func (n *Node) stepTask() {
 	}
 }
 
+var schedulerCheck sync.Once
+
 func (n *Node) stepHeartBeat() {
 	config := n.raftEngine.storeConfig
 
@@ -181,6 +184,7 @@
 	period = uint64(config.RaftStore.RegionHeartBeatInterval.Duration / config.SimTickInterval.Duration)
 	if n.tick%period == 0 {
 		n.regionHeartBeat()
+		schedulerCheck.Do(func() { ChooseToHaltPDSchedule(false) })
 	}
 }
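Taken together, the pd-simulator changes replace its hand-rolled HTTP plumbing with the shared `pdHttp.Client`: placement rules and location labels are pushed through `SetPlacementRuleInBatch`/`SetConfig`, downed stores are removed with `DeleteStore`, and scheduling is halted at startup and re-enabled on the first store heartbeat via `schedule.halt-scheduling`. A rough standalone sketch of that halt/resume pattern (the address is a placeholder, and error handling is added here for illustration; `ChooseToHaltPDSchedule` itself ignores the error):

```go
package main

import (
	"context"
	"log"
	"strconv"

	pdHttp "github.com/tikv/pd/client/http"
)

// setHaltScheduling toggles the `schedule.halt-scheduling` config item,
// mirroring what ChooseToHaltPDSchedule does in the simulator.
func setHaltScheduling(ctx context.Context, cli pdHttp.Client, halt bool) error {
	return cli.SetConfig(ctx, map[string]any{
		"schedule.halt-scheduling": strconv.FormatBool(halt),
	})
}

func main() {
	cli := pdHttp.NewClient("pd-simulator-example", []string{"http://127.0.0.1:2379"})
	defer cli.Close()

	ctx := context.Background()
	// Pause all schedulers while stores and regions are being bootstrapped...
	if err := setHaltScheduling(ctx, cli, true); err != nil {
		log.Fatal(err)
	}
	// ...and resume them once the cluster is ready to be scheduled.
	if err := setHaltScheduling(ctx, cli, false); err != nil {
		log.Fatal(err)
	}
}
```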