diff --git a/OWNERS b/OWNERS new file mode 100644 index 00000000000..5911dfd3b66 --- /dev/null +++ b/OWNERS @@ -0,0 +1,26 @@ +# See the OWNERS docs at https://go.k8s.io/owners +approvers: + - AndreMouche + - binshi-bing + - bufferflies + - CabinfeverB + - Connor1996 + - disksing + - huachaohuang + - HunDunDM + - HuSharp + - JmPotato + - lhy1024 + - nolouch + - overvenus + - qiuyesuifeng + - rleungx + - siddontang + - Yisaer + - zhouqiang-cl +reviewers: + - BusyJay + - howardlau1999 + - Luffbee + - shafreeck + - xhebox diff --git a/OWNERS_ALIASES b/OWNERS_ALIASES new file mode 100644 index 00000000000..516a466c91e --- /dev/null +++ b/OWNERS_ALIASES @@ -0,0 +1,6 @@ +# Sort the member alphabetically. +aliases: + sig-critical-approvers-config: + - easonn7 + - kevin-xianliu + - niubell diff --git a/client/client.go b/client/client.go index 1865fd0866e..1c8ef3cafe8 100644 --- a/client/client.go +++ b/client/client.go @@ -1431,17 +1431,6 @@ func (c *client) scatterRegionsWithOptions(ctx context.Context, regionsID []uint return resp, nil } -// IsLeaderChange will determine whether there is a leader change. -func IsLeaderChange(err error) bool { - if err == errs.ErrClientTSOStreamClosed { - return true - } - errMsg := err.Error() - return strings.Contains(errMsg, errs.NotLeaderErr) || - strings.Contains(errMsg, errs.MismatchLeaderErr) || - strings.Contains(errMsg, errs.NotServedErr) -} - const ( httpSchemePrefix = "http://" httpsSchemePrefix = "https://" diff --git a/client/errs/errno.go b/client/errs/errno.go index 50c136dd5f2..0dbcb4fe147 100644 --- a/client/errs/errno.go +++ b/client/errs/errno.go @@ -20,21 +20,20 @@ import ( "github.com/pingcap/errors" ) +// Note: keep the same as the ones defined on the server side to ensure the client can use them correctly. const ( + // NoLeaderErr indicates there is no leader in the cluster currently. + NoLeaderErr = "no leader" // NotLeaderErr indicates the non-leader member received the requests which should be received by leader. - // Note: keep the same as the ones defined on the server side, because the client side checks if an error message - // contains this string to judge whether the leader is changed. - NotLeaderErr = "is not leader" + NotLeaderErr = "not leader" // MismatchLeaderErr indicates the non-leader member received the requests which should be received by leader. - // Note: keep the same as the ones defined on the server side, because the client side checks if an error message - // contains this string to judge whether the leader is changed. MismatchLeaderErr = "mismatch leader id" // NotServedErr indicates an tso node/pod received the requests for the keyspace groups which are not served by it. - // Note: keep the same as the ones defined on the server side, because the client side checks if an error message - // contains this string to judge whether the leader is changed. NotServedErr = "is not served" // RetryTimeoutErr indicates the server is busy. RetryTimeoutErr = "retry timeout" + // NotPrimaryErr indicates the non-primary member received the requests which should be received by primary. + NotPrimaryErr = "not primary" ) // client errors diff --git a/client/errs/errs.go b/client/errs/errs.go index 47f7c29a467..da333efda4c 100644 --- a/client/errs/errs.go +++ b/client/errs/errs.go @@ -15,11 +15,29 @@ package errs import ( + "strings" + "github.com/pingcap/errors" "go.uber.org/zap" "go.uber.org/zap/zapcore" ) +// IsLeaderChange will determine whether there is a leader/primary change. 
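+// A minimal usage sketch (the HTTP client and the resource manager client in this
+// change follow the same pattern; `svcDiscovery` below stands for any service
+// discovery instance exposing ScheduleCheckMemberChanged):
+//
+//	if errs.IsLeaderChange(err) {
+//		svcDiscovery.ScheduleCheckMemberChanged()
+//	}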
+func IsLeaderChange(err error) bool { + if err == nil { + return false + } + if err == ErrClientTSOStreamClosed { + return true + } + errMsg := err.Error() + return strings.Contains(errMsg, NoLeaderErr) || + strings.Contains(errMsg, NotLeaderErr) || + strings.Contains(errMsg, MismatchLeaderErr) || + strings.Contains(errMsg, NotServedErr) || + strings.Contains(errMsg, NotPrimaryErr) +} + // ZapError is used to make the log output easier. func ZapError(err error, causeError ...error) zap.Field { if err == nil { diff --git a/client/go.mod b/client/go.mod index 89799796521..6baa2f112f4 100644 --- a/client/go.mod +++ b/client/go.mod @@ -16,7 +16,6 @@ require ( github.com/stretchr/testify v1.8.2 go.uber.org/atomic v1.10.0 go.uber.org/goleak v1.1.11 - go.uber.org/multierr v1.11.0 go.uber.org/zap v1.24.0 golang.org/x/exp v0.0.0-20230711005742-c3f37128e5a4 google.golang.org/grpc v1.62.1 @@ -34,6 +33,7 @@ require ( github.com/prometheus/client_model v0.5.0 // indirect github.com/prometheus/common v0.46.0 // indirect github.com/prometheus/procfs v0.12.0 // indirect + go.uber.org/multierr v1.11.0 // indirect golang.org/x/net v0.23.0 // indirect golang.org/x/sys v0.18.0 // indirect golang.org/x/text v0.14.0 // indirect diff --git a/client/http/api.go b/client/http/api.go index a1ca96b38f1..3376a48770d 100644 --- a/client/http/api.go +++ b/client/http/api.go @@ -41,6 +41,7 @@ const ( membersPrefix = "/pd/api/v1/members" leaderPrefix = "/pd/api/v1/leader" transferLeader = "/pd/api/v1/leader/transfer" + health = "/pd/api/v1/health" // Config Config = "/pd/api/v1/config" ClusterVersion = "/pd/api/v1/config/cluster-version" diff --git a/client/http/client.go b/client/http/client.go index 30144ebe2c5..123ca616422 100644 --- a/client/http/client.go +++ b/client/http/client.go @@ -120,10 +120,25 @@ func (ci *clientInner) requestWithRetry( headerOpts ...HeaderOption, ) error { var ( + serverURL string + isLeader bool statusCode int err error + logFields = append(reqInfo.logFields(), zap.String("source", ci.source)) ) execFunc := func() error { + defer func() { + // If the status code is 503, it indicates that there may be PD leader/follower changes. + // If the error message contains the leader/primary change information, it indicates that there may be PD leader/primary change. + if statusCode == http.StatusServiceUnavailable || errs.IsLeaderChange(err) { + ci.sd.ScheduleCheckMemberChanged() + } + log.Debug("[pd] http request finished", append(logFields, + zap.String("server-url", serverURL), + zap.Bool("is-leader", isLeader), + zap.Int("status-code", statusCode), + zap.Error(err))...) + }() // It will try to send the request to the PD leader first and then try to send the request to the other PD followers. clients := ci.sd.GetAllServiceClients() if len(clients) == 0 { @@ -131,17 +146,21 @@ func (ci *clientInner) requestWithRetry( } skipNum := 0 for _, cli := range clients { - url := cli.GetURL() - if reqInfo.targetURL != "" && reqInfo.targetURL != url { + serverURL = cli.GetURL() + isLeader = cli.IsConnectedToLeader() + if len(reqInfo.targetURL) > 0 && reqInfo.targetURL != serverURL { skipNum++ continue } - statusCode, err = ci.doRequest(ctx, url, reqInfo, headerOpts...) + statusCode, err = ci.doRequest(ctx, serverURL, reqInfo, headerOpts...) 
if err == nil || noNeedRetry(statusCode) { return err } - log.Debug("[pd] request url failed", - zap.String("source", ci.source), zap.Bool("is-leader", cli.IsConnectedToLeader()), zap.String("url", url), zap.Error(err)) + log.Debug("[pd] http request url failed", append(logFields, + zap.String("server-url", serverURL), + zap.Bool("is-leader", isLeader), + zap.Int("status-code", statusCode), + zap.Error(err))...) } if skipNum == len(clients) { return errs.ErrClientNoTargetMember @@ -153,10 +172,11 @@ func (ci *clientInner) requestWithRetry( } // Copy a new backoffer for each request. bo := *reqInfo.bo - // Backoffer also needs to check the status code to determine whether to retry. + // Set the retryable checker for the backoffer if it's not set. bo.SetRetryableChecker(func(err error) bool { + // Backoffer also needs to check the status code to determine whether to retry. return err != nil && !noNeedRetry(statusCode) - }) + }, false) return bo.Exec(ctx, execFunc) } @@ -168,26 +188,21 @@ func noNeedRetry(statusCode int) bool { func (ci *clientInner) doRequest( ctx context.Context, - url string, reqInfo *requestInfo, + serverURL string, reqInfo *requestInfo, headerOpts ...HeaderOption, ) (int, error) { var ( - source = ci.source callerID = reqInfo.callerID name = reqInfo.name method = reqInfo.method body = reqInfo.body res = reqInfo.res respHandler = reqInfo.respHandler + url = reqInfo.getURL(serverURL) + logFields = append(reqInfo.logFields(), + zap.String("source", ci.source), + zap.String("url", url)) ) - url = reqInfo.getURL(url) - logFields := []zap.Field{ - zap.String("source", source), - zap.String("name", name), - zap.String("url", url), - zap.String("method", method), - zap.String("caller-id", callerID), - } log.Debug("[pd] request the http url", logFields...) req, err := http.NewRequestWithContext(ctx, method, url, bytes.NewBuffer(body)) if err != nil { @@ -228,11 +243,14 @@ func (ci *clientInner) doRequest( if readErr != nil { logFields = append(logFields, zap.NamedError("read-body-error", err)) } else { + // API server will return a JSON body containing the detailed error message + // when the status code is not `http.StatusOK` 200. + bs = bytes.TrimSpace(bs) logFields = append(logFields, zap.ByteString("body", bs)) } log.Error("[pd] request failed with a non-200 status", logFields...) - return resp.StatusCode, errors.Errorf("request pd http api failed with status: '%s'", resp.Status) + return resp.StatusCode, errors.Errorf("request pd http api failed with status: '%s', body: '%s'", resp.Status, bs) } if res == nil { diff --git a/client/http/interface.go b/client/http/interface.go index 7b15291d9e7..3684e19b1f5 100644 --- a/client/http/interface.go +++ b/client/http/interface.go @@ -49,7 +49,9 @@ type Client interface { GetRegionStatusByKeyRange(context.Context, *KeyRange, bool) (*RegionStats, error) GetStores(context.Context) (*StoresInfo, error) GetStore(context.Context, uint64) (*StoreInfo, error) + DeleteStore(context.Context, uint64) error SetStoreLabels(context.Context, int64, map[string]string) error + GetHealthStatus(context.Context) ([]Health, error) /* Config-related interfaces */ GetConfig(context.Context) (map[string]any, error) SetConfig(context.Context, map[string]any, ...float64) error @@ -337,6 +339,20 @@ func (c *client) SetStoreLabels(ctx context.Context, storeID int64, storeLabels WithBody(jsonInput)) } +// GetHealthStatus gets the health status of the cluster. 
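+// It issues a GET request to the /pd/api/v1/health endpoint and returns one Health
+// entry per PD member. An illustrative call (variable names are not from this change):
+//
+//	healths, err := cli.GetHealthStatus(ctx)
+//	if err != nil {
+//		return err
+//	}
+//	for _, h := range healths {
+//		log.Printf("member %s healthy: %v", h.Name, h.Health)
+//	}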
+func (c *client) GetHealthStatus(ctx context.Context) ([]Health, error) { + var healths []Health + err := c.request(ctx, newRequestInfo(). + WithName(getHealthStatusName). + WithURI(health). + WithMethod(http.MethodGet). + WithResp(&healths)) + if err != nil { + return nil, err + } + return healths, nil +} + // GetConfig gets the configurations. func (c *client) GetConfig(ctx context.Context) (map[string]any, error) { var config map[string]any @@ -425,6 +441,14 @@ func (c *client) GetStore(ctx context.Context, storeID uint64) (*StoreInfo, erro return &store, nil } +// DeleteStore deletes the store by ID. +func (c *client) DeleteStore(ctx context.Context, storeID uint64) error { + return c.request(ctx, newRequestInfo(). + WithName(deleteStoreName). + WithURI(StoreByID(storeID)). + WithMethod(http.MethodDelete)) +} + // GetClusterVersion gets the cluster version. func (c *client) GetClusterVersion(ctx context.Context) (string, error) { var version string diff --git a/client/http/request_info.go b/client/http/request_info.go index 0ce7072d1ba..40bd0368250 100644 --- a/client/http/request_info.go +++ b/client/http/request_info.go @@ -18,6 +18,7 @@ import ( "fmt" "github.com/tikv/pd/client/retry" + "go.uber.org/zap" ) // The following constants are the names of the requests. @@ -38,7 +39,9 @@ const ( getRegionStatusByKeyRangeName = "GetRegionStatusByKeyRange" getStoresName = "GetStores" getStoreName = "GetStore" + deleteStoreName = "DeleteStore" setStoreLabelsName = "SetStoreLabels" + getHealthStatusName = "GetHealthStatus" getConfigName = "GetConfig" setConfigName = "SetConfig" getScheduleConfigName = "GetScheduleConfig" @@ -156,3 +159,13 @@ func (ri *requestInfo) WithTargetURL(targetURL string) *requestInfo { func (ri *requestInfo) getURL(addr string) string { return fmt.Sprintf("%s%s", addr, ri.uri) } + +func (ri *requestInfo) logFields() []zap.Field { + return []zap.Field{ + zap.String("caller-id", ri.callerID), + zap.String("name", ri.name), + zap.String("uri", ri.uri), + zap.String("method", ri.method), + zap.String("target-url", ri.targetURL), + } +} diff --git a/client/http/types.go b/client/http/types.go index 31b2bfdaea7..f7273068b8c 100644 --- a/client/http/types.go +++ b/client/http/types.go @@ -661,3 +661,12 @@ func stringToKeyspaceState(str string) (keyspacepb.KeyspaceState, error) { return keyspacepb.KeyspaceState(0), fmt.Errorf("invalid KeyspaceState string: %s", str) } } + +// Health reflects the cluster's health. +// NOTE: This type is moved from `server/api/health.go`, maybe move them to the same place later. 
+type Health struct { + Name string `json:"name"` + MemberID uint64 `json:"member_id"` + ClientUrls []string `json:"client_urls"` + Health bool `json:"health"` +} diff --git a/client/pd_service_discovery_test.go b/client/pd_service_discovery_test.go index f4cde0e1911..44171873b1a 100644 --- a/client/pd_service_discovery_test.go +++ b/client/pd_service_discovery_test.go @@ -29,6 +29,7 @@ import ( "github.com/pingcap/kvproto/pkg/pdpb" "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" + "github.com/tikv/pd/client/errs" "github.com/tikv/pd/client/grpcutil" "github.com/tikv/pd/client/testutil" "google.golang.org/grpc" @@ -205,7 +206,7 @@ func (suite *serviceClientTestSuite) TestServiceClient() { re.NotNil(leaderConn) _, err := pb.NewGreeterClient(followerConn).SayHello(suite.ctx, &pb.HelloRequest{Name: "pd"}) - re.ErrorContains(err, "not leader") + re.ErrorContains(err, errs.NotLeaderErr) resp, err := pb.NewGreeterClient(leaderConn).SayHello(suite.ctx, &pb.HelloRequest{Name: "pd"}) re.NoError(err) re.Equal("Hello pd", resp.GetMessage()) diff --git a/client/resource_group/controller/OWNERS b/client/resource_group/controller/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/client/resource_group/controller/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/client/resource_group/controller/controller.go b/client/resource_group/controller/controller.go index 11ea3f7997d..1910e37eff8 100755 --- a/client/resource_group/controller/controller.go +++ b/client/resource_group/controller/controller.go @@ -515,7 +515,7 @@ func (c *ResourceGroupsController) collectTokenBucketRequests(ctx context.Contex request := gc.collectRequestAndConsumption(typ) if request != nil { c.run.currentRequests = append(c.run.currentRequests, request) - gc.tokenRequestCounter.Inc() + gc.metrics.tokenRequestCounter.Inc() } return true }) @@ -632,13 +632,9 @@ type groupCostController struct { calculators []ResourceCalculator handleRespFunc func(*rmpb.TokenBucketResponse) - successfulRequestDuration prometheus.Observer - failedLimitReserveDuration prometheus.Observer - requestRetryCounter prometheus.Counter - failedRequestCounter prometheus.Counter - tokenRequestCounter prometheus.Counter - - mu struct { + // metrics + metrics *groupMetricsCollection + mu struct { sync.Mutex consumption *rmpb.Consumption storeCounter map[uint64]*rmpb.Consumption @@ -685,6 +681,30 @@ type groupCostController struct { tombstone bool } +type groupMetricsCollection struct { + successfulRequestDuration prometheus.Observer + failedLimitReserveDuration prometheus.Observer + requestRetryCounter prometheus.Counter + failedRequestCounterWithOthers prometheus.Counter + failedRequestCounterWithThrottled prometheus.Counter + tokenRequestCounter prometheus.Counter +} + +func initMetrics(oldName, name string) *groupMetricsCollection { + const ( + otherType = "others" + throttledType = "throttled" + ) + return &groupMetricsCollection{ + successfulRequestDuration: successfulRequestDuration.WithLabelValues(oldName, name), + failedLimitReserveDuration: failedLimitReserveDuration.WithLabelValues(oldName, name), + failedRequestCounterWithOthers: failedRequestCounter.WithLabelValues(oldName, name, otherType), + failedRequestCounterWithThrottled: failedRequestCounter.WithLabelValues(oldName, name, throttledType), + requestRetryCounter: 
requestRetryCounter.WithLabelValues(oldName, name), + tokenRequestCounter: resourceGroupTokenRequestCounter.WithLabelValues(oldName, name), + } +} + type tokenCounter struct { getTokenBucketFunc func() *rmpb.TokenBucket @@ -725,16 +745,13 @@ func newGroupCostController( default: return nil, errs.ErrClientResourceGroupConfigUnavailable.FastGenByArgs("not supports the resource type") } + ms := initMetrics(group.Name, group.Name) gc := &groupCostController{ - meta: group, - name: group.Name, - mainCfg: mainCfg, - mode: group.GetMode(), - successfulRequestDuration: successfulRequestDuration.WithLabelValues(group.Name, group.Name), - failedLimitReserveDuration: failedLimitReserveDuration.WithLabelValues(group.Name, group.Name), - failedRequestCounter: failedRequestCounter.WithLabelValues(group.Name, group.Name), - requestRetryCounter: requestRetryCounter.WithLabelValues(group.Name, group.Name), - tokenRequestCounter: resourceGroupTokenRequestCounter.WithLabelValues(group.Name, group.Name), + meta: group, + name: group.Name, + mainCfg: mainCfg, + mode: group.GetMode(), + metrics: ms, calculators: []ResourceCalculator{ newKVCalculator(mainCfg), newSQLCalculator(mainCfg), @@ -789,7 +806,7 @@ func (gc *groupCostController) initRunState() { case rmpb.GroupMode_RUMode: gc.run.requestUnitTokens = make(map[rmpb.RequestUnitType]*tokenCounter) for typ := range requestUnitLimitTypeList { - limiter := NewLimiterWithCfg(now, cfgFunc(getRUTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan) + limiter := NewLimiterWithCfg(gc.name, now, cfgFunc(getRUTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan) counter := &tokenCounter{ limiter: limiter, avgRUPerSec: 0, @@ -803,7 +820,7 @@ func (gc *groupCostController) initRunState() { case rmpb.GroupMode_RawMode: gc.run.resourceTokens = make(map[rmpb.RawResourceType]*tokenCounter) for typ := range requestResourceLimitTypeList { - limiter := NewLimiterWithCfg(now, cfgFunc(getRawResourceTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan) + limiter := NewLimiterWithCfg(gc.name, now, cfgFunc(getRawResourceTokenBucketSetting(gc.meta, typ)), gc.lowRUNotifyChan) counter := &tokenCounter{ limiter: limiter, avgRUPerSec: 0, @@ -1233,7 +1250,7 @@ func (gc *groupCostController) onRequestWait( res = append(res, counter.limiter.Reserve(ctx, gc.mainCfg.LTBMaxWaitDuration, now, v)) } } - if d, err = WaitReservations(ctx, now, res); err == nil { + if d, err = WaitReservations(ctx, now, res); err == nil || errs.ErrClientResourceGroupThrottled.NotEqual(err) { break retryLoop } case rmpb.GroupMode_RUMode: @@ -1243,18 +1260,20 @@ func (gc *groupCostController) onRequestWait( res = append(res, counter.limiter.Reserve(ctx, gc.mainCfg.LTBMaxWaitDuration, now, v)) } } - if d, err = WaitReservations(ctx, now, res); err == nil { + if d, err = WaitReservations(ctx, now, res); err == nil || errs.ErrClientResourceGroupThrottled.NotEqual(err) { break retryLoop } } - gc.requestRetryCounter.Inc() + gc.metrics.requestRetryCounter.Inc() time.Sleep(gc.mainCfg.WaitRetryInterval) waitDuration += gc.mainCfg.WaitRetryInterval } if err != nil { - gc.failedRequestCounter.Inc() - if d.Seconds() > 0 { - gc.failedLimitReserveDuration.Observe(d.Seconds()) + if errs.ErrClientResourceGroupThrottled.Equal(err) { + gc.metrics.failedRequestCounterWithThrottled.Inc() + gc.metrics.failedLimitReserveDuration.Observe(d.Seconds()) + } else { + gc.metrics.failedRequestCounterWithOthers.Inc() } gc.mu.Lock() sub(gc.mu.consumption, delta) @@ -1264,7 +1283,7 @@ func (gc *groupCostController) onRequestWait( }) return nil, 
nil, waitDuration, 0, err } - gc.successfulRequestDuration.Observe(d.Seconds()) + gc.metrics.successfulRequestDuration.Observe(d.Seconds()) waitDuration += d } diff --git a/client/resource_group/controller/controller_test.go b/client/resource_group/controller/controller_test.go index fea4a133ad0..4f4ec592793 100644 --- a/client/resource_group/controller/controller_test.go +++ b/client/resource_group/controller/controller_test.go @@ -26,6 +26,7 @@ import ( rmpb "github.com/pingcap/kvproto/pkg/resource_manager" "github.com/stretchr/testify/require" + "github.com/tikv/pd/client/errs" ) func createTestGroupCostController(re *require.Assertions) *groupCostController { @@ -117,3 +118,17 @@ func TestRequestAndResponseConsumption(t *testing.T) { re.Equal(expectedConsumption.TotalCpuTimeMs, consumption.TotalCpuTimeMs, caseNum) } } + +func TestResourceGroupThrottledError(t *testing.T) { + re := require.New(t) + gc := createTestGroupCostController(re) + gc.initRunState() + req := &TestRequestInfo{ + isWrite: true, + writeBytes: 10000000, + } + // The group is throttled + _, _, _, _, err := gc.onRequestWait(context.TODO(), req) + re.Error(err) + re.True(errs.ErrClientResourceGroupThrottled.Equal(err)) +} diff --git a/client/resource_group/controller/limiter.go b/client/resource_group/controller/limiter.go index a726b0e219a..2e42f591b8b 100644 --- a/client/resource_group/controller/limiter.go +++ b/client/resource_group/controller/limiter.go @@ -26,6 +26,7 @@ import ( "time" "github.com/pingcap/log" + "github.com/prometheus/client_golang/prometheus" "github.com/tikv/pd/client/errs" "go.uber.org/zap" ) @@ -81,6 +82,15 @@ type Limiter struct { isLowProcess bool // remainingNotifyTimes is used to limit notify when the speed limit is already set. remainingNotifyTimes int + name string + + // metrics + metrics *limiterMetricsCollection +} + +// limiterMetricsCollection is a collection of metrics for a limiter. +type limiterMetricsCollection struct { + lowTokenNotifyCounter prometheus.Counter } // Limit returns the maximum overall event rate. @@ -106,8 +116,9 @@ func NewLimiter(now time.Time, r Limit, b int64, tokens float64, lowTokensNotify // NewLimiterWithCfg returns a new Limiter that allows events up to rate r and permits // bursts of at most b tokens. -func NewLimiterWithCfg(now time.Time, cfg tokenBucketReconfigureArgs, lowTokensNotifyChan chan<- struct{}) *Limiter { +func NewLimiterWithCfg(name string, now time.Time, cfg tokenBucketReconfigureArgs, lowTokensNotifyChan chan<- struct{}) *Limiter { lim := &Limiter{ + name: name, limit: Limit(cfg.NewRate), last: now, tokens: cfg.NewTokens, @@ -115,6 +126,9 @@ func NewLimiterWithCfg(now time.Time, cfg tokenBucketReconfigureArgs, lowTokensN notifyThreshold: cfg.NotifyThreshold, lowTokensNotifyChan: lowTokensNotifyChan, } + lim.metrics = &limiterMetricsCollection{ + lowTokenNotifyCounter: lowTokenRequestNotifyCounter.WithLabelValues(lim.name), + } log.Debug("new limiter", zap.String("limiter", fmt.Sprintf("%+v", lim))) return lim } @@ -224,6 +238,14 @@ func (lim *Limiter) SetupNotificationThreshold(threshold float64) { lim.notifyThreshold = threshold } +// SetName sets the name of the limiter. +func (lim *Limiter) SetName(name string) *Limiter { + lim.mu.Lock() + defer lim.mu.Unlock() + lim.name = name + return lim +} + // notify tries to send a non-blocking notification on notifyCh and disables // further notifications (until the next Reconfigure or StartNotification). 
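+// With this change, a notification that is actually delivered (i.e. not dropped by
+// the default branch below) also increments the per-resource-group low-token counter.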
func (lim *Limiter) notify() { @@ -234,6 +256,9 @@ func (lim *Limiter) notify() { lim.isLowProcess = true select { case lim.lowTokensNotifyChan <- struct{}{}: + if lim.metrics != nil { + lim.metrics.lowTokenNotifyCounter.Inc() + } default: } } diff --git a/client/resource_group/controller/metrics.go b/client/resource_group/controller/metrics.go index 4261705a6f6..30a0b850c7d 100644 --- a/client/resource_group/controller/metrics.go +++ b/client/resource_group/controller/metrics.go @@ -24,6 +24,8 @@ const ( // TODO: remove old label in 8.x resourceGroupNameLabel = "name" newResourceGroupNameLabel = "resource_group" + + errType = "type" ) var ( @@ -40,7 +42,7 @@ var ( Namespace: namespace, Subsystem: requestSubsystem, Name: "success", - Buckets: []float64{.005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30}, // 0.005 ~ 30 + Buckets: []float64{0.0005, .005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30, 60, 600, 1800, 3600}, // 0.0005 ~ 1h Help: "Bucketed histogram of wait duration of successful request.", }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) @@ -49,7 +51,7 @@ var ( Namespace: namespace, Subsystem: requestSubsystem, Name: "limit_reserve_time_failed", - Buckets: []float64{.005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30}, // 0.005 ~ 30 + Buckets: []float64{0.0005, .01, .05, .1, .5, 1, 5, 10, 20, 25, 30, 60, 600, 1800, 3600, 86400}, // 0.0005 ~ 24h Help: "Bucketed histogram of wait duration of failed request.", }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) @@ -59,7 +61,7 @@ var ( Subsystem: requestSubsystem, Name: "fail", Help: "Counter of failed request.", - }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) + }, []string{resourceGroupNameLabel, newResourceGroupNameLabel, errType}) requestRetryCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -73,6 +75,7 @@ var ( prometheus.HistogramOpts{ Namespace: namespace, Subsystem: tokenRequestSubsystem, + Buckets: prometheus.ExponentialBuckets(0.001, 2, 13), // 1ms ~ 8s Name: "duration", Help: "Bucketed histogram of latency(s) of token request.", }, []string{"type"}) @@ -84,6 +87,14 @@ var ( Name: "resource_group", Help: "Counter of token request by every resource group.", }, []string{resourceGroupNameLabel, newResourceGroupNameLabel}) + + lowTokenRequestNotifyCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: tokenRequestSubsystem, + Name: "low_token_notified", + Help: "Counter of low token request.", + }, []string{newResourceGroupNameLabel}) ) var ( @@ -100,4 +111,5 @@ func init() { prometheus.MustRegister(requestRetryCounter) prometheus.MustRegister(tokenRequestDuration) prometheus.MustRegister(resourceGroupTokenRequestCounter) + prometheus.MustRegister(lowTokenRequestNotifyCounter) } diff --git a/client/resource_manager_client.go b/client/resource_manager_client.go index 872b241cfe7..98b123c0823 100644 --- a/client/resource_manager_client.go +++ b/client/resource_manager_client.go @@ -16,7 +16,6 @@ package pd import ( "context" - "strings" "time" "github.com/gogo/protobuf/proto" @@ -35,10 +34,6 @@ const ( modify actionType = 1 groupSettingsPathPrefix = "resource_group/settings" controllerConfigPathPrefix = "resource_group/controller" - // errNotPrimary is returned when the requested server is not primary. - errNotPrimary = "not primary" - // errNotLeader is returned when the requested server is not pd leader. - errNotLeader = "not leader" ) // GroupSettingsPathPrefixBytes is used to watch or get resource groups. 
@@ -83,7 +78,7 @@ func (c *client) resourceManagerClient() (rmpb.ResourceManagerClient, error) { // gRPCErrorHandler is used to handle the gRPC error returned by the resource manager service. func (c *client) gRPCErrorHandler(err error) { - if strings.Contains(err.Error(), errNotPrimary) || strings.Contains(err.Error(), errNotLeader) { + if errs.IsLeaderChange(err) { c.pdSvcDiscovery.ScheduleCheckMemberChanged() } } diff --git a/client/retry/backoff.go b/client/retry/backoff.go index 580e466badb..9161ad0fea1 100644 --- a/client/retry/backoff.go +++ b/client/retry/backoff.go @@ -24,12 +24,9 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/failpoint" "github.com/pingcap/log" - "go.uber.org/multierr" "go.uber.org/zap" ) -const maxRecordErrorCount = 20 - // Option is used to customize the backoffer. type Option func(*Backoffer) @@ -50,7 +47,7 @@ type Backoffer struct { // total defines the max total time duration cost in retrying. If it's 0, it means infinite retry until success. total time.Duration // retryableChecker is used to check if the error is retryable. - // By default, all errors are retryable. + // If it's not set, it will always retry unconditionally no matter what the error is. retryableChecker func(err error) bool // logInterval defines the log interval for retrying. logInterval time.Duration @@ -69,28 +66,22 @@ func (bo *Backoffer) Exec( ) error { defer bo.resetBackoff() var ( - allErrors error - err error - after *time.Timer + err error + after *time.Timer ) fnName := getFunctionName(fn) for { err = fn() bo.attempt++ - if bo.attempt < maxRecordErrorCount { - // multierr.Append will ignore nil error. - allErrors = multierr.Append(allErrors, err) - } - if !bo.isRetryable(err) { + if err == nil || !bo.isRetryable(err) { break } currentInterval := bo.nextInterval() bo.nextLogTime += currentInterval - if err != nil { - if bo.logInterval > 0 && bo.nextLogTime >= bo.logInterval { - bo.nextLogTime %= bo.logInterval - log.Warn("call PD API failed and retrying", zap.String("api", fnName), zap.Int("retry-time", bo.attempt), zap.Error(err)) - } + if bo.logInterval > 0 && bo.nextLogTime >= bo.logInterval { + bo.nextLogTime %= bo.logInterval + log.Warn("[pd.backoffer] exec fn failed and retrying", + zap.String("fn-name", fnName), zap.Int("retry-time", bo.attempt), zap.Error(err)) } if after == nil { after = time.NewTimer(currentInterval) @@ -100,7 +91,7 @@ func (bo *Backoffer) Exec( select { case <-ctx.Done(): after.Stop() - return multierr.Append(allErrors, errors.Trace(ctx.Err())) + return errors.Trace(ctx.Err()) case <-after.C: failpoint.Inject("backOffExecute", func() { testBackOffExecuteFlag = true @@ -115,7 +106,7 @@ func (bo *Backoffer) Exec( } } } - return allErrors + return err } // InitialBackoffer make the initial state for retrying. @@ -132,12 +123,9 @@ func InitialBackoffer(base, max, total time.Duration, opts ...Option) *Backoffer total = base } bo := &Backoffer{ - base: base, - max: max, - total: total, - retryableChecker: func(err error) bool { - return err != nil - }, + base: base, + max: max, + total: total, next: base, currentTotal: 0, attempt: 0, @@ -148,8 +136,11 @@ func InitialBackoffer(base, max, total time.Duration, opts ...Option) *Backoffer return bo } -// SetRetryableChecker sets the retryable checker. -func (bo *Backoffer) SetRetryableChecker(checker func(err error) bool) { +// SetRetryableChecker sets the retryable checker, `overwrite` flag is used to indicate whether to overwrite the existing checker. 
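+// A usage sketch taken from the HTTP client in this change: pass `overwrite = false`
+// so that a checker already configured by the caller is not silently replaced, e.g.
+//
+//	bo.SetRetryableChecker(func(err error) bool {
+//		return err != nil && !noNeedRetry(statusCode)
+//	}, false)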
+func (bo *Backoffer) SetRetryableChecker(checker func(err error) bool, overwrite bool) { + if !overwrite && bo.retryableChecker != nil { + return + } bo.retryableChecker = checker } diff --git a/client/retry/backoff_test.go b/client/retry/backoff_test.go index 8df06b75f94..22d487b1885 100644 --- a/client/retry/backoff_test.go +++ b/client/retry/backoff_test.go @@ -18,6 +18,7 @@ import ( "bytes" "context" "errors" + "fmt" "testing" "time" @@ -87,24 +88,64 @@ func TestBackoffer(t *testing.T) { return expectedErr }) re.InDelta(total, time.Since(start), float64(250*time.Millisecond)) - re.ErrorContains(err, "test; test; test; test") + re.ErrorContains(err, "test") re.ErrorIs(err, expectedErr) re.Equal(4, execCount) re.True(isBackofferReset(bo)) - // Test the retryable checker. + // Test the error returned. execCount = 0 - bo = InitialBackoffer(base, max, total) - bo.SetRetryableChecker(func(error) bool { - return execCount < 2 + err = bo.Exec(ctx, func() error { + execCount++ + return fmt.Errorf("test %d", execCount) }) + re.Error(err) + re.Equal("test 4", err.Error()) + re.Equal(4, execCount) + re.True(isBackofferReset(bo)) + execCount = 0 err = bo.Exec(ctx, func() error { + if execCount == 1 { + return nil + } execCount++ - return nil + return expectedErr }) + re.Equal(1, execCount) re.NoError(err) + re.True(isBackofferReset(bo)) + + // Test the retryable checker. + execCount = 0 + bo = InitialBackoffer(base, max, total) + retryableChecker := func(error) bool { + return execCount < 2 + } + bo.SetRetryableChecker(retryableChecker, false) + execFunc := func() error { + execCount++ + return expectedErr + } + err = bo.Exec(ctx, execFunc) + re.ErrorIs(err, expectedErr) + re.Equal(2, execCount) + re.True(isBackofferReset(bo)) + // Test the retryable checker with overwrite. + execCount = 0 + retryableChecker = func(error) bool { + return execCount < 4 + } + bo.SetRetryableChecker(retryableChecker, false) + err = bo.Exec(ctx, execFunc) + re.ErrorIs(err, expectedErr) re.Equal(2, execCount) re.True(isBackofferReset(bo)) + execCount = 0 + bo.SetRetryableChecker(retryableChecker, true) + err = bo.Exec(ctx, execFunc) + re.ErrorIs(err, expectedErr) + re.Equal(4, execCount) + re.True(isBackofferReset(bo)) } func isBackofferReset(bo *Backoffer) bool { @@ -129,21 +170,20 @@ func TestBackofferWithLog(t *testing.T) { // 10 + 20 + 40 + 80(log) + 100(log) * 9 >= 1000, so log ten times. re.Len(ms, 10) // 10 + 20 + 40 + 80 + 100 * 9, 13 times retry. - rfc := `["call PD API failed and retrying"] [api=testFn] [retry-time=13] [error=test]` + rfc := `["[pd.backoffer] exec fn failed and retrying"] [fn-name=testFn] [retry-time=13] [error=test]` re.Contains(ms[len(ms)-1], rfc) // 10 + 20 + 40 + 80(log), 4 times retry. 
- rfc = `["call PD API failed and retrying"] [api=testFn] [retry-time=4] [error=test]` + rfc = `["[pd.backoffer] exec fn failed and retrying"] [fn-name=testFn] [retry-time=4] [error=test]` re.Contains(ms[0], rfc) - bo.resetBackoff() err = bo.Exec(ctx, testFn) re.ErrorIs(err, errTest) ms = lg.Messages() re.Len(ms, 20) - rfc = `["call PD API failed and retrying"] [api=testFn] [retry-time=13] [error=test]` + rfc = `["[pd.backoffer] exec fn failed and retrying"] [fn-name=testFn] [retry-time=13] [error=test]` re.Contains(ms[len(ms)-1], rfc) - rfc = `["call PD API failed and retrying"] [api=testFn] [retry-time=4] [error=test]` + rfc = `["[pd.backoffer] exec fn failed and retrying"] [fn-name=testFn] [retry-time=4] [error=test]` re.Contains(ms[len1], rfc) } diff --git a/client/tlsutil/OWNERS b/client/tlsutil/OWNERS new file mode 100644 index 00000000000..211db06feee --- /dev/null +++ b/client/tlsutil/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|tlsconfig\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/client/tso_dispatcher.go b/client/tso_dispatcher.go index d5b52ad6039..0919fd84744 100644 --- a/client/tso_dispatcher.go +++ b/client/tso_dispatcher.go @@ -303,7 +303,7 @@ tsoBatchLoop: cancel() stream = nil // Because ScheduleCheckMemberChanged is asynchronous, if the leader changes, we better call `updateMember` ASAP. - if IsLeaderChange(err) { + if errs.IsLeaderChange(err) { if err := bo.Exec(ctx, svcDiscovery.CheckMemberChanged); err != nil { select { case <-ctx.Done(): diff --git a/conf/OWNERS b/conf/OWNERS new file mode 100644 index 00000000000..1a435c49089 --- /dev/null +++ b/conf/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.toml)$": + approvers: + - sig-critical-approvers-config diff --git a/errors.toml b/errors.toml index 64101000478..a61c23a6fbd 100644 --- a/errors.toml +++ b/errors.toml @@ -16,11 +16,21 @@ error = ''' redirect failed ''' +["PD:apiutil:ErrRedirectNoLeader"] +error = ''' +redirect finds no leader +''' + ["PD:apiutil:ErrRedirectToNotLeader"] error = ''' redirect to not leader ''' +["PD:apiutil:ErrRedirectToNotPrimary"] +error = ''' +redirect to not primary +''' + ["PD:autoscaling:ErrEmptyMetricsResponse"] error = ''' metrics response from Prometheus is empty diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 69afb93f531..7965a341f6c 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -2096,7 +2096,7 @@ { "format": "dtdurations", "label": null, - "logBase": 1, + "logBase": 2, "max": null, "min": "0", "show": true diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 8bd2616f41f..ab97c7899db 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -35,12 +35,7 @@ type Cluster interface { func HandleStatsAsync(c Cluster, region *core.RegionInfo) { c.GetHotStat().CheckWriteAsync(statistics.NewCheckExpiredItemTask(region)) c.GetHotStat().CheckReadAsync(statistics.NewCheckExpiredItemTask(region)) - reportInterval := region.GetInterval() - interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetWriteLoads(), interval) - c.GetHotStat().CheckWriteAsync(statistics.NewCheckPeerTask(peerInfo, region)) - } + c.GetHotStat().CheckWriteAsync(statistics.NewCheckWritePeerTask(region)) 
c.GetCoordinator().GetSchedulersController().CheckTransferWitnessLeader(region) } diff --git a/pkg/core/basic_cluster.go b/pkg/core/basic_cluster.go index d70b620db3b..2392b7ddac6 100644 --- a/pkg/core/basic_cluster.go +++ b/pkg/core/basic_cluster.go @@ -14,218 +14,43 @@ package core -import ( - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core/storelimit" - "github.com/tikv/pd/pkg/utils/syncutil" -) - // BasicCluster provides basic data member and interface for a tikv cluster. type BasicCluster struct { - Stores struct { - mu syncutil.RWMutex - *StoresInfo - } - + *StoresInfo *RegionsInfo } // NewBasicCluster creates a BasicCluster. func NewBasicCluster() *BasicCluster { return &BasicCluster{ - Stores: struct { - mu syncutil.RWMutex - *StoresInfo - }{StoresInfo: NewStoresInfo()}, - + StoresInfo: NewStoresInfo(), RegionsInfo: NewRegionsInfo(), } } -/* Stores read operations */ - -// GetStores returns all Stores in the cluster. -func (bc *BasicCluster) GetStores() []*StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStores() -} - -// GetMetaStores gets a complete set of metapb.Store. -func (bc *BasicCluster) GetMetaStores() []*metapb.Store { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetMetaStores() -} - -// GetStore searches for a store by ID. -func (bc *BasicCluster) GetStore(storeID uint64) *StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStore(storeID) -} - -// GetRegionStores returns all Stores that contains the region's peer. -func (bc *BasicCluster) GetRegionStores(region *RegionInfo) []*StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - var Stores []*StoreInfo - for id := range region.GetStoreIDs() { - if store := bc.Stores.GetStore(id); store != nil { - Stores = append(Stores, store) - } - } - return Stores -} - -// GetNonWitnessVoterStores returns all Stores that contains the non-witness's voter peer. -func (bc *BasicCluster) GetNonWitnessVoterStores(region *RegionInfo) []*StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - var Stores []*StoreInfo - for id := range region.GetNonWitnessVoters() { - if store := bc.Stores.GetStore(id); store != nil { - Stores = append(Stores, store) - } - } - return Stores -} - -// GetFollowerStores returns all Stores that contains the region's follower peer. -func (bc *BasicCluster) GetFollowerStores(region *RegionInfo) []*StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - var Stores []*StoreInfo - for id := range region.GetFollowers() { - if store := bc.Stores.GetStore(id); store != nil { - Stores = append(Stores, store) - } - } - return Stores -} - -// GetLeaderStore returns all Stores that contains the region's leader peer. -func (bc *BasicCluster) GetLeaderStore(region *RegionInfo) *StoreInfo { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStore(region.GetLeader().GetStoreId()) -} - -// GetStoreCount returns the total count of storeInfo. -func (bc *BasicCluster) GetStoreCount() int { - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStoreCount() -} - -/* Stores Write operations */ - -// PauseLeaderTransfer prevents the store from been selected as source or -// target store of TransferLeader. 
-func (bc *BasicCluster) PauseLeaderTransfer(storeID uint64) error { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - return bc.Stores.PauseLeaderTransfer(storeID) -} - -// ResumeLeaderTransfer cleans a store's pause state. The store can be selected -// as source or target of TransferLeader again. -func (bc *BasicCluster) ResumeLeaderTransfer(storeID uint64) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.ResumeLeaderTransfer(storeID) -} - -// SlowStoreEvicted marks a store as a slow store and prevents transferring -// leader to the store -func (bc *BasicCluster) SlowStoreEvicted(storeID uint64) error { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - return bc.Stores.SlowStoreEvicted(storeID) -} - -// SlowTrendEvicted marks a store as a slow store by trend and prevents transferring -// leader to the store -func (bc *BasicCluster) SlowTrendEvicted(storeID uint64) error { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - return bc.Stores.SlowTrendEvicted(storeID) -} - -// SlowTrendRecovered cleans the evicted by slow trend state of a store. -func (bc *BasicCluster) SlowTrendRecovered(storeID uint64) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.SlowTrendRecovered(storeID) -} - -// SlowStoreRecovered cleans the evicted state of a store. -func (bc *BasicCluster) SlowStoreRecovered(storeID uint64) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.SlowStoreRecovered(storeID) -} - -// ResetStoreLimit resets the limit for a specific store. -func (bc *BasicCluster) ResetStoreLimit(storeID uint64, limitType storelimit.Type, ratePerSec ...float64) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.ResetStoreLimit(storeID, limitType, ratePerSec...) -} - // UpdateStoreStatus updates the information of the store. func (bc *BasicCluster) UpdateStoreStatus(storeID uint64) { - leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize := bc.RegionsInfo.GetStoreStats(storeID) - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.UpdateStoreStatus(storeID, leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize) -} - -// PutStore put a store. -func (bc *BasicCluster) PutStore(store *StoreInfo) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.SetStore(store) -} - -// ResetStores resets the store cache. -func (bc *BasicCluster) ResetStores() { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.StoresInfo = NewStoresInfo() -} - -// DeleteStore deletes a store. -func (bc *BasicCluster) DeleteStore(store *StoreInfo) { - bc.Stores.mu.Lock() - defer bc.Stores.mu.Unlock() - bc.Stores.DeleteStore(store) + leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize := bc.GetStoreStats(storeID) + bc.StoresInfo.UpdateStoreStatus(storeID, leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount, leaderRegionSize, regionSize) } /* Regions read operations */ // GetLeaderStoreByRegionID returns the leader store of the given region. 
func (bc *BasicCluster) GetLeaderStoreByRegionID(regionID uint64) *StoreInfo { - region := bc.RegionsInfo.GetRegion(regionID) + region := bc.GetRegion(regionID) if region == nil || region.GetLeader() == nil { return nil } - bc.Stores.mu.RLock() - defer bc.Stores.mu.RUnlock() - return bc.Stores.GetStore(region.GetLeader().GetStoreId()) + return bc.GetStore(region.GetLeader().GetStoreId()) } func (bc *BasicCluster) getWriteRate( f func(storeID uint64) (bytesRate, keysRate float64), ) (storeIDs []uint64, bytesRates, keysRates []float64) { - bc.Stores.mu.RLock() - count := len(bc.Stores.stores) - storeIDs = make([]uint64, 0, count) - for _, store := range bc.Stores.stores { - storeIDs = append(storeIDs, store.GetID()) - } - bc.Stores.mu.RUnlock() + storeIDs = bc.GetStoreIDs() + count := len(storeIDs) bytesRates = make([]float64, 0, count) keysRates = make([]float64, 0, count) for _, id := range storeIDs { @@ -238,12 +63,12 @@ func (bc *BasicCluster) getWriteRate( // GetStoresLeaderWriteRate get total write rate of each store's leaders. func (bc *BasicCluster) GetStoresLeaderWriteRate() (storeIDs []uint64, bytesRates, keysRates []float64) { - return bc.getWriteRate(bc.RegionsInfo.GetStoreLeaderWriteRate) + return bc.getWriteRate(bc.GetStoreLeaderWriteRate) } // GetStoresWriteRate get total write rate of each store's regions. func (bc *BasicCluster) GetStoresWriteRate() (storeIDs []uint64, bytesRates, keysRates []float64) { - return bc.getWriteRate(bc.RegionsInfo.GetStoreWriteRate) + return bc.getWriteRate(bc.GetStoreWriteRate) } // UpdateAllStoreStatus updates the information of all stores. diff --git a/pkg/core/peer.go b/pkg/core/peer.go index 659886e6d39..1f888ba58eb 100644 --- a/pkg/core/peer.go +++ b/pkg/core/peer.go @@ -77,34 +77,3 @@ func CountInJointState(peers ...*metapb.Peer) int { } return count } - -// PeerInfo provides peer information -type PeerInfo struct { - *metapb.Peer - loads []float64 - interval uint64 -} - -// NewPeerInfo creates PeerInfo -func NewPeerInfo(meta *metapb.Peer, loads []float64, interval uint64) *PeerInfo { - return &PeerInfo{ - Peer: meta, - loads: loads, - interval: interval, - } -} - -// GetLoads provides loads -func (p *PeerInfo) GetLoads() []float64 { - return p.loads -} - -// GetPeerID provides peer id -func (p *PeerInfo) GetPeerID() uint64 { - return p.GetId() -} - -// GetInterval returns reporting interval -func (p *PeerInfo) GetInterval() uint64 { - return p.interval -} diff --git a/pkg/core/store.go b/pkg/core/store.go index 9b660754496..5baedafdb05 100644 --- a/pkg/core/store.go +++ b/pkg/core/store.go @@ -26,6 +26,7 @@ import ( "github.com/tikv/pd/pkg/core/constant" "github.com/tikv/pd/pkg/core/storelimit" "github.com/tikv/pd/pkg/errs" + "github.com/tikv/pd/pkg/utils/syncutil" "github.com/tikv/pd/pkg/utils/typeutil" "go.uber.org/zap" ) @@ -639,6 +640,7 @@ func MergeLabels(origin []*metapb.StoreLabel, labels []*metapb.StoreLabel) []*me // StoresInfo contains information about all stores. type StoresInfo struct { + syncutil.RWMutex stores map[uint64]*StoreInfo } @@ -649,8 +651,12 @@ func NewStoresInfo() *StoresInfo { } } +/* Stores read operations */ + // GetStore returns a copy of the StoreInfo with the specified storeID. func (s *StoresInfo) GetStore(storeID uint64) *StoreInfo { + s.RLock() + defer s.RUnlock() store, ok := s.stores[storeID] if !ok { return nil @@ -658,13 +664,121 @@ func (s *StoresInfo) GetStore(storeID uint64) *StoreInfo { return store } -// SetStore sets a StoreInfo with storeID. 
-func (s *StoresInfo) SetStore(store *StoreInfo) { +// GetStores gets a complete set of StoreInfo. +func (s *StoresInfo) GetStores() []*StoreInfo { + s.RLock() + defer s.RUnlock() + stores := make([]*StoreInfo, 0, len(s.stores)) + for _, store := range s.stores { + stores = append(stores, store) + } + return stores +} + +// GetMetaStores gets a complete set of metapb.Store. +func (s *StoresInfo) GetMetaStores() []*metapb.Store { + s.RLock() + defer s.RUnlock() + stores := make([]*metapb.Store, 0, len(s.stores)) + for _, store := range s.stores { + stores = append(stores, store.GetMeta()) + } + return stores +} + +// GetStoreIDs returns a list of store ids. +func (s *StoresInfo) GetStoreIDs() []uint64 { + s.RLock() + defer s.RUnlock() + count := len(s.stores) + storeIDs := make([]uint64, 0, count) + for _, store := range s.stores { + storeIDs = append(storeIDs, store.GetID()) + } + return storeIDs +} + +// GetFollowerStores returns all Stores that contains the region's follower peer. +func (s *StoresInfo) GetFollowerStores(region *RegionInfo) []*StoreInfo { + s.RLock() + defer s.RUnlock() + var stores []*StoreInfo + for id := range region.GetFollowers() { + if store, ok := s.stores[id]; ok && store != nil { + stores = append(stores, store) + } + } + return stores +} + +// GetRegionStores returns all Stores that contains the region's peer. +func (s *StoresInfo) GetRegionStores(region *RegionInfo) []*StoreInfo { + s.RLock() + defer s.RUnlock() + var stores []*StoreInfo + for id := range region.GetStoreIDs() { + if store, ok := s.stores[id]; ok && store != nil { + stores = append(stores, store) + } + } + return stores +} + +// GetLeaderStore returns all Stores that contains the region's leader peer. +func (s *StoresInfo) GetLeaderStore(region *RegionInfo) *StoreInfo { + s.RLock() + defer s.RUnlock() + if store, ok := s.stores[region.GetLeader().GetStoreId()]; ok && store != nil { + return store + } + return nil +} + +// GetStoreCount returns the total count of storeInfo. +func (s *StoresInfo) GetStoreCount() int { + s.RLock() + defer s.RUnlock() + return len(s.stores) +} + +// GetNonWitnessVoterStores returns all Stores that contains the non-witness's voter peer. +func (s *StoresInfo) GetNonWitnessVoterStores(region *RegionInfo) []*StoreInfo { + s.RLock() + defer s.RUnlock() + var stores []*StoreInfo + for id := range region.GetNonWitnessVoters() { + if store, ok := s.stores[id]; ok && store != nil { + stores = append(stores, store) + } + } + return stores +} + +/* Stores write operations */ + +// PutStore sets a StoreInfo with storeID. +func (s *StoresInfo) PutStore(store *StoreInfo) { + s.Lock() + defer s.Unlock() + s.putStoreLocked(store) +} + +// putStoreLocked sets a StoreInfo with storeID. +func (s *StoresInfo) putStoreLocked(store *StoreInfo) { s.stores[store.GetID()] = store } +// ResetStores resets the store cache. +func (s *StoresInfo) ResetStores() { + s.Lock() + defer s.Unlock() + s.stores = make(map[uint64]*StoreInfo) +} + // PauseLeaderTransfer pauses a StoreInfo with storeID. func (s *StoresInfo) PauseLeaderTransfer(storeID uint64) error { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -679,6 +793,8 @@ func (s *StoresInfo) PauseLeaderTransfer(storeID uint64) error { // ResumeLeaderTransfer cleans a store's pause state. The store can be selected // as source or target of TransferLeader again. 
func (s *StoresInfo) ResumeLeaderTransfer(storeID uint64) { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { log.Warn("try to clean a store's pause state, but it is not found. It may be cleanup", @@ -691,6 +807,8 @@ func (s *StoresInfo) ResumeLeaderTransfer(storeID uint64) { // SlowStoreEvicted marks a store as a slow store and prevents transferring // leader to the store func (s *StoresInfo) SlowStoreEvicted(storeID uint64) error { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -704,6 +822,8 @@ func (s *StoresInfo) SlowStoreEvicted(storeID uint64) error { // SlowStoreRecovered cleans the evicted state of a store. func (s *StoresInfo) SlowStoreRecovered(storeID uint64) { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { log.Warn("try to clean a store's evicted as a slow store state, but it is not found. It may be cleanup", @@ -716,6 +836,8 @@ func (s *StoresInfo) SlowStoreRecovered(storeID uint64) { // SlowTrendEvicted marks a store as a slow trend and prevents transferring // leader to the store func (s *StoresInfo) SlowTrendEvicted(storeID uint64) error { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -729,6 +851,8 @@ func (s *StoresInfo) SlowTrendEvicted(storeID uint64) error { // SlowTrendRecovered cleans the evicted by trend state of a store. func (s *StoresInfo) SlowTrendRecovered(storeID uint64) { + s.Lock() + defer s.Unlock() store, ok := s.stores[storeID] if !ok { log.Warn("try to clean a store's evicted by trend as a slow store state, but it is not found. It may be cleanup", @@ -740,76 +864,24 @@ func (s *StoresInfo) SlowTrendRecovered(storeID uint64) { // ResetStoreLimit resets the limit for a specific store. func (s *StoresInfo) ResetStoreLimit(storeID uint64, limitType storelimit.Type, ratePerSec ...float64) { + s.Lock() + defer s.Unlock() if store, ok := s.stores[storeID]; ok { s.stores[storeID] = store.Clone(ResetStoreLimit(limitType, ratePerSec...)) } } -// GetStores gets a complete set of StoreInfo. -func (s *StoresInfo) GetStores() []*StoreInfo { - stores := make([]*StoreInfo, 0, len(s.stores)) - for _, store := range s.stores { - stores = append(stores, store) - } - return stores -} - -// GetMetaStores gets a complete set of metapb.Store. -func (s *StoresInfo) GetMetaStores() []*metapb.Store { - stores := make([]*metapb.Store, 0, len(s.stores)) - for _, store := range s.stores { - stores = append(stores, store.GetMeta()) - } - return stores -} - // DeleteStore deletes tombstone record form store func (s *StoresInfo) DeleteStore(store *StoreInfo) { + s.Lock() + defer s.Unlock() delete(s.stores, store.GetID()) } -// GetStoreCount returns the total count of storeInfo. -func (s *StoresInfo) GetStoreCount() int { - return len(s.stores) -} - -// SetLeaderCount sets the leader count to a storeInfo. -func (s *StoresInfo) SetLeaderCount(storeID uint64, leaderCount int) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetLeaderCount(leaderCount)) - } -} - -// SetRegionCount sets the region count to a storeInfo. -func (s *StoresInfo) SetRegionCount(storeID uint64, regionCount int) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetRegionCount(regionCount)) - } -} - -// SetPendingPeerCount sets the pending count to a storeInfo. 
-func (s *StoresInfo) SetPendingPeerCount(storeID uint64, pendingPeerCount int) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetPendingPeerCount(pendingPeerCount)) - } -} - -// SetLeaderSize sets the leader size to a storeInfo. -func (s *StoresInfo) SetLeaderSize(storeID uint64, leaderSize int64) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetLeaderSize(leaderSize)) - } -} - -// SetRegionSize sets the region size to a storeInfo. -func (s *StoresInfo) SetRegionSize(storeID uint64, regionSize int64) { - if store, ok := s.stores[storeID]; ok { - s.stores[storeID] = store.Clone(SetRegionSize(regionSize)) - } -} - // UpdateStoreStatus updates the information of the store. func (s *StoresInfo) UpdateStoreStatus(storeID uint64, leaderCount, regionCount, witnessCount, learnerCount, pendingPeerCount int, leaderSize int64, regionSize int64) { + s.Lock() + defer s.Unlock() if store, ok := s.stores[storeID]; ok { newStore := store.ShallowClone(SetLeaderCount(leaderCount), SetRegionCount(regionCount), @@ -818,7 +890,7 @@ func (s *StoresInfo) UpdateStoreStatus(storeID uint64, leaderCount, regionCount, SetPendingPeerCount(pendingPeerCount), SetLeaderSize(leaderSize), SetRegionSize(regionSize)) - s.SetStore(newStore) + s.putStoreLocked(newStore) } } diff --git a/pkg/core/storelimit/store_limit.go b/pkg/core/storelimit/store_limit.go index 8d70b2918a1..e35ec773d80 100644 --- a/pkg/core/storelimit/store_limit.go +++ b/pkg/core/storelimit/store_limit.go @@ -17,6 +17,7 @@ package storelimit import ( "github.com/tikv/pd/pkg/core/constant" "github.com/tikv/pd/pkg/ratelimit" + "github.com/tikv/pd/pkg/utils/syncutil" ) const ( @@ -106,7 +107,7 @@ func (l *StoreRateLimit) Rate(typ Type) float64 { if l.limits[typ] == nil { return 0.0 } - return l.limits[typ].ratePerSec + return l.limits[typ].GetRatePerSec() } // Take takes count tokens from the bucket without blocking. @@ -128,12 +129,15 @@ func (l *StoreRateLimit) Reset(rate float64, typ Type) { // limit the operators of a store type limit struct { - limiter *ratelimit.RateLimiter - ratePerSec float64 + limiter *ratelimit.RateLimiter + ratePerSecMutex syncutil.RWMutex + ratePerSec float64 } // Reset resets the rate limit. func (l *limit) Reset(ratePerSec float64) { + l.ratePerSecMutex.Lock() + defer l.ratePerSecMutex.Unlock() if l.ratePerSec == ratePerSec { return } @@ -155,6 +159,8 @@ func (l *limit) Reset(ratePerSec float64) { // Available returns the number of available tokens // It returns true if the rate per second is zero. func (l *limit) Available(n int64) bool { + l.ratePerSecMutex.RLock() + defer l.ratePerSecMutex.RUnlock() if l.ratePerSec == 0 { return true } @@ -164,8 +170,16 @@ func (l *limit) Available(n int64) bool { // Take takes count tokens from the bucket without blocking. 
func (l *limit) Take(count int64) bool { + l.ratePerSecMutex.RLock() + defer l.ratePerSecMutex.RUnlock() if l.ratePerSec == 0 { return true } return l.limiter.AllowN(int(count)) } + +func (l *limit) GetRatePerSec() float64 { + l.ratePerSecMutex.RLock() + defer l.ratePerSecMutex.RUnlock() + return l.ratePerSec +} diff --git a/pkg/dashboard/uiserver/embedded_assets_rewriter.go b/pkg/dashboard/uiserver/embedded_assets_rewriter.go index 2a5b4a5b3b6..d19db01936f 100644 --- a/pkg/dashboard/uiserver/embedded_assets_rewriter.go +++ b/pkg/dashboard/uiserver/embedded_assets_rewriter.go @@ -28,6 +28,7 @@ import ( var once sync.Once // Assets returns the Assets FileSystem of the dashboard UI +// NOTE: if you see "undefined: assets" error, please run `make dashboard-ui` in the root directory of the repository. func Assets(cfg *config.Config) http.FileSystem { once.Do(func() { resPath := distroutil.MustGetResPath() diff --git a/pkg/election/leadership.go b/pkg/election/leadership.go index 02f519dbc75..3ee413818a5 100644 --- a/pkg/election/leadership.go +++ b/pkg/election/leadership.go @@ -34,11 +34,12 @@ import ( ) const ( - defaultCampaignTimesSlot = 10 - watchLoopUnhealthyTimeout = 60 * time.Second - campaignTimesRecordTimeout = 5 * time.Minute + defaultCampaignTimesSlot = 10 + watchLoopUnhealthyTimeout = 60 * time.Second ) +var campaignTimesRecordTimeout = 5 * time.Minute + // GetLeader gets the corresponding leader from etcd by given leaderPath (as the key). func GetLeader(c *clientv3.Client, leaderPath string) (*pdpb.Member, int64, error) { leader := &pdpb.Member{} @@ -114,6 +115,7 @@ func (ls *Leadership) GetLeaderKey() string { } // GetCampaignTimesNum is used to get the campaign times of the leader within `campaignTimesRecordTimeout`. +// Need to make sure `AddCampaignTimes` is called before this function. func (ls *Leadership) GetCampaignTimesNum() int { if ls == nil { return 0 @@ -129,8 +131,8 @@ func (ls *Leadership) ResetCampaignTimes() { ls.campaignTimes = make([]time.Time, 0, defaultCampaignTimesSlot) } -// addCampaignTimes is used to add the campaign times of the leader. -func (ls *Leadership) addCampaignTimes() { +// AddCampaignTimes is used to add the campaign times of the leader. +func (ls *Leadership) AddCampaignTimes() { if ls == nil { return } @@ -138,7 +140,7 @@ func (ls *Leadership) addCampaignTimes() { if time.Since(ls.campaignTimes[i]) > campaignTimesRecordTimeout { // remove the time which is more than `campaignTimesRecordTimeout` // array is sorted by time - ls.campaignTimes = ls.campaignTimes[i:] + ls.campaignTimes = ls.campaignTimes[i+1:] break } } @@ -148,7 +150,6 @@ func (ls *Leadership) addCampaignTimes() { // Campaign is used to campaign the leader with given lease and returns a leadership func (ls *Leadership) Campaign(leaseTimeout int64, leaderData string, cmps ...clientv3.Cmp) error { - ls.addCampaignTimes() ls.leaderValue = leaderData // Create a new lease to campaign newLease := &lease{ diff --git a/pkg/election/leadership_test.go b/pkg/election/leadership_test.go index 1fde4ddeba7..40f0bcbee23 100644 --- a/pkg/election/leadership_test.go +++ b/pkg/election/leadership_test.go @@ -262,3 +262,36 @@ func TestRequestProgress(t *testing.T) { checkWatcherRequestProgress(false) checkWatcherRequestProgress(true) } + +func TestCampaignTimes(t *testing.T) { + re := require.New(t) + _, client, clean := etcdutil.NewTestEtcdCluster(t, 1) + defer clean() + leadership := NewLeadership(client, "test_leader", "test_leader") + + // all the campaign times are within the timeout. 
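+	// (the 10s window comfortably covers the three campaigns recorded 100ms apart below)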
+ campaignTimesRecordTimeout = 10 * time.Second + defer func() { + campaignTimesRecordTimeout = 5 * time.Minute + }() + for i := 0; i < 3; i++ { + leadership.AddCampaignTimes() + time.Sleep(100 * time.Millisecond) + } + re.Equal(3, leadership.GetCampaignTimesNum()) + + // only the last 2 records are valid. + campaignTimesRecordTimeout = 200 * time.Millisecond + for i := 0; i < 3; i++ { + leadership.AddCampaignTimes() + time.Sleep(100 * time.Millisecond) + } + re.Equal(2, leadership.GetCampaignTimesNum()) + + time.Sleep(200 * time.Millisecond) + // need to wait for the next addCampaignTimes to update the campaign time. + re.Equal(2, leadership.GetCampaignTimesNum()) + // check campaign leader frequency. + leadership.AddCampaignTimes() + re.Equal(1, leadership.GetCampaignTimesNum()) +} diff --git a/pkg/encryption/OWNERS b/pkg/encryption/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/pkg/encryption/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/errs/errno.go b/pkg/errs/errno.go index 8c3e914531b..1f56a821032 100644 --- a/pkg/errs/errno.go +++ b/pkg/errs/errno.go @@ -195,10 +195,11 @@ var ( // apiutil errors var ( - ErrRedirect = errors.Normalize("redirect failed", errors.RFCCodeText("PD:apiutil:ErrRedirect")) - ErrOptionNotExist = errors.Normalize("the option %s does not exist", errors.RFCCodeText("PD:apiutil:ErrOptionNotExist")) - // ErrRedirectToNotLeader is the error message for redirect to not leader. - ErrRedirectToNotLeader = errors.Normalize("redirect to not leader", errors.RFCCodeText("PD:apiutil:ErrRedirectToNotLeader")) + ErrRedirect = errors.Normalize("redirect failed", errors.RFCCodeText("PD:apiutil:ErrRedirect")) + ErrOptionNotExist = errors.Normalize("the option %s does not exist", errors.RFCCodeText("PD:apiutil:ErrOptionNotExist")) + ErrRedirectNoLeader = errors.Normalize("redirect finds no leader", errors.RFCCodeText("PD:apiutil:ErrRedirectNoLeader")) + ErrRedirectToNotLeader = errors.Normalize("redirect to not leader", errors.RFCCodeText("PD:apiutil:ErrRedirectToNotLeader")) + ErrRedirectToNotPrimary = errors.Normalize("redirect to not primary", errors.RFCCodeText("PD:apiutil:ErrRedirectToNotPrimary")) ) // grpcutil errors diff --git a/pkg/keyspace/keyspace.go b/pkg/keyspace/keyspace.go index d84b3698f69..b37ec7f0fca 100644 --- a/pkg/keyspace/keyspace.go +++ b/pkg/keyspace/keyspace.go @@ -343,20 +343,20 @@ func (manager *Manager) splitKeyspaceRegion(id uint32, waitRegionSplit bool) (er for { select { case <-ticker.C: - regionsInfo := manager.cluster.GetBasicCluster().RegionsInfo - region := regionsInfo.GetRegionByKey(rawLeftBound) + c := manager.cluster.GetBasicCluster() + region := c.GetRegionByKey(rawLeftBound) if region == nil || !bytes.Equal(region.GetStartKey(), rawLeftBound) { continue } - region = regionsInfo.GetRegionByKey(rawRightBound) + region = c.GetRegionByKey(rawRightBound) if region == nil || !bytes.Equal(region.GetStartKey(), rawRightBound) { continue } - region = regionsInfo.GetRegionByKey(txnLeftBound) + region = c.GetRegionByKey(txnLeftBound) if region == nil || !bytes.Equal(region.GetStartKey(), txnLeftBound) { continue } - region = regionsInfo.GetRegionByKey(txnRightBound) + region = c.GetRegionByKey(txnRightBound) if region == nil || !bytes.Equal(region.GetStartKey(), txnRightBound) { continue } diff --git a/pkg/mcs/resourcemanager/server/OWNERS 
b/pkg/mcs/resourcemanager/server/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/pkg/mcs/resourcemanager/server/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/mcs/resourcemanager/server/manager.go b/pkg/mcs/resourcemanager/server/manager.go index ef402b8cbf9..418d188823f 100644 --- a/pkg/mcs/resourcemanager/server/manager.go +++ b/pkg/mcs/resourcemanager/server/manager.go @@ -129,7 +129,9 @@ func (m *Manager) Init(ctx context.Context) error { return err } // Load resource group meta info from storage. + m.Lock() m.groups = make(map[string]*ResourceGroup) + m.Unlock() handler := func(k, v string) { group := &rmpb.ResourceGroup{} if err := proto.Unmarshal([]byte(v), group); err != nil { diff --git a/pkg/mcs/scheduling/server/apis/v1/api.go b/pkg/mcs/scheduling/server/apis/v1/api.go index be3277f3fc6..39aa11927ca 100644 --- a/pkg/mcs/scheduling/server/apis/v1/api.go +++ b/pkg/mcs/scheduling/server/apis/v1/api.go @@ -272,7 +272,7 @@ func deleteAllRegionCache(c *gin.Context) { c.String(http.StatusInternalServerError, errs.ErrNotBootstrapped.GenWithStackByArgs().Error()) return } - cluster.DropCacheAllRegion() + cluster.ResetRegionCache() c.String(http.StatusOK, "All regions are removed from server cache.") } @@ -297,7 +297,7 @@ func deleteRegionCacheByID(c *gin.Context) { c.String(http.StatusBadRequest, err.Error()) return } - cluster.DropCacheRegion(regionID) + cluster.RemoveRegionIfExist(regionID) c.String(http.StatusOK, "The region is removed from server cache.") } diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index c6c365b03ad..caaafe42c87 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -9,6 +9,7 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/failpoint" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/pingcap/kvproto/pkg/schedulingpb" "github.com/pingcap/log" @@ -68,9 +69,9 @@ const ( collectWaitTime = time.Minute // heartbeat relative const - heartbeatTaskRunner = "heartbeat-task-runner" - statisticsTaskRunner = "statistics-task-runner" - logTaskRunner = "log-task-runner" + heartbeatTaskRunner = "heartbeat-task-runner" + miscTaskRunner = "misc-task-runner" + logTaskRunner = "log-task-runner" ) var syncRunner = ratelimit.NewSyncRunner() @@ -99,7 +100,7 @@ func NewCluster(parentCtx context.Context, persistConfig *config.PersistConfig, checkMembershipCh: checkMembershipCh, heartbeatRunner: ratelimit.NewConcurrentRunner(heartbeatTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), - miscRunner: ratelimit.NewConcurrentRunner(statisticsTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), + miscRunner: ratelimit.NewConcurrentRunner(miscTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), logRunner: ratelimit.NewConcurrentRunner(logTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), } c.coordinator = schedule.NewCoordinator(ctx, c, hbStreams) @@ -442,8 +443,7 @@ func (c *Cluster) HandleStoreHeartbeat(heartbeat *schedulingpb.StoreHeartbeatReq utils.RegionWriteKeys: 0, utils.RegionWriteQueryNum: 0, } - peerInfo := core.NewPeerInfo(peer, loads, interval) - c.hotStat.CheckReadAsync(statistics.NewCheckPeerTask(peerInfo, region)) + 
c.hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{peer}, loads, interval)) } // Here we will compare the reported regions with the previous hot peers to decide if it is still hot. @@ -521,7 +521,7 @@ func (c *Cluster) collectMetrics() { // collect hot cache metrics c.hotStat.CollectMetrics() // collect the lock metrics - c.RegionsInfo.CollectWaitLockMetrics() + c.CollectWaitLockMetrics() } func resetMetrics() { @@ -688,16 +688,6 @@ func (c *Cluster) SetPrepared() { c.coordinator.GetPrepareChecker().SetPrepared() } -// DropCacheAllRegion removes all cached regions. -func (c *Cluster) DropCacheAllRegion() { - c.ResetRegionCache() -} - -// DropCacheRegion removes a region from the cache. -func (c *Cluster) DropCacheRegion(id uint64) { - c.RemoveRegionIfExist(id) -} - // IsSchedulingHalted returns whether the scheduling is halted. // Currently, the microservice scheduling is halted when: // - The `HaltScheduling` persist option is set to true. diff --git a/pkg/mcs/scheduling/server/config/OWNERS b/pkg/mcs/scheduling/server/config/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/pkg/mcs/scheduling/server/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/mcs/tso/server/OWNERS b/pkg/mcs/tso/server/OWNERS new file mode 100644 index 00000000000..aa02465dbd9 --- /dev/null +++ b/pkg/mcs/tso/server/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/member/member.go b/pkg/member/member.go index af504d83963..bbf46d8f167 100644 --- a/pkg/member/member.go +++ b/pkg/member/member.go @@ -182,11 +182,12 @@ func (m *EmbeddedEtcdMember) GetLastLeaderUpdatedTime() time.Time { // and make it become a PD leader. // leader should be changed when campaign leader frequently. func (m *EmbeddedEtcdMember) CampaignLeader(ctx context.Context, leaseTimeout int64) error { + m.leadership.AddCampaignTimes() failpoint.Inject("skipCampaignLeaderCheck", func() { failpoint.Return(m.leadership.Campaign(leaseTimeout, m.MemberValue())) }) - if m.leadership.GetCampaignTimesNum() >= campaignLeaderFrequencyTimes { + if m.leadership.GetCampaignTimesNum() > campaignLeaderFrequencyTimes { if err := m.ResignEtcdLeader(ctx, m.Name(), ""); err != nil { return err } diff --git a/pkg/mock/mockcluster/mockcluster.go b/pkg/mock/mockcluster/mockcluster.go index e5b3e39a502..5d3aba2d2e8 100644 --- a/pkg/mock/mockcluster/mockcluster.go +++ b/pkg/mock/mockcluster/mockcluster.go @@ -138,11 +138,6 @@ func (mc *Cluster) GetStoresLoads() map[uint64][]float64 { return mc.HotStat.GetStoresLoads() } -// GetStore gets a store with a given store ID. -func (mc *Cluster) GetStore(storeID uint64) *core.StoreInfo { - return mc.Stores.GetStore(storeID) -} - // IsRegionHot checks if the region is hot. func (mc *Cluster) IsRegionHot(region *core.RegionInfo) bool { return mc.HotCache.IsRegionHot(region, mc.GetHotRegionCacheHitsThreshold()) @@ -561,11 +556,6 @@ func (mc *Cluster) AddLeaderRegionWithWriteInfo( return items } -// DropCacheAllRegion removes all regions from the cache. -func (mc *Cluster) DropCacheAllRegion() { - mc.ResetRegionCache() -} - // UpdateStoreLeaderWeight updates store leader weight. 
func (mc *Cluster) UpdateStoreLeaderWeight(storeID uint64, weight float64) { store := mc.GetStore(storeID) @@ -752,7 +742,7 @@ func (mc *Cluster) UpdateStoreStatus(id uint64) { pendingPeerCount := mc.GetStorePendingPeerCount(id) leaderSize := mc.GetStoreLeaderRegionSize(id) regionSize := mc.GetStoreRegionSize(id) - store := mc.Stores.GetStore(id) + store := mc.GetStore(id) stats := &pdpb.StoreStats{} stats.Capacity = defaultStoreCapacity stats.Available = stats.Capacity - uint64(store.GetRegionSize()*units.MiB) @@ -896,14 +886,7 @@ func (mc *Cluster) CheckRegionRead(region *core.RegionInfo) []*statistics.HotPee items = append(items, expiredItems...) reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), interval) - item := mc.HotCache.CheckReadPeerSync(peerInfo, region) - if item != nil { - items = append(items, item) - } - } - return items + return append(items, mc.HotCache.CheckReadPeerSync(region, region.GetPeers(), region.GetLoads(), interval)...) } // CheckRegionWrite checks region write info with all peers @@ -913,14 +896,7 @@ func (mc *Cluster) CheckRegionWrite(region *core.RegionInfo) []*statistics.HotPe items = append(items, expiredItems...) reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), interval) - item := mc.HotCache.CheckWritePeerSync(peerInfo, region) - if item != nil { - items = append(items, item) - } - } - return items + return append(items, mc.HotCache.CheckWritePeerSync(region, region.GetPeers(), region.GetLoads(), interval)...) } // CheckRegionLeaderRead checks region read info with leader peer @@ -930,13 +906,7 @@ func (mc *Cluster) CheckRegionLeaderRead(region *core.RegionInfo) []*statistics. items = append(items, expiredItems...) reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() - peer := region.GetLeader() - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), interval) - item := mc.HotCache.CheckReadPeerSync(peerInfo, region) - if item != nil { - items = append(items, item) - } - return items + return append(items, mc.HotCache.CheckReadPeerSync(region, []*metapb.Peer{region.GetLeader()}, region.GetLoads(), interval)...) } // ObserveRegionsStats records the current stores stats from region stats. 
diff --git a/pkg/schedule/checker/rule_checker_test.go b/pkg/schedule/checker/rule_checker_test.go index e69b956134b..e1cc702fd36 100644 --- a/pkg/schedule/checker/rule_checker_test.go +++ b/pkg/schedule/checker/rule_checker_test.go @@ -1980,7 +1980,7 @@ func makeStores() placement.StoreSet { if zone == 1 && host == 1 { labels["type"] = "read" } - stores.SetStore(core.NewStoreInfoWithLabel(id, labels).Clone(core.SetLastHeartbeatTS(now), core.SetStoreState(metapb.StoreState_Up))) + stores.PutStore(core.NewStoreInfoWithLabel(id, labels).Clone(core.SetLastHeartbeatTS(now), core.SetStoreState(metapb.StoreState_Up))) } } } diff --git a/pkg/schedule/config/OWNERS b/pkg/schedule/config/OWNERS new file mode 100644 index 00000000000..ce5d15ddc19 --- /dev/null +++ b/pkg/schedule/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|(config|store_config)\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/schedule/operator/operator.go b/pkg/schedule/operator/operator.go index de197c4fba4..4d57d4fc6c7 100644 --- a/pkg/schedule/operator/operator.go +++ b/pkg/schedule/operator/operator.go @@ -376,10 +376,11 @@ func (o *Operator) Check(region *core.RegionInfo) OpStep { defer func() { _ = o.CheckTimeout() }() for step := atomic.LoadInt32(&o.currentStep); int(step) < len(o.steps); step++ { if o.steps[int(step)].IsFinish(region) { - if atomic.CompareAndSwapInt64(&(o.stepsTime[step]), 0, time.Now().UnixNano()) { + current := time.Now() + if atomic.CompareAndSwapInt64(&(o.stepsTime[step]), 0, current.UnixNano()) { startTime, _ := o.getCurrentTimeAndStep() operatorStepDuration.WithLabelValues(reflect.TypeOf(o.steps[int(step)]).Name()). - Observe(time.Unix(0, o.stepsTime[step]).Sub(startTime).Seconds()) + Observe(current.Sub(startTime).Seconds()) } atomic.StoreInt32(&o.currentStep, step+1) } else { diff --git a/pkg/schedule/operator/operator_controller.go b/pkg/schedule/operator/operator_controller.go index d63e843f52a..fe93bd98756 100644 --- a/pkg/schedule/operator/operator_controller.go +++ b/pkg/schedule/operator/operator_controller.go @@ -461,7 +461,7 @@ func (oc *Controller) checkAddOperator(isPromoting bool, ops ...*Operator) (bool return false, NotInCreateStatus } if !isPromoting && oc.wopStatus.getCount(op.Desc()) >= oc.config.GetSchedulerMaxWaitingOperator() { - log.Debug("exceed max return false", zap.Uint64("waiting", oc.wopStatus.ops[op.Desc()]), zap.String("desc", op.Desc()), zap.Uint64("max", oc.config.GetSchedulerMaxWaitingOperator())) + log.Debug("exceed max return false", zap.Uint64("waiting", oc.wopStatus.getCount(op.Desc())), zap.String("desc", op.Desc()), zap.Uint64("max", oc.config.GetSchedulerMaxWaitingOperator())) operatorCounter.WithLabelValues(op.Desc(), "exceed-max-waiting").Inc() return false, ExceedWaitLimit } diff --git a/pkg/schedule/operator/operator_controller_test.go b/pkg/schedule/operator/operator_controller_test.go index d3c50667fe0..2b16516c4c7 100644 --- a/pkg/schedule/operator/operator_controller_test.go +++ b/pkg/schedule/operator/operator_controller_test.go @@ -955,3 +955,40 @@ func (suite *operatorControllerTestSuite) TestInvalidStoreId() { // Although store 3 does not exist in PD, PD can also send op to TiKV. 
re.Equal(pdpb.OperatorStatus_RUNNING, oc.GetOperatorStatus(1).Status) } + +func TestConcurrentAddOperatorAndSetStoreLimit(t *testing.T) { + re := require.New(t) + opt := mockconfig.NewTestOptions() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + tc := mockcluster.NewCluster(ctx, opt) + stream := hbstream.NewTestHeartbeatStreams(ctx, tc.ID, tc, false /* no need to run */) + oc := NewController(ctx, tc.GetBasicCluster(), tc.GetSharedConfig(), stream) + + regionNum := 1000 + limit := 1600.0 + storeID := uint64(2) + for i := 1; i < 4; i++ { + tc.AddRegionStore(uint64(i), regionNum) + tc.SetStoreLimit(uint64(i), storelimit.AddPeer, limit) + } + for i := 1; i <= regionNum; i++ { + tc.AddLeaderRegion(uint64(i), 1, 3, 4) + } + + // Add operator and set store limit concurrently + var wg sync.WaitGroup + for i := 1; i < 10; i++ { + wg.Add(1) + go func(i uint64) { + defer wg.Done() + for j := 1; j < 10; j++ { + regionID := uint64(j) + i*100 + op := NewTestOperator(regionID, tc.GetRegion(regionID).GetRegionEpoch(), OpRegion, AddPeer{ToStore: storeID, PeerID: regionID}) + re.True(oc.AddOperator(op)) + tc.SetStoreLimit(storeID, storelimit.AddPeer, limit-float64(j)) // every goroutine set a different limit + } + }(uint64(i)) + } + wg.Wait() +} diff --git a/pkg/schedule/operator/operator_test.go b/pkg/schedule/operator/operator_test.go index 693f5c17475..1f44d813f1e 100644 --- a/pkg/schedule/operator/operator_test.go +++ b/pkg/schedule/operator/operator_test.go @@ -17,6 +17,7 @@ package operator import ( "context" "encoding/json" + "sync" "sync/atomic" "testing" "time" @@ -570,3 +571,27 @@ func (suite *operatorTestSuite) TestToJSONObject() { obj = op.ToJSONObject() suite.Equal(TIMEOUT, obj.Status) } + +func TestOperatorCheckConcurrently(t *testing.T) { + re := require.New(t) + region := newTestRegion(1, 1, [2]uint64{1, 1}, [2]uint64{2, 2}) + // addPeer1, transferLeader1, removePeer3 + steps := []OpStep{ + AddPeer{ToStore: 1, PeerID: 1}, + TransferLeader{FromStore: 3, ToStore: 1}, + RemovePeer{FromStore: 3}, + } + op := NewTestOperator(1, &metapb.RegionEpoch{}, OpAdmin|OpLeader|OpRegion, steps...) 
+ re.Equal(constant.Urgent, op.GetPriorityLevel()) + checkSteps(re, op, steps) + op.Start() + var wg sync.WaitGroup + for i := 0; i < 10; i++ { + wg.Add(1) + go func() { + defer wg.Done() + re.Nil(op.Check(region)) + }() + } + wg.Wait() +} diff --git a/pkg/schedule/placement/fit_test.go b/pkg/schedule/placement/fit_test.go index aa5c66059f7..cc49d25640c 100644 --- a/pkg/schedule/placement/fit_test.go +++ b/pkg/schedule/placement/fit_test.go @@ -47,7 +47,7 @@ func makeStores() StoreSet { if id == 1111 || id == 2111 || id == 3111 { labels["disk"] = "ssd" } - stores.SetStore(core.NewStoreInfoWithLabel(id, labels).Clone(core.SetLastHeartbeatTS(now))) + stores.PutStore(core.NewStoreInfoWithLabel(id, labels).Clone(core.SetLastHeartbeatTS(now))) } } } diff --git a/pkg/schedule/scatter/region_scatterer_test.go b/pkg/schedule/scatter/region_scatterer_test.go index b0027e0e415..89e55e5c9c7 100644 --- a/pkg/schedule/scatter/region_scatterer_test.go +++ b/pkg/schedule/scatter/region_scatterer_test.go @@ -216,7 +216,7 @@ func scatterSpecial(re *require.Assertions, numOrdinaryStores, numSpecialStores, leaderStoreID := region.GetLeader().GetStoreId() for _, peer := range region.GetPeers() { storeID := peer.GetStoreId() - store := tc.Stores.GetStore(storeID) + store := tc.GetStore(storeID) if store.GetLabelValue("engine") == "tiflash" { countSpecialPeers[storeID]++ } else { diff --git a/pkg/schedule/schedulers/OWNERS b/pkg/schedule/schedulers/OWNERS new file mode 100644 index 00000000000..ae96e4f1f42 --- /dev/null +++ b/pkg/schedule/schedulers/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|hot_region_config\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/pkg/schedule/schedulers/balance_test.go b/pkg/schedule/schedulers/balance_test.go index 234acfd6d26..26214ed5456 100644 --- a/pkg/schedule/schedulers/balance_test.go +++ b/pkg/schedule/schedulers/balance_test.go @@ -697,7 +697,7 @@ func (suite *balanceLeaderRangeSchedulerTestSuite) TestReSortStores() { suite.tc.AddLeaderStore(4, 100) suite.tc.AddLeaderStore(5, 100) suite.tc.AddLeaderStore(6, 0) - stores := suite.tc.Stores.GetStores() + stores := suite.tc.GetStores() sort.Slice(stores, func(i, j int) bool { return stores[i].GetID() < stores[j].GetID() }) diff --git a/pkg/schedule/schedulers/evict_slow_trend_test.go b/pkg/schedule/schedulers/evict_slow_trend_test.go index 834ef337639..dd6807f4a85 100644 --- a/pkg/schedule/schedulers/evict_slow_trend_test.go +++ b/pkg/schedule/schedulers/evict_slow_trend_test.go @@ -105,7 +105,7 @@ func (suite *evictSlowTrendTestSuite) TestEvictSlowTrendBasicFuncs() { re.Equal(slowCandidate{}, es2.conf.evictCandidate) es2.conf.markCandidateRecovered() lastCapturedCandidate = es2.conf.lastCapturedCandidate() - re.Greater(lastCapturedCandidate.recoverTS.Compare(recoverTS), 0) + re.Positive(lastCapturedCandidate.recoverTS.Compare(recoverTS)) re.Equal(lastCapturedCandidate.storeID, store.GetID()) // Test capture another store 2 diff --git a/pkg/statistics/hot_cache.go b/pkg/statistics/hot_cache.go index 799fb240d10..26548c8b47e 100644 --- a/pkg/statistics/hot_cache.go +++ b/pkg/statistics/hot_cache.go @@ -17,6 +17,7 @@ package statistics import ( "context" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/smallnest/chanx" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/statistics/utils" @@ -172,14 +173,14 @@ func (w *HotCache) Update(item *HotPeerStat, kind utils.RWType) { // CheckWritePeerSync checks the write 
status, returns update items. // This is used for mockcluster, for test purpose. -func (w *HotCache) CheckWritePeerSync(peer *core.PeerInfo, region *core.RegionInfo) *HotPeerStat { - return w.writeCache.checkPeerFlow(peer, region) +func (w *HotCache) CheckWritePeerSync(region *core.RegionInfo, peers []*metapb.Peer, loads []float64, interval uint64) []*HotPeerStat { + return w.writeCache.checkPeerFlow(region, peers, loads, interval) } // CheckReadPeerSync checks the read status, returns update items. // This is used for mockcluster, for test purpose. -func (w *HotCache) CheckReadPeerSync(peer *core.PeerInfo, region *core.RegionInfo) *HotPeerStat { - return w.readCache.checkPeerFlow(peer, region) +func (w *HotCache) CheckReadPeerSync(region *core.RegionInfo, peers []*metapb.Peer, loads []float64, interval uint64) []*HotPeerStat { + return w.readCache.checkPeerFlow(region, peers, loads, interval) } // ExpiredReadItems returns the read items which are already expired. diff --git a/pkg/statistics/hot_cache_task.go b/pkg/statistics/hot_cache_task.go index fa224b522ff..01731f3fe4d 100644 --- a/pkg/statistics/hot_cache_task.go +++ b/pkg/statistics/hot_cache_task.go @@ -17,6 +17,7 @@ package statistics import ( "context" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" ) @@ -25,22 +26,46 @@ type FlowItemTask interface { runTask(cache *hotPeerCache) } -type checkPeerTask struct { - peerInfo *core.PeerInfo +type checkReadPeerTask struct { regionInfo *core.RegionInfo + peers []*metapb.Peer + loads []float64 + interval uint64 } -// NewCheckPeerTask creates task to update peerInfo -func NewCheckPeerTask(peerInfo *core.PeerInfo, regionInfo *core.RegionInfo) FlowItemTask { - return &checkPeerTask{ - peerInfo: peerInfo, +// NewCheckReadPeerTask creates task to update peerInfo +func NewCheckReadPeerTask(regionInfo *core.RegionInfo, peers []*metapb.Peer, loads []float64, interval uint64) FlowItemTask { + return &checkReadPeerTask{ regionInfo: regionInfo, + peers: peers, + loads: loads, + interval: interval, } } -func (t *checkPeerTask) runTask(cache *hotPeerCache) { - stat := cache.checkPeerFlow(t.peerInfo, t.regionInfo) - if stat != nil { +func (t *checkReadPeerTask) runTask(cache *hotPeerCache) { + stats := cache.checkPeerFlow(t.regionInfo, t.peers, t.loads, t.interval) + for _, stat := range stats { + cache.updateStat(stat) + } +} + +type checkWritePeerTask struct { + region *core.RegionInfo +} + +// NewCheckWritePeerTask creates task to update peerInfo +func NewCheckWritePeerTask(region *core.RegionInfo) FlowItemTask { + return &checkWritePeerTask{ + region: region, + } +} + +func (t *checkWritePeerTask) runTask(cache *hotPeerCache) { + reportInterval := t.region.GetInterval() + interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() + stats := cache.checkPeerFlow(t.region, t.region.GetPeers(), t.region.GetWriteLoads(), interval) + for _, stat := range stats { cache.updateStat(stat) } } diff --git a/pkg/statistics/hot_peer_cache.go b/pkg/statistics/hot_peer_cache.go index cd27dcad4c8..3a3d3519bd9 100644 --- a/pkg/statistics/hot_peer_cache.go +++ b/pkg/statistics/hot_peer_cache.go @@ -174,58 +174,61 @@ func (f *hotPeerCache) collectExpiredItems(region *core.RegionInfo) []*HotPeerSt // checkPeerFlow checks the flow information of a peer. // Notice: checkPeerFlow couldn't be used concurrently. // checkPeerFlow will update oldItem's rollingLoads into newItem, thus we should use write lock here. 
-func (f *hotPeerCache) checkPeerFlow(peer *core.PeerInfo, region *core.RegionInfo) *HotPeerStat { - interval := peer.GetInterval() +func (f *hotPeerCache) checkPeerFlow(region *core.RegionInfo, peers []*metapb.Peer, deltaLoads []float64, interval uint64) []*HotPeerStat { if Denoising && interval < HotRegionReportMinInterval { // for test or simulator purpose return nil } - storeID := peer.GetStoreId() - deltaLoads := peer.GetLoads() + f.collectPeerMetrics(deltaLoads, interval) // update metrics regionID := region.GetID() - oldItem := f.getOldHotPeerStat(regionID, storeID) - - // check whether the peer is allowed to be inherited - source := utils.Direct - if oldItem == nil { - for _, storeID := range f.getAllStoreIDs(region) { - oldItem = f.getOldHotPeerStat(regionID, storeID) - if oldItem != nil && oldItem.allowInherited { - source = utils.Inherit - break + + regionPeers := region.GetPeers() + stats := make([]*HotPeerStat, 0, len(peers)) + for _, peer := range peers { + storeID := peer.GetStoreId() + oldItem := f.getOldHotPeerStat(regionID, storeID) + + // check whether the peer is allowed to be inherited + source := utils.Direct + if oldItem == nil { + for _, storeID := range f.getAllStoreIDs(region) { + oldItem = f.getOldHotPeerStat(regionID, storeID) + if oldItem != nil && oldItem.allowInherited { + source = utils.Inherit + break + } } } - } - - // check new item whether is hot - if oldItem == nil { - regionStats := f.kind.RegionStats() - thresholds := f.calcHotThresholds(storeID) - isHot := slice.AnyOf(regionStats, func(i int) bool { - return deltaLoads[regionStats[i]]/float64(interval) >= thresholds[i] - }) - if !isHot { - return nil + // check new item whether is hot + if oldItem == nil { + regionStats := f.kind.RegionStats() + thresholds := f.calcHotThresholds(storeID) + isHot := slice.AnyOf(regionStats, func(i int) bool { + return deltaLoads[regionStats[i]]/float64(interval) >= thresholds[i] + }) + if !isHot { + continue + } } - } - - peers := region.GetPeers() - newItem := &HotPeerStat{ - StoreID: storeID, - RegionID: regionID, - Loads: f.kind.GetLoadRatesFromPeer(peer), - isLeader: region.GetLeader().GetStoreId() == storeID, - actionType: utils.Update, - stores: make([]uint64, len(peers)), - } - for i, peer := range peers { - newItem.stores[i] = peer.GetStoreId() - } - if oldItem == nil { - return f.updateNewHotPeerStat(newItem, deltaLoads, time.Duration(interval)*time.Second) + newItem := &HotPeerStat{ + StoreID: storeID, + RegionID: regionID, + Loads: f.kind.GetLoadRates(deltaLoads, interval), + isLeader: region.GetLeader().GetStoreId() == storeID, + actionType: utils.Update, + stores: make([]uint64, len(regionPeers)), + } + for i, peer := range regionPeers { + newItem.stores[i] = peer.GetStoreId() + } + if oldItem == nil { + stats = append(stats, f.updateNewHotPeerStat(newItem, deltaLoads, time.Duration(interval)*time.Second)) + continue + } + stats = append(stats, f.updateHotPeerStat(region, newItem, oldItem, deltaLoads, time.Duration(interval)*time.Second, source)) } - return f.updateHotPeerStat(region, newItem, oldItem, deltaLoads, time.Duration(interval)*time.Second, source) + return stats } // checkColdPeer checks the collect the un-heartbeat peer and maintain it. 
diff --git a/pkg/statistics/hot_peer_cache_test.go b/pkg/statistics/hot_peer_cache_test.go index 36f922d3830..db215238604 100644 --- a/pkg/statistics/hot_peer_cache_test.go +++ b/pkg/statistics/hot_peer_cache_test.go @@ -109,14 +109,7 @@ func checkFlow(cache *hotPeerCache, region *core.RegionInfo, peers []*metapb.Pee reportInterval := region.GetInterval() interval := reportInterval.GetEndTimestamp() - reportInterval.GetStartTimestamp() res = append(res, cache.collectExpiredItems(region)...) - for _, peer := range peers { - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), interval) - item := cache.checkPeerFlow(peerInfo, region) - if item != nil { - res = append(res, item) - } - } - return res + return append(res, cache.checkPeerFlow(region, peers, region.GetLoads(), interval)...) } func updateFlow(cache *hotPeerCache, res []*HotPeerStat) []*HotPeerStat { @@ -318,13 +311,13 @@ func TestUpdateHotPeerStat(t *testing.T) { }() // skip interval=0 - interval := 0 + interval := uint64(0) deltaLoads := []float64{0.0, 0.0, 0.0} utils.MinHotThresholds[utils.RegionReadBytes] = 0.0 utils.MinHotThresholds[utils.RegionReadKeys] = 0.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 0.0 - newItem := cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) + newItem := cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.Nil(newItem) // new peer, interval is larger than report interval, but no hot @@ -333,8 +326,8 @@ func TestUpdateHotPeerStat(t *testing.T) { utils.MinHotThresholds[utils.RegionReadBytes] = 1.0 utils.MinHotThresholds[utils.RegionReadKeys] = 1.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 1.0 - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Nil(newItem) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Empty(newItem) // new peer, interval is less than report interval interval = 4 @@ -342,50 +335,49 @@ func TestUpdateHotPeerStat(t *testing.T) { utils.MinHotThresholds[utils.RegionReadBytes] = 0.0 utils.MinHotThresholds[utils.RegionReadKeys] = 0.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 0.0 - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) re.NotNil(newItem) - re.Equal(0, newItem.HotDegree) - re.Equal(0, newItem.AntiCount) + re.Equal(0, newItem[0].HotDegree) + re.Equal(0, newItem[0].AntiCount) // sum of interval is less than report interval - interval = 4 deltaLoads = []float64{60.0, 60.0, 60.0} - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(0, newItem.HotDegree) - re.Equal(0, newItem.AntiCount) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(0, newItem[0].HotDegree) + re.Equal(0, newItem[0].AntiCount) // sum of interval is larger than report interval, and hot - newItem.AntiCount = utils.Read.DefaultAntiCount() - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(1, newItem.HotDegree) - re.Equal(2*m, newItem.AntiCount) + newItem[0].AntiCount = utils.Read.DefaultAntiCount() + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(1, newItem[0].HotDegree) + re.Equal(2*m, newItem[0].AntiCount) // sum of 
interval is less than report interval - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(1, newItem.HotDegree) - re.Equal(2*m, newItem.AntiCount) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(1, newItem[0].HotDegree) + re.Equal(2*m, newItem[0].AntiCount) // sum of interval is larger than report interval, and hot interval = 10 - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(2, newItem.HotDegree) - re.Equal(2*m, newItem.AntiCount) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(2, newItem[0].HotDegree) + re.Equal(2*m, newItem[0].AntiCount) // sum of interval is larger than report interval, and cold utils.MinHotThresholds[utils.RegionReadBytes] = 10.0 utils.MinHotThresholds[utils.RegionReadKeys] = 10.0 utils.MinHotThresholds[utils.RegionReadQueryNum] = 10.0 - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) - re.Equal(1, newItem.HotDegree) - re.Equal(2*m-1, newItem.AntiCount) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) + re.Equal(1, newItem[0].HotDegree) + re.Equal(2*m-1, newItem[0].AntiCount) // sum of interval is larger than report interval, and cold for i := 0; i < 2*m-1; i++ { - cache.updateStat(newItem) - newItem = cache.checkPeerFlow(core.NewPeerInfo(peer, deltaLoads, uint64(interval)), region) + cache.updateStat(newItem[0]) + newItem = cache.checkPeerFlow(region, []*metapb.Peer{peer}, deltaLoads, interval) } - re.Less(newItem.HotDegree, 0) - re.Equal(0, newItem.AntiCount) - re.Equal(utils.Remove, newItem.actionType) + re.Negative(newItem[0].HotDegree) + re.Equal(0, newItem[0].AntiCount) + re.Equal(utils.Remove, newItem[0].actionType) } func TestThresholdWithUpdateHotPeerStat(t *testing.T) { @@ -688,9 +680,8 @@ func TestHotPeerCacheTopNThreshold(t *testing.T) { StartTimestamp: start, EndTimestamp: end, })) - newPeer := core.NewPeerInfo(meta.Peers[0], region.GetLoads(), end-start) - stat := cache.checkPeerFlow(newPeer, newRegion) - if stat != nil { + stats := cache.checkPeerFlow(newRegion, newRegion.GetPeers(), newRegion.GetLoads(), end-start) + for _, stat := range stats { cache.updateStat(stat) } } @@ -717,22 +708,11 @@ func TestHotPeerCacheTopNThreshold(t *testing.T) { func BenchmarkCheckRegionFlow(b *testing.B) { cache := NewHotPeerCache(context.Background(), utils.Read) region := buildRegion(utils.Read, 3, 10) - peerInfos := make([]*core.PeerInfo, 0) - for _, peer := range region.GetPeers() { - peerInfo := core.NewPeerInfo(peer, region.GetLoads(), 10) - peerInfos = append(peerInfos, peerInfo) - } b.ResetTimer() for i := 0; i < b.N; i++ { - items := make([]*HotPeerStat, 0) - for _, peerInfo := range peerInfos { - item := cache.checkPeerFlow(peerInfo, region) - if item != nil { - items = append(items, item) - } - } - for _, ret := range items { - cache.updateStat(ret) + stats := cache.checkPeerFlow(region, region.GetPeers(), region.GetLoads(), 10) + for _, stat := range stats { + cache.updateStat(stat) } } } diff --git a/pkg/statistics/utils/kind.go b/pkg/statistics/utils/kind.go index 4d44b8d57e1..089732f759f 100644 --- a/pkg/statistics/utils/kind.go +++ b/pkg/statistics/utils/kind.go @@ -14,10 +14,6 @@ package utils -import 
( - "github.com/tikv/pd/pkg/core" -) - const ( // BytePriority indicates hot-region-scheduler prefer byte dim BytePriority = "byte" @@ -230,10 +226,8 @@ func (rw RWType) DefaultAntiCount() int { } } -// GetLoadRatesFromPeer gets the load rates of the read or write type from PeerInfo. -func (rw RWType) GetLoadRatesFromPeer(peer *core.PeerInfo) []float64 { - deltaLoads := peer.GetLoads() - interval := peer.GetInterval() +// GetLoadRates gets the load rates of the read or write type. +func (rw RWType) GetLoadRates(deltaLoads []float64, interval uint64) []float64 { loads := make([]float64, DimLen) for dim, k := range rw.RegionStats() { loads[dim] = deltaLoads[k] / float64(interval) diff --git a/pkg/storage/leveldb_backend.go b/pkg/storage/leveldb_backend.go old mode 100644 new mode 100755 diff --git a/pkg/storage/storage_test.go b/pkg/storage/storage_test.go index 4525ec6091c..460489ecd10 100644 --- a/pkg/storage/storage_test.go +++ b/pkg/storage/storage_test.go @@ -100,7 +100,7 @@ func TestLoadStores(t *testing.T) { n := 10 stores := mustSaveStores(re, storage, n) - re.NoError(storage.LoadStores(cache.SetStore)) + re.NoError(storage.LoadStores(cache.PutStore)) re.Equal(n, cache.GetStoreCount()) for _, store := range cache.GetMetaStores() { @@ -117,7 +117,7 @@ func TestStoreWeight(t *testing.T) { mustSaveStores(re, storage, n) re.NoError(storage.SaveStoreWeight(1, 2.0, 3.0)) re.NoError(storage.SaveStoreWeight(2, 0.2, 0.3)) - re.NoError(storage.LoadStores(cache.SetStore)) + re.NoError(storage.LoadStores(cache.PutStore)) leaderWeights := []float64{1.0, 2.0, 0.2} regionWeights := []float64{1.0, 3.0, 0.3} for i := 0; i < n; i++ { diff --git a/pkg/unsaferecovery/unsafe_recovery_controller.go b/pkg/unsaferecovery/unsafe_recovery_controller.go index d2f6125c3f3..89cd6e6393c 100644 --- a/pkg/unsaferecovery/unsafe_recovery_controller.go +++ b/pkg/unsaferecovery/unsafe_recovery_controller.go @@ -107,7 +107,7 @@ const ( type cluster interface { core.StoreSetInformer - DropCacheAllRegion() + ResetRegionCache() AllocID() (uint64, error) BuryStore(storeID uint64, forceBury bool) error GetSchedulerConfig() sc.SchedulerConfigProvider @@ -544,7 +544,7 @@ func (u *Controller) changeStage(stage stage) { case Finished: if u.step > 1 { // == 1 means no operation has done, no need to invalid cache - u.cluster.DropCacheAllRegion() + u.cluster.ResetRegionCache() } output.Info = "Unsafe recovery Finished" output.Details = u.getAffectedTableDigest() diff --git a/pkg/utils/apiutil/multiservicesapi/middleware.go b/pkg/utils/apiutil/multiservicesapi/middleware.go index ed34ecc6afb..4343adcc981 100644 --- a/pkg/utils/apiutil/multiservicesapi/middleware.go +++ b/pkg/utils/apiutil/multiservicesapi/middleware.go @@ -48,8 +48,8 @@ func ServiceRedirector() gin.HandlerFunc { // Prevent more than one redirection. 
if name := c.Request.Header.Get(ServiceRedirectorHeader); len(name) != 0 { - log.Error("redirect but server is not primary", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirect)) - c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirect.FastGenByArgs().Error()) + log.Error("redirect but server is not primary", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirectToNotPrimary)) + c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirectToNotPrimary.FastGenByArgs().Error()) return } diff --git a/pkg/utils/apiutil/serverapi/middleware.go b/pkg/utils/apiutil/serverapi/middleware.go index 2432e15c967..0718702b5a5 100755 --- a/pkg/utils/apiutil/serverapi/middleware.go +++ b/pkg/utils/apiutil/serverapi/middleware.go @@ -18,7 +18,9 @@ import ( "net/http" "net/url" "strings" + "time" + "github.com/pingcap/kvproto/pkg/pdpb" "github.com/pingcap/log" "github.com/tikv/pd/pkg/errs" mcsutils "github.com/tikv/pd/pkg/mcs/utils" @@ -204,20 +206,25 @@ func (h *redirector) ServeHTTP(w http.ResponseWriter, r *http.Request, next http clientUrls = append(clientUrls, targetAddr) // Add a header to the response, it is used to mark whether the request has been forwarded to the micro service. w.Header().Add(apiutil.XForwardedToMicroServiceHeader, "true") - } else { - leader := h.s.GetMember().GetLeader() + } else if name := r.Header.Get(apiutil.PDRedirectorHeader); len(name) == 0 { + leader := h.waitForLeader(r) + // The leader has not been elected yet. if leader == nil { - http.Error(w, "no leader", http.StatusServiceUnavailable) + http.Error(w, errs.ErrRedirectNoLeader.FastGenByArgs().Error(), http.StatusServiceUnavailable) return } - clientUrls = leader.GetClientUrls() - // Prevent more than one redirection among PD/API servers. - if name := r.Header.Get(apiutil.PDRedirectorHeader); len(name) != 0 { - log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", h.s.Name()), errs.ZapError(errs.ErrRedirect)) - http.Error(w, errs.ErrRedirectToNotLeader.FastGenByArgs().Error(), http.StatusInternalServerError) + // If the leader is the current server now, we can handle the request directly. + if h.s.GetMember().IsLeader() || leader.GetName() == h.s.Name() { + next(w, r) return } + clientUrls = leader.GetClientUrls() r.Header.Set(apiutil.PDRedirectorHeader, h.s.Name()) + } else { + // Prevent more than one redirection among PD/API servers. + log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", h.s.Name()), errs.ZapError(errs.ErrRedirectToNotLeader)) + http.Error(w, errs.ErrRedirectToNotLeader.FastGenByArgs().Error(), http.StatusInternalServerError) + return } urls := make([]url.URL, 0, len(clientUrls)) @@ -233,3 +240,38 @@ func (h *redirector) ServeHTTP(w http.ResponseWriter, r *http.Request, next http client := h.s.GetHTTPClient() apiutil.NewCustomReverseProxies(client, urls).ServeHTTP(w, r) } + +const ( + backoffMaxDelay = 3 * time.Second + backoffInterval = 100 * time.Millisecond +) + +// If current server does not have a leader, backoff to increase the chance of success. 
+func (h *redirector) waitForLeader(r *http.Request) (leader *pdpb.Member) { + var ( + interval = backoffInterval + maxDelay = backoffMaxDelay + curDelay = time.Duration(0) + ) + for { + leader = h.s.GetMember().GetLeader() + if leader != nil { + return + } + select { + case <-time.After(interval): + curDelay += interval + if curDelay >= maxDelay { + return + } + interval *= 2 + if curDelay+interval > maxDelay { + interval = maxDelay - curDelay + } + case <-r.Context().Done(): + return + case <-h.s.LoopContext().Done(): + return + } + } +} diff --git a/pkg/utils/configutil/configutil.go b/pkg/utils/configutil/configutil.go index 2e7c74d9f8c..086f74ff842 100644 --- a/pkg/utils/configutil/configutil.go +++ b/pkg/utils/configutil/configutil.go @@ -171,3 +171,10 @@ func AdjustPath(p *string) { *p = absPath } } + +// AdjustBool adjusts the value of a bool variable. +func AdjustBool(v *bool, defValue bool) { + if !*v { + *v = defValue + } +} diff --git a/server/api/admin.go b/server/api/admin.go index ab5ba882287..dd81985b514 100644 --- a/server/api/admin.go +++ b/server/api/admin.go @@ -60,7 +60,7 @@ func (h *adminHandler) DeleteRegionCache(w http.ResponseWriter, r *http.Request) h.rd.JSON(w, http.StatusBadRequest, err.Error()) return } - rc.DropCacheRegion(regionID) + rc.RemoveRegionIfExist(regionID) if h.svr.IsServiceIndependent(utils.SchedulingServiceName) { err = h.DeleteRegionCacheInSchedulingServer(regionID) } @@ -100,7 +100,7 @@ func (h *adminHandler) DeleteRegionStorage(w http.ResponseWriter, r *http.Reques return } // Remove region from cache. - rc.DropCacheRegion(regionID) + rc.RemoveRegionIfExist(regionID) if h.svr.IsServiceIndependent(utils.SchedulingServiceName) { err = h.DeleteRegionCacheInSchedulingServer(regionID) } @@ -116,7 +116,7 @@ func (h *adminHandler) DeleteRegionStorage(w http.ResponseWriter, r *http.Reques func (h *adminHandler) DeleteAllRegionCache(w http.ResponseWriter, r *http.Request) { var err error rc := getCluster(r) - rc.DropCacheAllRegion() + rc.ResetRegionCache() if h.svr.IsServiceIndependent(utils.SchedulingServiceName) { err = h.DeleteRegionCacheInSchedulingServer() } diff --git a/server/api/stats.go b/server/api/stats.go index 915d33ddfdf..5aa8fcb72a6 100644 --- a/server/api/stats.go +++ b/server/api/stats.go @@ -47,7 +47,7 @@ func (h *statsHandler) GetRegionStatus(w http.ResponseWriter, r *http.Request) { startKey, endKey := r.URL.Query().Get("start_key"), r.URL.Query().Get("end_key") var stats *statistics.RegionStats if r.URL.Query().Has("count") { - stats = rc.GetRegionCount([]byte(startKey), []byte(endKey)) + stats = rc.GetRegionStatsCount([]byte(startKey), []byte(endKey)) } else { stats = rc.GetRegionStatsByRange([]byte(startKey), []byte(endKey)) } diff --git a/server/apiv2/middlewares/redirector.go b/server/apiv2/middlewares/redirector.go index 37c06de1585..9c2c4081175 100644 --- a/server/apiv2/middlewares/redirector.go +++ b/server/apiv2/middlewares/redirector.go @@ -43,8 +43,8 @@ func Redirector() gin.HandlerFunc { // Prevent more than one redirection. 
if name := c.Request.Header.Get(apiutil.PDRedirectorHeader); len(name) != 0 { - log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirect)) - c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirect.FastGenByArgs().Error()) + log.Error("redirect but server is not leader", zap.String("from", name), zap.String("server", svr.Name()), errs.ZapError(errs.ErrRedirectToNotLeader)) + c.AbortWithStatusJSON(http.StatusInternalServerError, errs.ErrRedirectToNotLeader.FastGenByArgs().Error()) return } diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index 148b43541a2..70d6b46b980 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -107,9 +107,9 @@ const ( minSnapshotDurationSec = 5 // heartbeat relative const - heartbeatTaskRunner = "heartbeat-async" - statisticsTaskRunner = "statistics-async" - logTaskRunner = "log-async" + heartbeatTaskRunner = "heartbeat-async" + miscTaskRunner = "misc-async" + logTaskRunner = "log-async" ) // Server is the interface for cluster. @@ -143,6 +143,8 @@ type RaftCluster struct { ctx context.Context cancel context.CancelFunc + *core.BasicCluster // cached cluster info + etcdClient *clientv3.Client httpClient *http.Client @@ -159,7 +161,6 @@ type RaftCluster struct { // This below fields are all read-only, we cannot update itself after the raft cluster starts. clusterID uint64 id id.Allocator - core *core.BasicCluster // cached cluster info opt *config.PersistOptions limiter *StoreLimiter *schedulingController @@ -201,10 +202,10 @@ func NewRaftCluster(ctx context.Context, clusterID uint64, basicCluster *core.Ba regionSyncer: regionSyncer, httpClient: httpClient, etcdClient: etcdClient, - core: basicCluster, + BasicCluster: basicCluster, storage: storage, heartbeatRunner: ratelimit.NewConcurrentRunner(heartbeatTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), - miscRunner: ratelimit.NewConcurrentRunner(statisticsTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), + miscRunner: ratelimit.NewConcurrentRunner(miscTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), logRunner: ratelimit.NewConcurrentRunner(logTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute), } } @@ -251,10 +252,10 @@ func (c *RaftCluster) LoadClusterStatus() (*Status, error) { } func (c *RaftCluster) isInitialized() bool { - if c.core.GetTotalRegionCount() > 1 { + if c.GetTotalRegionCount() > 1 { return true } - region := c.core.GetRegionByKey(nil) + region := c.GetRegionByKey(nil) return region != nil && len(region.GetVoters()) >= int(c.opt.GetReplicationConfig().MaxReplicas) && len(region.GetPendingPeers()) == 0 @@ -295,7 +296,7 @@ func (c *RaftCluster) InitCluster( return err } } - c.schedulingController = newSchedulingController(c.ctx, c.core, c.opt, c.ruleManager) + c.schedulingController = newSchedulingController(c.ctx, c.BasicCluster, c.opt, c.ruleManager) return nil } @@ -644,9 +645,9 @@ func (c *RaftCluster) LoadClusterInfo() (*RaftCluster, error) { return nil, nil } - c.core.ResetStores() + c.ResetStores() start := time.Now() - if err := c.storage.LoadStores(c.core.PutStore); err != nil { + if err := c.storage.LoadStores(c.PutStore); err != nil { return nil, err } log.Info("load stores", @@ -657,11 +658,11 @@ func (c *RaftCluster) LoadClusterInfo() (*RaftCluster, error) { start = time.Now() // used to load region from kv 
storage to cache storage. - if err = storage.TryLoadRegionsOnce(c.ctx, c.storage, c.core.CheckAndPutRegion); err != nil { + if err = storage.TryLoadRegionsOnce(c.ctx, c.storage, c.CheckAndPutRegion); err != nil { return nil, err } log.Info("load regions", - zap.Int("count", c.core.GetTotalRegionCount()), + zap.Int("count", c.GetTotalRegionCount()), zap.Duration("cost", time.Since(start)), ) @@ -729,7 +730,7 @@ func (c *RaftCluster) runUpdateStoreStats() { case <-ticker.C: // Update related stores. start := time.Now() - c.core.UpdateAllStoreStatus() + c.UpdateAllStoreStatus() updateStoreStatsGauge.Set(time.Since(start).Seconds()) } } @@ -868,8 +869,6 @@ func (c *RaftCluster) GetUnsafeRecoveryController() *unsaferecovery.Controller { func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest, resp *pdpb.StoreHeartbeatResponse) error { stats := heartbeat.GetStats() storeID := stats.GetStoreId() - c.Lock() - defer c.Unlock() store := c.GetStore(storeID) if store == nil { return errors.Errorf("store %v not found", storeID) @@ -917,10 +916,10 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest newStore = newStore.Clone(core.SetLastPersistTime(nowTime)) } } - if store := c.core.GetStore(storeID); store != nil { + if store := c.GetStore(storeID); store != nil { statistics.UpdateStoreHeartbeatMetrics(store) } - c.core.PutStore(newStore) + c.PutStore(newStore) var ( regions map[uint64]*core.RegionInfo interval uint64 @@ -959,8 +958,7 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest utils.RegionWriteKeys: 0, utils.RegionWriteQueryNum: 0, } - peerInfo := core.NewPeerInfo(peer, loads, interval) - c.hotStat.CheckReadAsync(statistics.NewCheckPeerTask(peerInfo, region)) + c.hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{peer}, loads, interval)) } } for _, stat := range stats.GetSnapshotStats() { @@ -990,7 +988,7 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest // processReportBuckets update the bucket information. func (c *RaftCluster) processReportBuckets(buckets *metapb.Buckets) error { - region := c.core.GetRegion(buckets.GetRegionId()) + region := c.GetRegion(buckets.GetRegionId()) if region == nil { regionCacheMissCounter.Inc() return errors.Errorf("region %v not found", buckets.GetRegionId()) @@ -1023,7 +1021,7 @@ var syncRunner = ratelimit.NewSyncRunner() // processRegionHeartbeat updates the region information. func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, region *core.RegionInfo) error { tracer := ctx.Tracer - origin, _, err := c.core.PreCheckPutRegion(region) + origin, _, err := c.PreCheckPutRegion(region) tracer.OnPreCheckFinished() if err != nil { return err @@ -1083,7 +1081,7 @@ func (c *RaftCluster) processRegionHeartbeat(ctx *core.MetaProcessContext, regio // check its validation again here. // // However, it can't solve the race condition of concurrent heartbeats from the same region. - if overlaps, err = c.core.CheckAndPutRootTree(ctx, region); err != nil { + if overlaps, err = c.CheckAndPutRootTree(ctx, region); err != nil { tracer.OnSaveCacheFinished() return err } @@ -1174,158 +1172,7 @@ func (c *RaftCluster) putMetaLocked(meta *metapb.Cluster) error { // GetBasicCluster returns the basic cluster. func (c *RaftCluster) GetBasicCluster() *core.BasicCluster { - return c.core -} - -// GetRegionByKey gets regionInfo by region key from cluster. 
-func (c *RaftCluster) GetRegionByKey(regionKey []byte) *core.RegionInfo { - return c.core.GetRegionByKey(regionKey) -} - -// GetPrevRegionByKey gets previous region and leader peer by the region key from cluster. -func (c *RaftCluster) GetPrevRegionByKey(regionKey []byte) *core.RegionInfo { - return c.core.GetPrevRegionByKey(regionKey) -} - -// ScanRegions scans region with start key, until the region contains endKey, or -// total number greater than limit. -func (c *RaftCluster) ScanRegions(startKey, endKey []byte, limit int) []*core.RegionInfo { - return c.core.ScanRegions(startKey, endKey, limit) -} - -// GetRegion searches for a region by ID. -func (c *RaftCluster) GetRegion(regionID uint64) *core.RegionInfo { - return c.core.GetRegion(regionID) -} - -// GetMetaRegions gets regions from cluster. -func (c *RaftCluster) GetMetaRegions() []*metapb.Region { - return c.core.GetMetaRegions() -} - -// GetRegions returns all regions' information in detail. -func (c *RaftCluster) GetRegions() []*core.RegionInfo { - return c.core.GetRegions() -} - -// ValidRegion is used to decide if the region is valid. -func (c *RaftCluster) ValidRegion(region *metapb.Region) error { - return c.core.ValidRegion(region) -} - -// GetTotalRegionCount returns total count of regions -func (c *RaftCluster) GetTotalRegionCount() int { - return c.core.GetTotalRegionCount() -} - -// GetStoreRegions returns all regions' information with a given storeID. -func (c *RaftCluster) GetStoreRegions(storeID uint64) []*core.RegionInfo { - return c.core.GetStoreRegions(storeID) -} - -// GetStoreRegions returns all regions' information with a given storeID. -func (c *RaftCluster) GetStoreRegionsByType(storeID uint64) []*core.RegionInfo { - return c.core.GetStoreRegions(storeID) -} - -// RandLeaderRegions returns some random regions that has leader on the store. -func (c *RaftCluster) RandLeaderRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandLeaderRegions(storeID, ranges) -} - -// RandFollowerRegions returns some random regions that has a follower on the store. -func (c *RaftCluster) RandFollowerRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandFollowerRegions(storeID, ranges) -} - -// RandPendingRegions returns some random regions that has a pending peer on the store. -func (c *RaftCluster) RandPendingRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandPendingRegions(storeID, ranges) -} - -// RandLearnerRegions returns some random regions that has a learner peer on the store. -func (c *RaftCluster) RandLearnerRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandLearnerRegions(storeID, ranges) -} - -// RandWitnessRegions returns some random regions that has a witness peer on the store. -func (c *RaftCluster) RandWitnessRegions(storeID uint64, ranges []core.KeyRange) []*core.RegionInfo { - return c.core.RandWitnessRegions(storeID, ranges) -} - -// GetLeaderStore returns all stores that contains the region's leader peer. -func (c *RaftCluster) GetLeaderStore(region *core.RegionInfo) *core.StoreInfo { - return c.core.GetLeaderStore(region) -} - -// GetNonWitnessVoterStores returns all stores that contains the region's non-witness voter peer. -func (c *RaftCluster) GetNonWitnessVoterStores(region *core.RegionInfo) []*core.StoreInfo { - return c.core.GetNonWitnessVoterStores(region) -} - -// GetFollowerStores returns all stores that contains the region's follower peer. 
-func (c *RaftCluster) GetFollowerStores(region *core.RegionInfo) []*core.StoreInfo { - return c.core.GetFollowerStores(region) -} - -// GetRegionStores returns all stores that contains the region's peer. -func (c *RaftCluster) GetRegionStores(region *core.RegionInfo) []*core.StoreInfo { - return c.core.GetRegionStores(region) -} - -// GetStoreCount returns the count of stores. -func (c *RaftCluster) GetStoreCount() int { - return c.core.GetStoreCount() -} - -// GetStoreRegionCount returns the number of regions for a given store. -func (c *RaftCluster) GetStoreRegionCount(storeID uint64) int { - return c.core.GetStoreRegionCount(storeID) -} - -// GetAverageRegionSize returns the average region approximate size. -func (c *RaftCluster) GetAverageRegionSize() int64 { - return c.core.GetAverageRegionSize() -} - -// DropCacheRegion removes a region from the cache. -func (c *RaftCluster) DropCacheRegion(id uint64) { - c.core.RemoveRegionIfExist(id) -} - -// DropCacheAllRegion removes all regions from the cache. -func (c *RaftCluster) DropCacheAllRegion() { - c.core.ResetRegionCache() -} - -// GetMetaStores gets stores from cluster. -func (c *RaftCluster) GetMetaStores() []*metapb.Store { - return c.core.GetMetaStores() -} - -// GetStores returns all stores in the cluster. -func (c *RaftCluster) GetStores() []*core.StoreInfo { - return c.core.GetStores() -} - -// GetLeaderStoreByRegionID returns the leader store of the given region. -func (c *RaftCluster) GetLeaderStoreByRegionID(regionID uint64) *core.StoreInfo { - return c.core.GetLeaderStoreByRegionID(regionID) -} - -// GetStore gets store from cluster. -func (c *RaftCluster) GetStore(storeID uint64) *core.StoreInfo { - return c.core.GetStore(storeID) -} - -// GetAdjacentRegions returns regions' information that are adjacent with the specific region ID. -func (c *RaftCluster) GetAdjacentRegions(region *core.RegionInfo) (*core.RegionInfo, *core.RegionInfo) { - return c.core.GetAdjacentRegions(region) -} - -// GetRangeHoles returns all range holes, i.e the key ranges without any region info. -func (c *RaftCluster) GetRangeHoles() [][]string { - return c.core.GetRangeHoles() + return c.BasicCluster } // UpdateStoreLabels updates a store's location labels @@ -1361,8 +1208,8 @@ func (c *RaftCluster) DeleteStoreLabel(storeID uint64, labelKey string) error { return c.putStoreImpl(newStore, true) } -// PutStore puts a store. -func (c *RaftCluster) PutStore(store *metapb.Store) error { +// PutMetaStore puts a store. +func (c *RaftCluster) PutMetaStore(store *metapb.Store) error { if err := c.putStoreImpl(store, false); err != nil { return err } @@ -1375,9 +1222,6 @@ func (c *RaftCluster) PutStore(store *metapb.Store) error { // If 'force' is true, the store's labels will overwrite those labels which already existed in the store. // If 'force' is false, the store's labels will merge into those labels which already existed in the store. func (c *RaftCluster) putStoreImpl(store *metapb.Store, force bool) error { - c.Lock() - defer c.Unlock() - if store.GetId() == 0 { return errors.Errorf("invalid put store %v", store) } @@ -1419,7 +1263,7 @@ func (c *RaftCluster) putStoreImpl(store *metapb.Store, force bool) error { if err := c.checkStoreLabels(s); err != nil { return err } - return c.putStoreLocked(s) + return c.setStore(s) } func (c *RaftCluster) checkStoreVersion(store *metapb.Store) error { @@ -1464,9 +1308,6 @@ func (c *RaftCluster) checkStoreLabels(s *core.StoreInfo) error { // RemoveStore marks a store as offline in cluster. 
// State transition: Up -> Offline. func (c *RaftCluster) RemoveStore(storeID uint64, physicallyDestroyed bool) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -1491,9 +1332,9 @@ func (c *RaftCluster) RemoveStore(storeID uint64, physicallyDestroyed bool) erro zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress()), zap.Bool("physically-destroyed", newStore.IsPhysicallyDestroyed())) - err := c.putStoreLocked(newStore) + err := c.setStore(newStore) if err == nil { - regionSize := float64(c.core.GetStoreRegionSize(storeID)) + regionSize := float64(c.GetStoreRegionSize(storeID)) c.resetProgress(storeID, store.GetAddress()) c.progressManager.AddProgress(encodeRemovingProgressKey(storeID), regionSize, regionSize, nodeStateCheckJobInterval, progress.WindowDurationOption(c.GetCoordinator().GetPatrolRegionsDuration())) // record the current store limit in memory @@ -1556,9 +1397,6 @@ func (c *RaftCluster) getUpStores() []uint64 { // BuryStore marks a store as tombstone in cluster. // If forceBury is false, the store should be offlined and emptied before calling this func. func (c *RaftCluster) BuryStore(storeID uint64, forceBury bool) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -1583,8 +1421,8 @@ func (c *RaftCluster) BuryStore(storeID uint64, forceBury bool) error { zap.String("store-address", newStore.GetAddress()), zap.String("state", store.GetState().String()), zap.Bool("physically-destroyed", store.IsPhysicallyDestroyed())) - err := c.putStoreLocked(newStore) - c.onStoreVersionChangeLocked() + err := c.setStore(newStore) + c.OnStoreVersionChange() if err == nil { // clean up the residual information. delete(c.prevStoreLimit, storeID) @@ -1600,40 +1438,6 @@ func (c *RaftCluster) BuryStore(storeID uint64, forceBury bool) error { return err } -// PauseLeaderTransfer prevents the store from been selected as source or -// target store of TransferLeader. -func (c *RaftCluster) PauseLeaderTransfer(storeID uint64) error { - return c.core.PauseLeaderTransfer(storeID) -} - -// ResumeLeaderTransfer cleans a store's pause state. The store can be selected -// as source or target of TransferLeader again. -func (c *RaftCluster) ResumeLeaderTransfer(storeID uint64) { - c.core.ResumeLeaderTransfer(storeID) -} - -// SlowStoreEvicted marks a store as a slow store and prevents transferring -// leader to the store -func (c *RaftCluster) SlowStoreEvicted(storeID uint64) error { - return c.core.SlowStoreEvicted(storeID) -} - -// SlowTrendEvicted marks a store as a slow store by trend and prevents transferring -// leader to the store -func (c *RaftCluster) SlowTrendEvicted(storeID uint64) error { - return c.core.SlowTrendEvicted(storeID) -} - -// SlowTrendRecovered cleans the evicted by slow trend state of a store. -func (c *RaftCluster) SlowTrendRecovered(storeID uint64) { - c.core.SlowTrendRecovered(storeID) -} - -// SlowStoreRecovered cleans the evicted state of a store. -func (c *RaftCluster) SlowStoreRecovered(storeID uint64) { - c.core.SlowStoreRecovered(storeID) -} - // NeedAwakenAllRegionsInStore checks whether we should do AwakenRegions operation. 
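All of the methods removed in the hunks above were one-line forwarders to c.core; with core.BasicCluster embedded in RaftCluster (note the `return c.BasicCluster` hunk earlier), the same methods are promoted onto RaftCluster automatically and the hand-written wrappers become dead weight. A dependency-free toy sketch of that promotion, using stand-in types rather than pd's real ones:

package main

import "fmt"

// BasicCluster stands in for core.BasicCluster: it owns the store bookkeeping.
type BasicCluster struct {
	stores map[uint64]string
}

func (b *BasicCluster) GetStore(id uint64) string { return b.stores[id] }
func (b *BasicCluster) GetStoreCount() int        { return len(b.stores) }

// RaftCluster embeds *BasicCluster, so GetStore and GetStoreCount are promoted
// onto RaftCluster and the old one-line delegation methods are no longer needed.
type RaftCluster struct {
	*BasicCluster
}

func main() {
	rc := &RaftCluster{BasicCluster: &BasicCluster{stores: map[uint64]string{1: "tikv-1"}}}
	fmt.Println(rc.GetStore(1), rc.GetStoreCount()) // promoted methods, no wrappers required
}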
func (c *RaftCluster) NeedAwakenAllRegionsInStore(storeID uint64) (needAwaken bool, slowStoreIDs []uint64) { store := c.GetStore(storeID) @@ -1665,9 +1469,6 @@ func (c *RaftCluster) NeedAwakenAllRegionsInStore(storeID uint64) (needAwaken bo // UpStore up a store from offline func (c *RaftCluster) UpStore(storeID uint64) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -1698,7 +1499,7 @@ func (c *RaftCluster) UpStore(storeID uint64) error { log.Warn("store has been up", zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress())) - err := c.putStoreLocked(newStore) + err := c.setStore(newStore) if err == nil { if exist { // persist the store limit @@ -1712,9 +1513,6 @@ func (c *RaftCluster) UpStore(storeID uint64) error { // ReadyToServe change store's node state to Serving. func (c *RaftCluster) ReadyToServe(storeID uint64) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) @@ -1736,7 +1534,7 @@ func (c *RaftCluster) ReadyToServe(storeID uint64) error { log.Info("store has changed to serving", zap.Uint64("store-id", storeID), zap.String("store-address", newStore.GetAddress())) - err := c.putStoreLocked(newStore) + err := c.setStore(newStore) if err == nil { c.resetProgress(storeID, store.GetAddress()) } @@ -1759,16 +1557,16 @@ func (c *RaftCluster) SetStoreWeight(storeID uint64, leaderWeight, regionWeight core.SetRegionWeight(regionWeight), ) - return c.putStoreLocked(newStore) + return c.setStore(newStore) } -func (c *RaftCluster) putStoreLocked(store *core.StoreInfo) error { +func (c *RaftCluster) setStore(store *core.StoreInfo) error { if c.storage != nil { if err := c.storage.SaveStoreMeta(store.GetMeta()); err != nil { return err } } - c.core.PutStore(store) + c.PutStore(store) if !c.IsServiceIndependent(mcsutils.SchedulingServiceName) { c.updateStoreStatistics(store.GetID(), store.IsSlow()) } @@ -1834,11 +1632,11 @@ func (c *RaftCluster) checkStores() { offlineStore := store.GetMeta() id := offlineStore.GetId() - regionSize := c.core.GetStoreRegionSize(id) + regionSize := c.GetStoreRegionSize(id) if c.IsPrepared() { c.updateProgress(id, store.GetAddress(), removingAction, float64(regionSize), float64(regionSize), false /* dec */) } - regionCount := c.core.GetStoreRegionCount(id) + regionCount := c.GetStoreRegionCount(id) // If the store is empty, it can be buried. 
if regionCount == 0 { if err := c.BuryStore(id, false); err != nil { @@ -1866,7 +1664,7 @@ func (c *RaftCluster) checkStores() { func (c *RaftCluster) getThreshold(stores []*core.StoreInfo, store *core.StoreInfo) float64 { start := time.Now() if !c.opt.IsPlacementRulesEnabled() { - regionSize := c.core.GetRegionSizeByRange([]byte(""), []byte("")) * int64(c.opt.GetMaxReplicas()) + regionSize := c.GetRegionSizeByRange([]byte(""), []byte("")) * int64(c.opt.GetMaxReplicas()) weight := getStoreTopoWeight(store, stores, c.opt.GetLocationLabels(), c.opt.GetMaxReplicas()) return float64(regionSize) * weight * 0.9 } @@ -1906,7 +1704,7 @@ func (c *RaftCluster) calculateRange(stores []*core.StoreInfo, store *core.Store matchStores = append(matchStores, s) } } - regionSize := c.core.GetRegionSizeByRange(startKey, endKey) * int64(rule.Count) + regionSize := c.GetRegionSizeByRange(startKey, endKey) * int64(rule.Count) weight := getStoreTopoWeight(store, matchStores, rule.LocationLabels, rule.Count) storeSize += float64(regionSize) * weight log.Debug("calculate range result", @@ -2072,13 +1870,10 @@ func encodePreparingProgressKey(storeID uint64) string { // RemoveTombStoneRecords removes the tombStone Records. func (c *RaftCluster) RemoveTombStoneRecords() error { - c.Lock() - defer c.Unlock() - var failedStores []uint64 for _, store := range c.GetStores() { if store.IsRemoved() { - if c.core.GetStoreRegionCount(store.GetID()) > 0 { + if c.GetStoreRegionCount(store.GetID()) > 0 { log.Warn("skip removing tombstone", zap.Stringer("store", store.GetMeta())) failedStores = append(failedStores, store.GetID()) continue @@ -2116,7 +1911,7 @@ func (c *RaftCluster) deleteStore(store *core.StoreInfo) error { return err } } - c.core.DeleteStore(store) + c.DeleteStore(store) return nil } @@ -2157,12 +1952,6 @@ func (c *RaftCluster) resetProgressIndicator() { // OnStoreVersionChange changes the version of the cluster when needed. func (c *RaftCluster) OnStoreVersionChange() { - c.RLock() - defer c.RUnlock() - c.onStoreVersionChangeLocked() -} - -func (c *RaftCluster) onStoreVersionChangeLocked() { var minVersion *semver.Version stores := c.GetStores() for _, s := range stores { @@ -2220,13 +2009,13 @@ func (c *RaftCluster) PutMetaCluster(meta *metapb.Cluster) error { // GetRegionStatsByRange returns region statistics from cluster. func (c *RaftCluster) GetRegionStatsByRange(startKey, endKey []byte) *statistics.RegionStats { - return statistics.GetRegionStats(c.core.ScanRegions(startKey, endKey, -1)) + return statistics.GetRegionStats(c.ScanRegions(startKey, endKey, -1)) } -// GetRegionCount returns the number of regions in the range. -func (c *RaftCluster) GetRegionCount(startKey, endKey []byte) *statistics.RegionStats { +// GetRegionStatsCount returns the number of regions in the range. 
+func (c *RaftCluster) GetRegionStatsCount(startKey, endKey []byte) *statistics.RegionStats { stats := &statistics.RegionStats{} - stats.Count = c.core.GetRegionCount(startKey, endKey) + stats.Count = c.GetRegionCount(startKey, endKey) return stats } @@ -2238,7 +2027,7 @@ func (c *RaftCluster) putRegion(region *core.RegionInfo) error { return err } } - c.core.PutRegion(region) + c.PutRegion(region) return nil } @@ -2293,7 +2082,7 @@ func (c *RaftCluster) AddStoreLimit(store *metapb.Store) { func (c *RaftCluster) RemoveStoreLimit(storeID uint64) { cfg := c.opt.GetScheduleConfig().Clone() for _, limitType := range storelimit.TypeNameValue { - c.core.ResetStoreLimit(storeID, limitType) + c.ResetStoreLimit(storeID, limitType) } delete(cfg.StoreLimit, storeID) c.opt.SetScheduleConfig(cfg) @@ -2313,16 +2102,13 @@ func (c *RaftCluster) RemoveStoreLimit(storeID uint64) { // SetMinResolvedTS sets up a store with min resolved ts. func (c *RaftCluster) SetMinResolvedTS(storeID, minResolvedTS uint64) error { - c.Lock() - defer c.Unlock() - store := c.GetStore(storeID) if store == nil { return errs.ErrStoreNotFound.FastGenByArgs(storeID) } newStore := store.Clone(core.SetMinResolvedTS(minResolvedTS)) - c.core.PutStore(newStore) + c.PutStore(newStore) return nil } diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index 945e354bb6c..ee7c477476b 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -33,6 +33,7 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/stretchr/testify/require" + "github.com/tikv/pd/pkg/cluster" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/core/constant" "github.com/tikv/pd/pkg/core/storelimit" @@ -92,7 +93,7 @@ func TestStoreHeartbeat(t *testing.T) { } re.Error(cluster.HandleStoreHeartbeat(req, resp)) - re.NoError(cluster.putStoreLocked(store)) + re.NoError(cluster.setStore(store)) re.Equal(i+1, cluster.GetStoreCount()) re.Equal(int64(0), store.GetLastHeartbeatTS().UnixNano()) @@ -214,7 +215,7 @@ func TestFilterUnhealthyStore(t *testing.T) { Available: 50, RegionCount: 1, } - re.NoError(cluster.putStoreLocked(store)) + re.NoError(cluster.setStore(store)) re.NoError(cluster.HandleStoreHeartbeat(req, resp)) re.NotNil(cluster.hotStat.GetRollingStoreStats(store.GetID())) } @@ -227,7 +228,7 @@ func TestFilterUnhealthyStore(t *testing.T) { RegionCount: 1, } newStore := store.Clone(core.SetStoreState(metapb.StoreState_Tombstone)) - re.NoError(cluster.putStoreLocked(newStore)) + re.NoError(cluster.setStore(newStore)) re.NoError(cluster.HandleStoreHeartbeat(req, resp)) re.Nil(cluster.hotStat.GetRollingStoreStats(store.GetID())) } @@ -252,7 +253,7 @@ func TestSetOfflineStore(t *testing.T) { // Put 6 stores. for _, store := range newTestStores(6, "2.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } // store 1: up -> offline @@ -294,7 +295,7 @@ func TestSetOfflineStore(t *testing.T) { // test clean up tombstone store toCleanStore := cluster.GetStore(1).Clone().GetMeta() toCleanStore.LastHeartbeat = time.Now().Add(-40 * 24 * time.Hour).UnixNano() - cluster.PutStore(toCleanStore) + cluster.PutMetaStore(toCleanStore) cluster.checkStores() re.Nil(cluster.GetStore(1)) } @@ -311,7 +312,7 @@ func TestSetOfflineWithReplica(t *testing.T) { // Put 4 stores. 
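One consequence of relying on promoted methods: a RaftCluster method with the same name as a BasicCluster method shadows the promoted one for every caller, which is presumably why the stats helper above was renamed from GetRegionCount to GetRegionStatsCount instead of keeping the old name. A toy illustration of that shadowing rule (stand-in types, not pd's):

package main

import "fmt"

type BasicCluster struct{ regions int }

// GetRegionCount is the promoted, low-level counter.
func (b *BasicCluster) GetRegionCount() int { return b.regions }

type RaftCluster struct {
	*BasicCluster
}

// GetRegionStatsCount keeps a distinct name: if it were also called GetRegionCount,
// it would shadow the promoted method above for every caller of RaftCluster.
func (c *RaftCluster) GetRegionStatsCount() string {
	return fmt.Sprintf("count=%d", c.GetRegionCount())
}

func main() {
	rc := &RaftCluster{BasicCluster: &BasicCluster{regions: 2}}
	fmt.Println(rc.GetRegionCount())      // promoted from BasicCluster
	fmt.Println(rc.GetRegionStatsCount()) // RaftCluster's own helper under a distinct name
}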
for _, store := range newTestStores(4, "2.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.NoError(cluster.RemoveStore(2, false)) @@ -350,7 +351,7 @@ func TestSetOfflineStoreWithEvictLeader(t *testing.T) { // Put 3 stores. for _, store := range newTestStores(3, "2.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } _, err = addEvictLeaderScheduler(cluster, 1) @@ -377,7 +378,7 @@ func TestForceBuryStore(t *testing.T) { stores := newTestStores(2, "5.3.0") stores[1] = stores[1].Clone(core.SetLastHeartbeatTS(time.Now())) for _, store := range stores { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.NoError(cluster.BuryStore(uint64(1), true)) re.Error(cluster.BuryStore(uint64(2), true)) @@ -395,7 +396,7 @@ func TestReuseAddress(t *testing.T) { cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) // Put 4 stores. for _, store := range newTestStores(4, "2.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } // store 1: up // store 2: offline @@ -419,9 +420,9 @@ func TestReuseAddress(t *testing.T) { if storeInfo.IsPhysicallyDestroyed() || storeInfo.IsRemoved() { // try to start a new store with the same address with store which is physically destroyed or tombstone should be success - re.NoError(cluster.PutStore(newStore)) + re.NoError(cluster.PutMetaStore(newStore)) } else { - re.Error(cluster.PutStore(newStore)) + re.Error(cluster.PutMetaStore(newStore)) } } } @@ -449,7 +450,7 @@ func TestUpStore(t *testing.T) { // Put 5 stores. for _, store := range newTestStores(5, "5.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } // set store 1 offline @@ -489,7 +490,7 @@ func TestRemovingProcess(t *testing.T) { // Put 5 stores. stores := newTestStores(5, "5.0.0") for _, store := range stores { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } regions := newTestRegions(100, 5, 1) var regionInStore1 []*core.RegionInfo @@ -517,7 +518,7 @@ func TestRemovingProcess(t *testing.T) { if i >= 5 { break } - cluster.DropCacheRegion(region.GetID()) + cluster.RemoveRegionIfExist(region.GetID()) i++ } cluster.checkStores() @@ -552,13 +553,13 @@ func TestDeleteStoreUpdatesClusterVersion(t *testing.T) { // Put 3 new 4.0.9 stores. for _, store := range newTestStores(3, "4.0.9") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.Equal("4.0.9", cluster.GetClusterVersion()) // Upgrade 2 stores to 5.0.0. 
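This test and TestStoreClusterVersion below both rely on the rule that the cluster version tracks the minimum version across all stores (OnStoreVersionChange above computes it with semver). A dependency-free toy model of that rule, which ignores pre-release and build metadata and only handles plain "major.minor.patch" strings:

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parse turns "major.minor.patch" into a comparable [3]int. Malformed parts
// fall back to 0; this is only a toy model of the version rule, not pd's code.
func parse(v string) [3]int {
	var out [3]int
	for i, p := range strings.SplitN(v, ".", 3) {
		n, _ := strconv.Atoi(p)
		out[i] = n
	}
	return out
}

func less(a, b [3]int) bool {
	for i := 0; i < 3; i++ {
		if a[i] != b[i] {
			return a[i] < b[i]
		}
	}
	return false
}

// minClusterVersion returns the smallest store version, i.e. the cluster version.
func minClusterVersion(storeVersions []string) string {
	min := storeVersions[0]
	for _, v := range storeVersions[1:] {
		if less(parse(v), parse(min)) {
			min = v
		}
	}
	return min
}

func main() {
	// Mirrors TestStoreClusterVersion below: 5.0.1 is the minimum, so it wins.
	fmt.Println(minClusterVersion([]string{"5.0.3", "5.0.1", "5.0.5"})) // 5.0.1
}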
for _, store := range newTestStores(2, "5.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.Equal("4.0.9", cluster.GetClusterVersion()) @@ -581,14 +582,14 @@ func TestStoreClusterVersion(t *testing.T) { s1.Version = "5.0.1" s2.Version = "5.0.3" s3.Version = "5.0.5" - re.NoError(cluster.PutStore(s2)) + re.NoError(cluster.PutMetaStore(s2)) re.Equal(s2.Version, cluster.GetClusterVersion()) - re.NoError(cluster.PutStore(s1)) + re.NoError(cluster.PutMetaStore(s1)) // the cluster version should be 5.0.1(the min one) re.Equal(s1.Version, cluster.GetClusterVersion()) - re.NoError(cluster.PutStore(s3)) + re.NoError(cluster.PutMetaStore(s3)) // the cluster version should be 5.0.1(the min one) re.Equal(s1.Version, cluster.GetClusterVersion()) } @@ -678,7 +679,7 @@ func TestBucketHeartbeat(t *testing.T) { n, np := uint64(2), uint64(2) regions := newTestRegions(n, n, np) for _, store := range stores { - re.NoError(cluster.putStoreLocked(store)) + re.NoError(cluster.setStore(store)) } re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), regions[0])) @@ -728,31 +729,31 @@ func TestRegionHeartbeat(t *testing.T) { regions := newTestRegions(n, n, np) for _, store := range stores { - re.NoError(cluster.putStoreLocked(store)) + re.NoError(cluster.setStore(store)) } for i, region := range regions { // region does not exist. re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is the same, not updated. re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) origin := region // region is updated. region = origin.Clone(core.WithIncVersion()) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is stale (Version). stale := origin.Clone(core.WithIncConfVer()) re.Error(cluster.processRegionHeartbeat(core.ContextTODO(), stale)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is updated @@ -762,13 +763,13 @@ func TestRegionHeartbeat(t *testing.T) { ) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // region is stale (ConfVer). stale = origin.Clone(core.WithIncConfVer()) re.Error(cluster.processRegionHeartbeat(core.ContextTODO(), stale)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // Add a down peer. @@ -780,38 +781,38 @@ func TestRegionHeartbeat(t *testing.T) { })) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Add a pending peer. 
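The long sequence of region updates in TestRegionHeartbeat follows the clone-with-options pattern: RegionInfo values are never mutated in place, every change produces a fresh copy via Clone plus functional options such as WithIncVersion or SetApproximateSize. A toy sketch of that pattern with stand-in types:

package main

import "fmt"

// Region is a toy stand-in for core.RegionInfo: values are treated as immutable
// and every change goes through Clone with functional options.
type Region struct {
	version uint64
	size    int64
}

type RegionOption func(*Region)

func WithIncVersion() RegionOption            { return func(r *Region) { r.version++ } }
func SetApproximateSize(s int64) RegionOption { return func(r *Region) { r.size = s } }

// Clone copies the region and applies the options to the copy only.
func (r *Region) Clone(opts ...RegionOption) *Region {
	c := *r
	for _, opt := range opts {
		opt(&c)
	}
	return &c
}

func main() {
	origin := &Region{version: 1, size: 10}
	updated := origin.Clone(WithIncVersion(), SetApproximateSize(144))
	fmt.Println(origin.version, origin.size)   // 1 10 (the original is unchanged)
	fmt.Println(updated.version, updated.size) // 2 144
}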
region = region.Clone(core.WithPendingPeers([]*metapb.Peer{region.GetPeers()[rand.Intn(len(region.GetPeers()))]})) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Clear down peers. region = region.Clone(core.WithDownPeers(nil)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Clear pending peers. region = region.Clone(core.WithPendingPeers(nil)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Remove peers. origin = region region = origin.Clone(core.SetPeers(region.GetPeers()[:1])) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // Add peers. region = origin regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) checkRegionsKV(re, cluster.storage, regions[:i+1]) // Change one peer to witness @@ -821,47 +822,47 @@ func TestRegionHeartbeat(t *testing.T) { ) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change leader. region = region.Clone(core.WithLeader(region.GetPeers()[1])) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change ApproximateSize. region = region.Clone(core.SetApproximateSize(144)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change ApproximateKeys. region = region.Clone(core.SetApproximateKeys(144000)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change bytes written. region = region.Clone(core.SetWrittenBytes(24000)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Change bytes read. 
region = region.Clone(core.SetReadBytes(1080000)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) // Flashback region = region.Clone(core.WithFlashback(true, 1)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) region = region.Clone(core.WithFlashback(false, 0)) regions[i] = region re.NoError(cluster.processRegionHeartbeat(core.ContextTODO(), region)) - checkRegions(re, cluster.core, regions[:i+1]) + checkRegions(re, cluster.BasicCluster, regions[:i+1]) } regionCounts := make(map[uint64]int) @@ -893,10 +894,10 @@ func TestRegionHeartbeat(t *testing.T) { time.Sleep(50 * time.Millisecond) for _, store := range cluster.GetStores() { - re.Equal(cluster.core.GetStoreLeaderCount(store.GetID()), store.GetLeaderCount()) - re.Equal(cluster.core.GetStoreRegionCount(store.GetID()), store.GetRegionCount()) - re.Equal(cluster.core.GetStoreLeaderRegionSize(store.GetID()), store.GetLeaderSize()) - re.Equal(cluster.core.GetStoreRegionSize(store.GetID()), store.GetRegionSize()) + re.Equal(cluster.GetStoreLeaderCount(store.GetID()), store.GetLeaderCount()) + re.Equal(cluster.GetStoreRegionCount(store.GetID()), store.GetRegionCount()) + re.Equal(cluster.GetStoreLeaderRegionSize(store.GetID()), store.GetLeaderSize()) + re.Equal(cluster.GetStoreRegionSize(store.GetID()), store.GetRegionSize()) } // Test with storage. @@ -1132,7 +1133,7 @@ func TestRegionLabelIsolationLevel(t *testing.T) { State: metapb.StoreState_Up, Labels: labels, } - re.NoError(cluster.putStoreLocked(core.NewStoreInfo(store))) + re.NoError(cluster.setStore(core.NewStoreInfo(store))) } peers := make([]*metapb.Peer, 0, 4) @@ -1295,7 +1296,7 @@ func TestOfflineAndMerge(t *testing.T) { // Put 4 stores. for _, store := range newTestStores(4, "5.0.0") { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } peers := []*metapb.Peer{ @@ -1350,7 +1351,7 @@ func TestStoreConfigUpdate(t *testing.T) { tc := newTestCluster(ctx, opt) stores := newTestStores(5, "2.0.0") for _, s := range stores { - re.NoError(tc.putStoreLocked(s)) + re.NoError(tc.setStore(s)) } re.Len(tc.getUpStores(), 5) // Case1: big region. @@ -1435,7 +1436,7 @@ func TestSyncConfigContext(t *testing.T) { })) stores := newTestStores(1, "2.0.0") for _, s := range stores { - re.NoError(tc.putStoreLocked(s)) + re.NoError(tc.setStore(s)) } // trip schema header now := time.Now() @@ -1457,7 +1458,7 @@ func TestStoreConfigSync(t *testing.T) { tc := newTestCluster(ctx, opt) stores := newTestStores(5, "2.0.0") for _, s := range stores { - re.NoError(tc.putStoreLocked(s)) + re.NoError(tc.setStore(s)) } re.Len(tc.getUpStores(), 5) @@ -1502,7 +1503,7 @@ func TestUpdateStorePendingPeerCount(t *testing.T) { tc.RaftCluster.coordinator = schedule.NewCoordinator(ctx, tc.RaftCluster, nil) stores := newTestStores(5, "2.0.0") for _, s := range stores { - re.NoError(tc.putStoreLocked(s)) + re.NoError(tc.setStore(s)) } tc.RaftCluster.wg.Add(1) go tc.RaftCluster.runUpdateStoreStats() @@ -1677,7 +1678,7 @@ func TestCalculateStoreSize1(t *testing.T) { }, }...) 
s := store.Clone(core.SetStoreLabels(labels)) - re.NoError(cluster.PutStore(s.GetMeta())) + re.NoError(cluster.PutMetaStore(s.GetMeta())) } cluster.ruleManager.SetRule( @@ -1761,7 +1762,7 @@ func TestCalculateStoreSize2(t *testing.T) { } labels = append(labels, []*metapb.StoreLabel{{Key: "rack", Value: "r1"}, {Key: "host", Value: "h1"}}...) s := store.Clone(core.SetStoreLabels(labels)) - re.NoError(cluster.PutStore(s.GetMeta())) + re.NoError(cluster.PutMetaStore(s.GetMeta())) } cluster.ruleManager.SetRule( @@ -1811,7 +1812,7 @@ func TestStores(t *testing.T) { id := store.GetID() re.Nil(cache.GetStore(id)) re.Error(cache.PauseLeaderTransfer(id)) - cache.SetStore(store) + cache.PutStore(store) re.Equal(store, cache.GetStore(id)) re.Equal(i+1, cache.GetStoreCount()) re.NoError(cache.PauseLeaderTransfer(id)) @@ -1842,7 +1843,7 @@ func Test(t *testing.T) { _, opts, err := newTestScheduleConfig() re.NoError(err) tc := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opts, storage.NewStorageWithMemoryBackend()) - cache := tc.core + cache := tc.BasicCluster for i := uint64(0); i < n; i++ { region := regions[i] @@ -1960,7 +1961,7 @@ func TestAwakenStore(t *testing.T) { stores := newTestStores(n, "6.5.0") re.True(stores[0].NeedAwakenStore()) for _, store := range stores { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } for i := uint64(1); i <= n; i++ { re.False(cluster.slowStat.ExistsSlowStores()) @@ -1970,7 +1971,7 @@ func TestAwakenStore(t *testing.T) { now := time.Now() store4 := stores[0].Clone(core.SetLastHeartbeatTS(now), core.SetLastAwakenTime(now.Add(-11*time.Minute))) - re.NoError(cluster.putStoreLocked(store4)) + re.NoError(cluster.setStore(store4)) store1 := cluster.GetStore(1) re.True(store1.NeedAwakenStore()) @@ -2012,7 +2013,7 @@ func TestUpdateAndDeleteLabel(t *testing.T) { cluster := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend()) stores := newTestStores(1, "6.5.1") for _, store := range stores { - re.NoError(cluster.PutStore(store.GetMeta())) + re.NoError(cluster.PutMetaStore(store.GetMeta())) } re.Empty(cluster.GetStore(1).GetLabels()) // Update label. @@ -2104,7 +2105,7 @@ func TestUpdateAndDeleteLabel(t *testing.T) { newStore := typeutil.DeepClone(cluster.GetStore(1).GetMeta(), core.StoreFactory) newStore.Labels = nil // Store rebooting will call PutStore. - err = cluster.PutStore(newStore) + err = cluster.PutMetaStore(newStore) re.NoError(err) // Check the label after rebooting. 
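TestUpdateAndDeleteLabel here exercises the `force` behaviour documented on putStoreImpl earlier in this diff: with force the incoming labels overwrite the existing ones, without it they are merged. A toy, map-based sketch of the two behaviours (pd actually stores labels as []*metapb.StoreLabel, not maps):

package main

import "fmt"

// mergeLabels models the force flag described on putStoreImpl above:
// force=true replaces the existing labels outright, force=false merges the
// incoming labels into the existing ones.
func mergeLabels(existing, incoming map[string]string, force bool) map[string]string {
	if force {
		out := make(map[string]string, len(incoming))
		for k, v := range incoming {
			out[k] = v
		}
		return out
	}
	out := make(map[string]string, len(existing)+len(incoming))
	for k, v := range existing {
		out[k] = v
	}
	for k, v := range incoming {
		out[k] = v
	}
	return out
}

func main() {
	existing := map[string]string{"zone": "zone1"}
	incoming := map[string]string{"mode": "readonly"}
	fmt.Println(mergeLabels(existing, incoming, false)) // map[mode:readonly zone:zone1]
	fmt.Println(mergeLabels(existing, incoming, true))  // map[mode:readonly]
}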
re.Equal([]*metapb.StoreLabel{{Key: "mode", Value: "readonly"}}, cluster.GetStore(1).GetLabels()) @@ -2141,7 +2142,7 @@ func newTestRaftCluster( s storage.Storage, ) *RaftCluster { opt.GetScheduleConfig().EnableHeartbeatConcurrentRunner = false - rc := &RaftCluster{serverCtx: ctx, core: core.NewBasicCluster(), storage: s} + rc := &RaftCluster{serverCtx: ctx, BasicCluster: core.NewBasicCluster(), storage: s} rc.InitCluster(id, opt, nil, nil) rc.ruleManager = placement.NewRuleManager(ctx, storage.NewStorageWithMemoryBackend(), rc, opt) if opt.IsPlacementRulesEnabled() { @@ -2150,7 +2151,7 @@ func newTestRaftCluster( panic(err) } } - rc.schedulingController = newSchedulingController(rc.ctx, rc.core, rc.opt, rc.ruleManager) + rc.schedulingController = newSchedulingController(rc.ctx, rc.BasicCluster, rc.opt, rc.ruleManager) return rc } @@ -2323,7 +2324,7 @@ func (c *testCluster) addRegionStore(storeID uint64, regionCount int, regionSize c.SetStoreLimit(storeID, storelimit.RemovePeer, 60) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) addLeaderRegion(regionID uint64, leaderStoreID uint64, followerStoreIDs ...uint64) error { @@ -2346,7 +2347,7 @@ func (c *testCluster) updateLeaderCount(storeID uint64, leaderCount int) error { ) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) addLeaderStore(storeID uint64, leaderCount int) error { @@ -2362,7 +2363,7 @@ func (c *testCluster) addLeaderStore(storeID uint64, leaderCount int) error { c.SetStoreLimit(storeID, storelimit.RemovePeer, 60) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) setStoreDown(storeID uint64) error { @@ -2373,7 +2374,7 @@ func (c *testCluster) setStoreDown(storeID uint64) error { ) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) setStoreOffline(storeID uint64) error { @@ -2381,7 +2382,7 @@ func (c *testCluster) setStoreOffline(storeID uint64) error { newStore := store.Clone(core.SetStoreState(metapb.StoreState_Offline, false)) c.Lock() defer c.Unlock() - return c.putStoreLocked(newStore) + return c.setStore(newStore) } func (c *testCluster) LoadRegion(regionID uint64, followerStoreIDs ...uint64) error { @@ -2965,7 +2966,7 @@ func TestShouldRun(t *testing.T) { nr := &metapb.Region{Id: 6, Peers: []*metapb.Peer{}} newRegion := core.NewRegionInfo(nr, nil, core.SetSource(core.Heartbeat)) re.Error(tc.processRegionHeartbeat(core.ContextTODO(), newRegion)) - re.Equal(7, tc.core.GetClusterNotFromStorageRegionsCnt()) + re.Equal(7, tc.GetClusterNotFromStorageRegionsCnt()) } func TestShouldRunWithNonLeaderRegions(t *testing.T) { @@ -3008,7 +3009,7 @@ func TestShouldRunWithNonLeaderRegions(t *testing.T) { nr := &metapb.Region{Id: 9, Peers: []*metapb.Peer{}} newRegion := core.NewRegionInfo(nr, nil, core.SetSource(core.Heartbeat)) re.Error(tc.processRegionHeartbeat(core.ContextTODO(), newRegion)) - re.Equal(9, tc.core.GetClusterNotFromStorageRegionsCnt()) + re.Equal(9, tc.GetClusterNotFromStorageRegionsCnt()) // Now, after server is prepared, there exist some regions with no leader. 
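newTestRaftCluster above now fills the embedded field directly (`BasicCluster: core.NewBasicCluster()`) and hands the same pointer to newSchedulingController, so both components operate on one shared state. In a composite literal an embedded field is keyed by its type name; a toy sketch of that construction and sharing:

package main

import "fmt"

type BasicCluster struct{ regions int }

type RaftCluster struct {
	*BasicCluster
	storage string
}

// schedulingController keeps its own reference to the same BasicCluster,
// mirroring newSchedulingController(rc.ctx, rc.BasicCluster, ...) above.
type schedulingController struct {
	basic *BasicCluster
}

func main() {
	// An embedded field is keyed by its type name in a composite literal,
	// which is why the constructor above writes BasicCluster: core.NewBasicCluster().
	rc := &RaftCluster{BasicCluster: &BasicCluster{}, storage: "memory"}
	sc := &schedulingController{basic: rc.BasicCluster}

	// Both components observe the same underlying state.
	rc.regions = 3
	fmt.Println(sc.basic.regions) // 3
}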
re.Equal(uint64(0), tc.GetRegion(10).GetLeader().GetStoreId()) @@ -3730,3 +3731,34 @@ func waitNoResponse(re *require.Assertions, stream mockhbstream.HeartbeatStream) return res == nil }) } + +func BenchmarkHandleStatsAsync(b *testing.B) { + // Setup: create a new instance of Cluster + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + _, opt, _ := newTestScheduleConfig() + c := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend()) + c.coordinator = schedule.NewCoordinator(ctx, c, nil) + c.SetPrepared() + region := core.NewRegionInfo(&metapb.Region{ + Id: 1, + RegionEpoch: &metapb.RegionEpoch{ + ConfVer: 1, + Version: 1, + }, + StartKey: []byte{byte(2)}, + EndKey: []byte{byte(3)}, + Peers: []*metapb.Peer{{Id: 11, StoreId: uint64(1)}}, + }, nil, + core.SetApproximateSize(10), + core.SetReportInterval(0, 10), + ) + + // Reset timer after setup + b.ResetTimer() + // Run HandleStatsAsync b.N times + for i := 0; i < b.N; i++ { + cluster.HandleStatsAsync(c, region) + } +} diff --git a/server/cluster/scheduling_controller.go b/server/cluster/scheduling_controller.go index 20d5a6bceae..ca846eaa885 100644 --- a/server/cluster/scheduling_controller.go +++ b/server/cluster/scheduling_controller.go @@ -195,7 +195,7 @@ func (sc *schedulingController) collectSchedulingMetrics() { // collect hot cache metrics sc.hotStat.CollectMetrics() // collect the lock metrics - sc.RegionsInfo.CollectWaitLockMetrics() + sc.CollectWaitLockMetrics() } func (sc *schedulingController) removeStoreStatistics(storeID uint64) { diff --git a/server/config/OWNERS b/server/config/OWNERS new file mode 100644 index 00000000000..179de4843e6 --- /dev/null +++ b/server/config/OWNERS @@ -0,0 +1,7 @@ +# See the OWNERS docs at https://go.k8s.io/owners +options: + no_parent_owners: true +filters: + "(OWNERS|(config|service_middleware_config)\\.go)$": + approvers: + - sig-critical-approvers-config diff --git a/server/grpc_service.go b/server/grpc_service.go index 2b3ee232686..acfc87fcf71 100644 --- a/server/grpc_service.go +++ b/server/grpc_service.go @@ -826,7 +826,7 @@ func (s *GrpcServer) PutStore(ctx context.Context, request *pdpb.PutStoreRequest }, nil } - if err := rc.PutStore(store); err != nil { + if err := rc.PutMetaStore(store); err != nil { return &pdpb.PutStoreResponse{ Header: s.wrapErrorToHeader(pdpb.ErrorType_UNKNOWN, err.Error()), }, nil diff --git a/server/join/join.go b/server/join/join.go index d1711063313..1319dc08d07 100644 --- a/server/join/join.go +++ b/server/join/join.go @@ -136,7 +136,11 @@ func PrepareJoinCluster(cfg *config.Config) error { existed := false for _, m := range listResp.Members { if len(m.Name) == 0 { - return errors.New("there is a member that has not joined successfully") + log.Error("there is an abnormal joined member in the current member list", + zap.Uint64("id", m.ID), + zap.Strings("peer-urls", m.PeerURLs), + zap.Strings("client-urls", m.ClientURLs)) + return errors.Errorf("there is a member %d that has not joined successfully", m.ID) } if m.Name == cfg.Name { existed = true @@ -184,7 +188,11 @@ func PrepareJoinCluster(cfg *config.Config) error { listSucc = true } if len(n) == 0 { - return errors.New("there is a member that has not joined successfully") + log.Error("there is an abnormal joined member in the current member list", + zap.Uint64("id", memb.ID), + zap.Strings("peer-urls", memb.PeerURLs), + zap.Strings("client-urls", memb.ClientURLs)) + return errors.Errorf("there is a member %d that has not joined successfully", 
memb.ID) } for _, m := range memb.PeerURLs { pds = append(pds, fmt.Sprintf("%s=%s", n, m)) diff --git a/server/server.go b/server/server.go index af9f48f8c9b..1d38a5ee495 100644 --- a/server/server.go +++ b/server/server.go @@ -1555,8 +1555,6 @@ func (s *Server) UpdateGRPCServiceRateLimiter(serviceLabel string, opts ...ratel // GetClusterStatus gets cluster status. func (s *Server) GetClusterStatus() (*cluster.Status, error) { - s.cluster.Lock() - defer s.cluster.Unlock() return s.cluster.LoadClusterStatus() } diff --git a/tests/integrations/client/client_test.go b/tests/integrations/client/client_test.go index dfe7a6980c7..65acd897726 100644 --- a/tests/integrations/client/client_test.go +++ b/tests/integrations/client/client_test.go @@ -40,6 +40,7 @@ import ( "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" pd "github.com/tikv/pd/client" + clierrs "github.com/tikv/pd/client/errs" "github.com/tikv/pd/client/retry" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/errs" @@ -528,7 +529,7 @@ func TestGlobalAndLocalTSO(t *testing.T) { re.NotEmpty(cluster.WaitLeader()) _, _, err = cli.GetTS(ctx) re.Error(err) - re.True(pd.IsLeaderChange(err)) + re.True(clierrs.IsLeaderChange(err)) _, _, err = cli.GetTS(ctx) re.NoError(err) re.NoError(failpoint.Disable("github.com/tikv/pd/client/skipUpdateMember")) diff --git a/tests/integrations/client/http_client_test.go b/tests/integrations/client/http_client_test.go index d35b7f00584..f4a48dcd63e 100644 --- a/tests/integrations/client/http_client_test.go +++ b/tests/integrations/client/http_client_test.go @@ -21,10 +21,12 @@ import ( "net/url" "sort" "strings" + "sync" "testing" "time" "github.com/pingcap/errors" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/prometheus/client_golang/prometheus" dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/require" @@ -41,190 +43,184 @@ import ( "github.com/tikv/pd/tests" ) -type mode int - -// We have two ways to create HTTP client. -// 1. using `NewClient` which created `DefaultPDServiceDiscovery` -// 2. using `NewClientWithServiceDiscovery` which pass a `PDServiceDiscovery` as parameter -// test cases should be run in both modes. -const ( - defaultServiceDiscovery mode = iota - specificServiceDiscovery -) - type httpClientTestSuite struct { suite.Suite - env map[mode]*httpClientTestEnv + // 1. Using `NewClient` will create a `DefaultPDServiceDiscovery` internal. + // 2. Using `NewClientWithServiceDiscovery` will need a `PDServiceDiscovery` to be passed in. 
+ withServiceDiscovery bool + ctx context.Context + cancelFunc context.CancelFunc + cluster *tests.TestCluster + endpoints []string + client pd.Client } -type httpClientTestEnv struct { - ctx context.Context - cancelFunc context.CancelFunc - cluster *tests.TestCluster - endpoints []string +func TestHTTPClientTestSuite(t *testing.T) { + suite.Run(t, &httpClientTestSuite{ + withServiceDiscovery: false, + }) } -func TestHTTPClientTestSuite(t *testing.T) { - suite.Run(t, new(httpClientTestSuite)) +func TestHTTPClientTestSuiteWithServiceDiscovery(t *testing.T) { + suite.Run(t, &httpClientTestSuite{ + withServiceDiscovery: true, + }) } func (suite *httpClientTestSuite) SetupSuite() { - suite.env = make(map[mode]*httpClientTestEnv) re := suite.Require() + suite.ctx, suite.cancelFunc = context.WithCancel(context.Background()) - for _, mode := range []mode{defaultServiceDiscovery, specificServiceDiscovery} { - env := &httpClientTestEnv{} - env.ctx, env.cancelFunc = context.WithCancel(context.Background()) + cluster, err := tests.NewTestCluster(suite.ctx, 2) + re.NoError(err) - cluster, err := tests.NewTestCluster(env.ctx, 2) - re.NoError(err) + err = cluster.RunInitialServers() + re.NoError(err) + leader := cluster.WaitLeader() + re.NotEmpty(leader) + leaderServer := cluster.GetLeaderServer() - err = cluster.RunInitialServers() + err = leaderServer.BootstrapCluster() + // Add 2 more stores to the cluster. + for i := 2; i <= 4; i++ { + tests.MustPutStore(re, cluster, &metapb.Store{ + Id: uint64(i), + State: metapb.StoreState_Up, + NodeState: metapb.NodeState_Serving, + LastHeartbeat: time.Now().UnixNano(), + }) + } + re.NoError(err) + for _, region := range []*core.RegionInfo{ + core.NewTestRegionInfo(10, 1, []byte("a1"), []byte("a2")), + core.NewTestRegionInfo(11, 1, []byte("a2"), []byte("a3")), + } { + err := leaderServer.GetRaftCluster().HandleRegionHeartbeat(region) re.NoError(err) - leader := cluster.WaitLeader() - re.NotEmpty(leader) - leaderServer := cluster.GetLeaderServer() - - err = leaderServer.BootstrapCluster() + } + var ( + testServers = cluster.GetServers() + endpoints = make([]string, 0, len(testServers)) + ) + for _, s := range testServers { + addr := s.GetConfig().AdvertiseClientUrls + url, err := url.Parse(addr) re.NoError(err) - for _, region := range []*core.RegionInfo{ - core.NewTestRegionInfo(10, 1, []byte("a1"), []byte("a2")), - core.NewTestRegionInfo(11, 1, []byte("a2"), []byte("a3")), - } { - err := leaderServer.GetRaftCluster().HandleRegionHeartbeat(region) - re.NoError(err) - } - var ( - testServers = cluster.GetServers() - endpoints = make([]string, 0, len(testServers)) - ) - for _, s := range testServers { - addr := s.GetConfig().AdvertiseClientUrls - url, err := url.Parse(addr) - re.NoError(err) - endpoints = append(endpoints, url.Host) - } - env.endpoints = endpoints - env.cluster = cluster - - suite.env[mode] = env + endpoints = append(endpoints, url.Host) } -} - -func (suite *httpClientTestSuite) TearDownSuite() { - for _, env := range suite.env { - env.cancelFunc() - env.cluster.Destroy() + suite.endpoints = endpoints + suite.cluster = cluster + + if suite.withServiceDiscovery { + // Run test with specific service discovery. + cli := setupCli(suite.ctx, re, suite.endpoints) + sd := cli.GetServiceDiscovery() + suite.client = pd.NewClientWithServiceDiscovery("pd-http-client-it-grpc", sd) + } else { + // Run test with default service discovery. 
+ suite.client = pd.NewClient("pd-http-client-it-http", suite.endpoints) } } -// RunTestInTwoModes is to run test in two modes. -func (suite *httpClientTestSuite) RunTestInTwoModes(test func(mode mode, client pd.Client)) { - // Run test with specific service discovery. - cli := setupCli(suite.env[specificServiceDiscovery].ctx, suite.Require(), suite.env[specificServiceDiscovery].endpoints) - sd := cli.GetServiceDiscovery() - client := pd.NewClientWithServiceDiscovery("pd-http-client-it-grpc", sd) - test(specificServiceDiscovery, client) - client.Close() - - // Run test with default service discovery. - client = pd.NewClient("pd-http-client-it-http", suite.env[defaultServiceDiscovery].endpoints) - test(defaultServiceDiscovery, client) - client.Close() +func (suite *httpClientTestSuite) TearDownSuite() { + suite.cancelFunc() + suite.client.Close() + suite.cluster.Destroy() } func (suite *httpClientTestSuite) TestMeta() { - suite.RunTestInTwoModes(suite.checkMeta) -} - -func (suite *httpClientTestSuite) checkMeta(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - replicateConfig, err := client.GetReplicateConfig(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + replicateConfig, err := client.GetReplicateConfig(ctx) re.NoError(err) re.Equal(3.0, replicateConfig["max-replicas"]) - region, err := client.GetRegionByID(env.ctx, 10) + region, err := client.GetRegionByID(ctx, 10) re.NoError(err) re.Equal(int64(10), region.ID) re.Equal(core.HexRegionKeyStr([]byte("a1")), region.StartKey) re.Equal(core.HexRegionKeyStr([]byte("a2")), region.EndKey) - region, err = client.GetRegionByKey(env.ctx, []byte("a2")) + region, err = client.GetRegionByKey(ctx, []byte("a2")) re.NoError(err) re.Equal(int64(11), region.ID) re.Equal(core.HexRegionKeyStr([]byte("a2")), region.StartKey) re.Equal(core.HexRegionKeyStr([]byte("a3")), region.EndKey) - regions, err := client.GetRegions(env.ctx) + regions, err := client.GetRegions(ctx) re.NoError(err) re.Equal(int64(2), regions.Count) re.Len(regions.Regions, 2) - regions, err = client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), -1) + regions, err = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), -1) re.NoError(err) re.Equal(int64(2), regions.Count) re.Len(regions.Regions, 2) - regions, err = client.GetRegionsByStoreID(env.ctx, 1) + regions, err = client.GetRegionsByStoreID(ctx, 1) re.NoError(err) re.Equal(int64(2), regions.Count) re.Len(regions.Regions, 2) - regions, err = client.GetEmptyRegions(env.ctx) + regions, err = client.GetEmptyRegions(ctx) re.NoError(err) re.Equal(int64(2), regions.Count) re.Len(regions.Regions, 2) - state, err := client.GetRegionsReplicatedStateByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3"))) + state, err := client.GetRegionsReplicatedStateByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3"))) re.NoError(err) re.Equal("INPROGRESS", state) - regionStats, err := client.GetRegionStatusByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), false) + regionStats, err := client.GetRegionStatusByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), false) re.NoError(err) - re.Greater(regionStats.Count, 0) + re.Positive(regionStats.Count) re.NotEmpty(regionStats.StoreLeaderCount) - regionStats, err = client.GetRegionStatusByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), true) + regionStats, err = client.GetRegionStatusByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), 
[]byte("a3")), true) re.NoError(err) - re.Greater(regionStats.Count, 0) + re.Positive(regionStats.Count) re.Empty(regionStats.StoreLeaderCount) - hotReadRegions, err := client.GetHotReadRegions(env.ctx) + hotReadRegions, err := client.GetHotReadRegions(ctx) re.NoError(err) - re.Len(hotReadRegions.AsPeer, 1) - re.Len(hotReadRegions.AsLeader, 1) - hotWriteRegions, err := client.GetHotWriteRegions(env.ctx) + re.Len(hotReadRegions.AsPeer, 4) + re.Len(hotReadRegions.AsLeader, 4) + hotWriteRegions, err := client.GetHotWriteRegions(ctx) re.NoError(err) - re.Len(hotWriteRegions.AsPeer, 1) - re.Len(hotWriteRegions.AsLeader, 1) - historyHorRegions, err := client.GetHistoryHotRegions(env.ctx, &pd.HistoryHotRegionsRequest{ + re.Len(hotWriteRegions.AsPeer, 4) + re.Len(hotWriteRegions.AsLeader, 4) + historyHorRegions, err := client.GetHistoryHotRegions(ctx, &pd.HistoryHotRegionsRequest{ StartTime: 0, EndTime: time.Now().AddDate(0, 0, 1).UnixNano() / int64(time.Millisecond), }) re.NoError(err) re.Empty(historyHorRegions.HistoryHotRegion) - store, err := client.GetStores(env.ctx) + stores, err := client.GetStores(ctx) re.NoError(err) - re.Equal(1, store.Count) - re.Len(store.Stores, 1) - storeID := uint64(store.Stores[0].Store.ID) // TODO: why type is different? - store2, err := client.GetStore(env.ctx, storeID) + re.Equal(4, stores.Count) + re.Len(stores.Stores, 4) + storeID := uint64(stores.Stores[0].Store.ID) // TODO: why type is different? + store2, err := client.GetStore(ctx, storeID) re.NoError(err) re.EqualValues(storeID, store2.Store.ID) - version, err := client.GetClusterVersion(env.ctx) + version, err := client.GetClusterVersion(ctx) re.NoError(err) - re.Equal("0.0.0", version) - rgs, _ := client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte("a"), []byte("a1")), 100) + re.Equal("1.0.0", version) + rgs, _ := client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a"), []byte("a1")), 100) re.Equal(int64(0), rgs.Count) - rgs, _ = client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), 100) + rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a1"), []byte("a3")), 100) re.Equal(int64(2), rgs.Count) - rgs, _ = client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte("a2"), []byte("b")), 100) + rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte("a2"), []byte("b")), 100) re.Equal(int64(1), rgs.Count) - rgs, _ = client.GetRegionsByKeyRange(env.ctx, pd.NewKeyRange([]byte(""), []byte("")), 100) + rgs, _ = client.GetRegionsByKeyRange(ctx, pd.NewKeyRange([]byte(""), []byte("")), 100) re.Equal(int64(2), rgs.Count) + // store 2 origin status:offline + err = client.DeleteStore(ctx, 2) + re.NoError(err) + store2, err = client.GetStore(ctx, 2) + re.NoError(err) + re.Equal(int64(metapb.StoreState_Offline), store2.Store.State) } func (suite *httpClientTestSuite) TestGetMinResolvedTSByStoresIDs() { - suite.RunTestInTwoModes(suite.checkGetMinResolvedTSByStoresIDs) -} - -func (suite *httpClientTestSuite) checkGetMinResolvedTSByStoresIDs(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() testMinResolvedTS := tsoutil.TimeToTS(time.Now()) - raftCluster := env.cluster.GetLeaderServer().GetRaftCluster() + raftCluster := suite.cluster.GetLeaderServer().GetRaftCluster() err := raftCluster.SetMinResolvedTS(1, testMinResolvedTS) re.NoError(err) // Make sure the min resolved TS is updated. 
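Stepping back from the individual assertions: the suite refactor above drops the per-test mode loop and instead runs the whole testify suite twice, once per service-discovery mode, by setting a flag before SetupSuite executes. A minimal self-contained sketch of that pattern (toy fields only; it would live in a _test.go file next to the package under test):

package client

import (
	"testing"

	"github.com/stretchr/testify/suite"
)

type clientTestSuite struct {
	suite.Suite
	withServiceDiscovery bool
	client               string // stand-in for the pd.Client built in SetupSuite
}

func (s *clientTestSuite) SetupSuite() {
	// The flag is already populated here because it was set before suite.Run.
	if s.withServiceDiscovery {
		s.client = "client-with-specific-service-discovery"
	} else {
		s.client = "client-with-default-service-discovery"
	}
}

func (s *clientTestSuite) TestClientIsConfigured() {
	s.Require().NotEmpty(s.client)
}

// Two entry points, one per mode, mirroring TestHTTPClientTestSuite and
// TestHTTPClientTestSuiteWithServiceDiscovery above.
func TestClientSuite(t *testing.T) {
	suite.Run(t, &clientTestSuite{withServiceDiscovery: false})
}

func TestClientSuiteWithServiceDiscovery(t *testing.T) {
	suite.Run(t, &clientTestSuite{withServiceDiscovery: true})
}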
@@ -233,18 +229,18 @@ func (suite *httpClientTestSuite) checkGetMinResolvedTSByStoresIDs(mode mode, cl return minResolvedTS == testMinResolvedTS }) // Wait for the cluster-level min resolved TS to be initialized. - minResolvedTS, storeMinResolvedTSMap, err := client.GetMinResolvedTSByStoresIDs(env.ctx, nil) + minResolvedTS, storeMinResolvedTSMap, err := client.GetMinResolvedTSByStoresIDs(ctx, nil) re.NoError(err) re.Equal(testMinResolvedTS, minResolvedTS) re.Empty(storeMinResolvedTSMap) // Get the store-level min resolved TS. - minResolvedTS, storeMinResolvedTSMap, err = client.GetMinResolvedTSByStoresIDs(env.ctx, []uint64{1}) + minResolvedTS, storeMinResolvedTSMap, err = client.GetMinResolvedTSByStoresIDs(ctx, []uint64{1}) re.NoError(err) re.Equal(testMinResolvedTS, minResolvedTS) re.Len(storeMinResolvedTSMap, 1) re.Equal(minResolvedTS, storeMinResolvedTSMap[1]) // Get the store-level min resolved TS with an invalid store ID. - minResolvedTS, storeMinResolvedTSMap, err = client.GetMinResolvedTSByStoresIDs(env.ctx, []uint64{1, 2}) + minResolvedTS, storeMinResolvedTSMap, err = client.GetMinResolvedTSByStoresIDs(ctx, []uint64{1, 2}) re.NoError(err) re.Equal(testMinResolvedTS, minResolvedTS) re.Len(storeMinResolvedTSMap, 2) @@ -253,22 +249,19 @@ func (suite *httpClientTestSuite) checkGetMinResolvedTSByStoresIDs(mode mode, cl } func (suite *httpClientTestSuite) TestRule() { - suite.RunTestInTwoModes(suite.checkRule) -} - -func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - bundles, err := client.GetAllPlacementRuleBundles(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + bundles, err := client.GetAllPlacementRuleBundles(ctx) re.NoError(err) re.Len(bundles, 1) re.Equal(placement.DefaultGroupID, bundles[0].ID) - bundle, err := client.GetPlacementRuleBundleByGroup(env.ctx, placement.DefaultGroupID) + bundle, err := client.GetPlacementRuleBundleByGroup(ctx, placement.DefaultGroupID) re.NoError(err) re.Equal(bundles[0], bundle) // Check if we have the default rule. - checkRuleResult(re, env, client, &pd.Rule{ + suite.checkRuleResult(ctx, re, &pd.Rule{ GroupID: placement.DefaultGroupID, ID: placement.DefaultRuleID, Role: pd.Voter, @@ -277,7 +270,7 @@ func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { EndKey: []byte{}, }, 1, true) // Should be the same as the rules in the bundle. 
- checkRuleResult(re, env, client, bundle.Rules[0], 1, true) + suite.checkRuleResult(ctx, re, bundle.Rules[0], 1, true) testRule := &pd.Rule{ GroupID: placement.DefaultGroupID, ID: "test", @@ -286,39 +279,39 @@ func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { StartKey: []byte{}, EndKey: []byte{}, } - err = client.SetPlacementRule(env.ctx, testRule) + err = client.SetPlacementRule(ctx, testRule) re.NoError(err) - checkRuleResult(re, env, client, testRule, 2, true) - err = client.DeletePlacementRule(env.ctx, placement.DefaultGroupID, "test") + suite.checkRuleResult(ctx, re, testRule, 2, true) + err = client.DeletePlacementRule(ctx, placement.DefaultGroupID, "test") re.NoError(err) - checkRuleResult(re, env, client, testRule, 1, false) + suite.checkRuleResult(ctx, re, testRule, 1, false) testRuleOp := &pd.RuleOp{ Rule: testRule, Action: pd.RuleOpAdd, } - err = client.SetPlacementRuleInBatch(env.ctx, []*pd.RuleOp{testRuleOp}) + err = client.SetPlacementRuleInBatch(ctx, []*pd.RuleOp{testRuleOp}) re.NoError(err) - checkRuleResult(re, env, client, testRule, 2, true) + suite.checkRuleResult(ctx, re, testRule, 2, true) testRuleOp = &pd.RuleOp{ Rule: testRule, Action: pd.RuleOpDel, } - err = client.SetPlacementRuleInBatch(env.ctx, []*pd.RuleOp{testRuleOp}) + err = client.SetPlacementRuleInBatch(ctx, []*pd.RuleOp{testRuleOp}) re.NoError(err) - checkRuleResult(re, env, client, testRule, 1, false) - err = client.SetPlacementRuleBundles(env.ctx, []*pd.GroupBundle{ + suite.checkRuleResult(ctx, re, testRule, 1, false) + err = client.SetPlacementRuleBundles(ctx, []*pd.GroupBundle{ { ID: placement.DefaultGroupID, Rules: []*pd.Rule{testRule}, }, }, true) re.NoError(err) - checkRuleResult(re, env, client, testRule, 1, true) - ruleGroups, err := client.GetAllPlacementRuleGroups(env.ctx) + suite.checkRuleResult(ctx, re, testRule, 1, true) + ruleGroups, err := client.GetAllPlacementRuleGroups(ctx) re.NoError(err) re.Len(ruleGroups, 1) re.Equal(placement.DefaultGroupID, ruleGroups[0].ID) - ruleGroup, err := client.GetPlacementRuleGroupByID(env.ctx, placement.DefaultGroupID) + ruleGroup, err := client.GetPlacementRuleGroupByID(ctx, placement.DefaultGroupID) re.NoError(err) re.Equal(ruleGroups[0], ruleGroup) testRuleGroup := &pd.RuleGroup{ @@ -326,14 +319,14 @@ func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { Index: 1, Override: true, } - err = client.SetPlacementRuleGroup(env.ctx, testRuleGroup) + err = client.SetPlacementRuleGroup(ctx, testRuleGroup) re.NoError(err) - ruleGroup, err = client.GetPlacementRuleGroupByID(env.ctx, testRuleGroup.ID) + ruleGroup, err = client.GetPlacementRuleGroupByID(ctx, testRuleGroup.ID) re.NoError(err) re.Equal(testRuleGroup, ruleGroup) - err = client.DeletePlacementRuleGroupByID(env.ctx, testRuleGroup.ID) + err = client.DeletePlacementRuleGroupByID(ctx, testRuleGroup.ID) re.NoError(err) - ruleGroup, err = client.GetPlacementRuleGroupByID(env.ctx, testRuleGroup.ID) + ruleGroup, err = client.GetPlacementRuleGroupByID(ctx, testRuleGroup.ID) re.ErrorContains(err, http.StatusText(http.StatusNotFound)) re.Empty(ruleGroup) // Test the start key and end key. 
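For reference, a hypothetical test method (not part of this diff) in the same style as the rule checks above, showing one full round trip of a placement rule; it reuses suite.client and suite.ctx from the refactored suite and calls only client methods that already appear in this file:

// TestSinglePlacementRuleRoundTrip is a hypothetical example in the shape of the
// suite's other tests: set a rule, read it back, delete it, confirm it is gone.
func (suite *httpClientTestSuite) TestSinglePlacementRuleRoundTrip() {
	re := suite.Require()
	client := suite.client
	ctx, cancel := context.WithCancel(suite.ctx)
	defer cancel()

	rule := &pd.Rule{
		GroupID:  placement.DefaultGroupID,
		ID:       "round-trip",
		Role:     pd.Voter,
		Count:    3,
		StartKey: []byte{},
		EndKey:   []byte{},
	}
	re.NoError(client.SetPlacementRule(ctx, rule))

	got, err := client.GetPlacementRule(ctx, rule.GroupID, rule.ID)
	re.NoError(err)
	re.Equal(rule.ID, got.ID)

	re.NoError(client.DeletePlacementRule(ctx, rule.GroupID, rule.ID))
	_, err = client.GetPlacementRule(ctx, rule.GroupID, rule.ID)
	re.Error(err)
}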
@@ -345,34 +338,33 @@ func (suite *httpClientTestSuite) checkRule(mode mode, client pd.Client) { StartKey: []byte("a1"), EndKey: []byte(""), } - err = client.SetPlacementRule(env.ctx, testRule) + err = client.SetPlacementRule(ctx, testRule) re.NoError(err) - checkRuleResult(re, env, client, testRule, 1, true) + suite.checkRuleResult(ctx, re, testRule, 1, true) } -func checkRuleResult( - re *require.Assertions, - env *httpClientTestEnv, - client pd.Client, +func (suite *httpClientTestSuite) checkRuleResult( + ctx context.Context, re *require.Assertions, rule *pd.Rule, totalRuleCount int, exist bool, ) { + client := suite.client if exist { - got, err := client.GetPlacementRule(env.ctx, rule.GroupID, rule.ID) + got, err := client.GetPlacementRule(ctx, rule.GroupID, rule.ID) re.NoError(err) // skip comparison of the generated field got.StartKeyHex = rule.StartKeyHex got.EndKeyHex = rule.EndKeyHex re.Equal(rule, got) } else { - _, err := client.GetPlacementRule(env.ctx, rule.GroupID, rule.ID) + _, err := client.GetPlacementRule(ctx, rule.GroupID, rule.ID) re.ErrorContains(err, http.StatusText(http.StatusNotFound)) } // Check through the `GetPlacementRulesByGroup` API. - rules, err := client.GetPlacementRulesByGroup(env.ctx, rule.GroupID) + rules, err := client.GetPlacementRulesByGroup(ctx, rule.GroupID) re.NoError(err) checkRuleFunc(re, rules, rule, totalRuleCount, exist) // Check through the `GetPlacementRuleBundleByGroup` API. - bundle, err := client.GetPlacementRuleBundleByGroup(env.ctx, rule.GroupID) + bundle, err := client.GetPlacementRuleBundleByGroup(ctx, rule.GroupID) re.NoError(err) checkRuleFunc(re, bundle.Rules, rule, totalRuleCount, exist) } @@ -400,14 +392,11 @@ func checkRuleFunc( } func (suite *httpClientTestSuite) TestRegionLabel() { - suite.RunTestInTwoModes(suite.checkRegionLabel) -} - -func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - labelRules, err := client.GetAllRegionLabelRules(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + labelRules, err := client.GetAllRegionLabelRules(ctx) re.NoError(err) re.Len(labelRules, 1) re.Equal("keyspaces/0", labelRules[0].ID) @@ -418,9 +407,9 @@ func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) RuleType: "key-range", Data: labeler.MakeKeyRanges("1234", "5678"), } - err = client.SetRegionLabelRule(env.ctx, labelRule) + err = client.SetRegionLabelRule(ctx, labelRule) re.NoError(err) - labelRules, err = client.GetAllRegionLabelRules(env.ctx) + labelRules, err = client.GetAllRegionLabelRules(ctx) re.NoError(err) re.Len(labelRules, 2) sort.Slice(labelRules, func(i, j int) bool { @@ -440,9 +429,9 @@ func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) SetRules: []*pd.LabelRule{labelRule}, DeleteRules: []string{"rule1"}, } - err = client.PatchRegionLabelRules(env.ctx, patch) + err = client.PatchRegionLabelRules(ctx, patch) re.NoError(err) - allLabelRules, err := client.GetAllRegionLabelRules(env.ctx) + allLabelRules, err := client.GetAllRegionLabelRules(ctx) re.NoError(err) re.Len(labelRules, 2) sort.Slice(allLabelRules, func(i, j int) bool { @@ -451,7 +440,7 @@ func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) re.Equal(labelRule.ID, allLabelRules[1].ID) re.Equal(labelRule.Labels, allLabelRules[1].Labels) re.Equal(labelRule.RuleType, allLabelRules[1].RuleType) - labelRules, err = client.GetRegionLabelRulesByIDs(env.ctx, 
[]string{"keyspaces/0", "rule2"}) + labelRules, err = client.GetRegionLabelRulesByIDs(ctx, []string{"keyspaces/0", "rule2"}) re.NoError(err) sort.Slice(labelRules, func(i, j int) bool { return labelRules[i].ID < labelRules[j].ID @@ -460,24 +449,21 @@ func (suite *httpClientTestSuite) checkRegionLabel(mode mode, client pd.Client) } func (suite *httpClientTestSuite) TestAccelerateSchedule() { - suite.RunTestInTwoModes(suite.checkAccelerateSchedule) -} - -func (suite *httpClientTestSuite) checkAccelerateSchedule(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - raftCluster := env.cluster.GetLeaderServer().GetRaftCluster() + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + raftCluster := suite.cluster.GetLeaderServer().GetRaftCluster() suspectRegions := raftCluster.GetSuspectRegions() re.Empty(suspectRegions) - err := client.AccelerateSchedule(env.ctx, pd.NewKeyRange([]byte("a1"), []byte("a2"))) + err := client.AccelerateSchedule(ctx, pd.NewKeyRange([]byte("a1"), []byte("a2"))) re.NoError(err) suspectRegions = raftCluster.GetSuspectRegions() re.Len(suspectRegions, 1) raftCluster.ClearSuspectRegions() suspectRegions = raftCluster.GetSuspectRegions() re.Empty(suspectRegions) - err = client.AccelerateScheduleInBatch(env.ctx, []*pd.KeyRange{ + err = client.AccelerateScheduleInBatch(ctx, []*pd.KeyRange{ pd.NewKeyRange([]byte("a1"), []byte("a2")), pd.NewKeyRange([]byte("a2"), []byte("a3")), }) @@ -487,24 +473,21 @@ func (suite *httpClientTestSuite) checkAccelerateSchedule(mode mode, client pd.C } func (suite *httpClientTestSuite) TestConfig() { - suite.RunTestInTwoModes(suite.checkConfig) -} - -func (suite *httpClientTestSuite) checkConfig(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - config, err := client.GetConfig(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + config, err := client.GetConfig(ctx) re.NoError(err) re.Equal(float64(4), config["schedule"].(map[string]any)["leader-schedule-limit"]) newConfig := map[string]any{ "schedule.leader-schedule-limit": float64(8), } - err = client.SetConfig(env.ctx, newConfig) + err = client.SetConfig(ctx, newConfig) re.NoError(err) - config, err = client.GetConfig(env.ctx) + config, err = client.GetConfig(ctx) re.NoError(err) re.Equal(float64(8), config["schedule"].(map[string]any)["leader-schedule-limit"]) @@ -512,15 +495,15 @@ func (suite *httpClientTestSuite) checkConfig(mode mode, client pd.Client) { newConfig = map[string]any{ "schedule.leader-schedule-limit": float64(16), } - err = client.SetConfig(env.ctx, newConfig, 5) + err = client.SetConfig(ctx, newConfig, 5) re.NoError(err) - resp, err := env.cluster.GetEtcdClient().Get(env.ctx, sc.TTLConfigPrefix+"/schedule.leader-schedule-limit") + resp, err := suite.cluster.GetEtcdClient().Get(ctx, sc.TTLConfigPrefix+"/schedule.leader-schedule-limit") re.NoError(err) re.Equal([]byte("16"), resp.Kvs[0].Value) // delete the config with TTL. 
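The TTL branch of this test condenses into a small hypothetical helper (not part of this diff); it only uses the SetConfig(ctx, cfg, ttlSeconds) form already shown here, where a non-zero TTL makes the override temporary and resending it with TTL 0 removes the entry:

// setTemporaryLeaderScheduleLimit is a hypothetical helper sketching the TTL usage
// exercised above; the option name and value mirror the test, nothing else is implied.
func setTemporaryLeaderScheduleLimit(ctx context.Context, client pd.Client) error {
	override := map[string]any{
		"schedule.leader-schedule-limit": float64(16),
	}
	// Apply the override for 5 seconds only.
	if err := client.SetConfig(ctx, override, 5); err != nil {
		return err
	}
	// Drop the TTL override explicitly instead of waiting for it to expire.
	return client.SetConfig(ctx, override, 0)
}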
- err = client.SetConfig(env.ctx, newConfig, 0) + err = client.SetConfig(ctx, newConfig, 0) re.NoError(err) - resp, err = env.cluster.GetEtcdClient().Get(env.ctx, sc.TTLConfigPrefix+"/schedule.leader-schedule-limit") + resp, err = suite.cluster.GetEtcdClient().Get(ctx, sc.TTLConfigPrefix+"/schedule.leader-schedule-limit") re.NoError(err) re.Empty(resp.Kvs) @@ -528,81 +511,73 @@ func (suite *httpClientTestSuite) checkConfig(mode mode, client pd.Client) { newConfig = map[string]any{ "schedule.max-pending-peer-count": uint64(math.MaxInt32), } - err = client.SetConfig(env.ctx, newConfig, 4) + err = client.SetConfig(ctx, newConfig, 4) re.NoError(err) - c := env.cluster.GetLeaderServer().GetRaftCluster().GetOpts().GetMaxPendingPeerCount() + c := suite.cluster.GetLeaderServer().GetRaftCluster().GetOpts().GetMaxPendingPeerCount() re.Equal(uint64(math.MaxInt32), c) - err = client.SetConfig(env.ctx, newConfig, 0) + err = client.SetConfig(ctx, newConfig, 0) re.NoError(err) - resp, err = env.cluster.GetEtcdClient().Get(env.ctx, sc.TTLConfigPrefix+"/schedule.max-pending-peer-count") + resp, err = suite.cluster.GetEtcdClient().Get(ctx, sc.TTLConfigPrefix+"/schedule.max-pending-peer-count") re.NoError(err) re.Empty(resp.Kvs) } func (suite *httpClientTestSuite) TestScheduleConfig() { - suite.RunTestInTwoModes(suite.checkScheduleConfig) -} - -func (suite *httpClientTestSuite) checkScheduleConfig(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - config, err := client.GetScheduleConfig(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + config, err := client.GetScheduleConfig(ctx) re.NoError(err) re.Equal(float64(4), config["hot-region-schedule-limit"]) re.Equal(float64(2048), config["region-schedule-limit"]) config["hot-region-schedule-limit"] = float64(8) - err = client.SetScheduleConfig(env.ctx, config) + err = client.SetScheduleConfig(ctx, config) re.NoError(err) - config, err = client.GetScheduleConfig(env.ctx) + config, err = client.GetScheduleConfig(ctx) re.NoError(err) re.Equal(float64(8), config["hot-region-schedule-limit"]) re.Equal(float64(2048), config["region-schedule-limit"]) } func (suite *httpClientTestSuite) TestSchedulers() { - suite.RunTestInTwoModes(suite.checkSchedulers) -} - -func (suite *httpClientTestSuite) checkSchedulers(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - schedulers, err := client.GetSchedulers(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + schedulers, err := client.GetSchedulers(ctx) re.NoError(err) - re.Empty(schedulers) + const schedulerName = "evict-leader-scheduler" + re.NotContains(schedulers, schedulerName) - err = client.CreateScheduler(env.ctx, "evict-leader-scheduler", 1) + err = client.CreateScheduler(ctx, schedulerName, 1) re.NoError(err) - schedulers, err = client.GetSchedulers(env.ctx) + schedulers, err = client.GetSchedulers(ctx) re.NoError(err) - re.Len(schedulers, 1) - err = client.SetSchedulerDelay(env.ctx, "evict-leader-scheduler", 100) + re.Contains(schedulers, schedulerName) + err = client.SetSchedulerDelay(ctx, schedulerName, 100) re.NoError(err) - err = client.SetSchedulerDelay(env.ctx, "not-exist", 100) + err = client.SetSchedulerDelay(ctx, "not-exist", 100) re.ErrorContains(err, "500 Internal Server Error") // TODO: should return friendly error message } func (suite *httpClientTestSuite) TestSetStoreLabels() { - suite.RunTestInTwoModes(suite.checkSetStoreLabels) -} - -func (suite 
*httpClientTestSuite) checkSetStoreLabels(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - resp, err := client.GetStores(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + resp, err := client.GetStores(ctx) re.NoError(err) setStore := resp.Stores[0] re.Empty(setStore.Store.Labels, nil) storeLabels := map[string]string{ "zone": "zone1", } - err = client.SetStoreLabels(env.ctx, 1, storeLabels) + err = client.SetStoreLabels(ctx, 1, storeLabels) re.NoError(err) - resp, err = client.GetStores(env.ctx) + resp, err = client.GetStores(ctx) re.NoError(err) for _, store := range resp.Stores { if store.Store.ID == setStore.Store.ID { @@ -614,67 +589,52 @@ func (suite *httpClientTestSuite) checkSetStoreLabels(mode mode, client pd.Clien } func (suite *httpClientTestSuite) TestTransferLeader() { - suite.RunTestInTwoModes(suite.checkTransferLeader) -} - -func (suite *httpClientTestSuite) checkTransferLeader(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - members, err := client.GetMembers(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + members, err := client.GetMembers(ctx) re.NoError(err) re.Len(members.Members, 2) - leader, err := client.GetLeader(env.ctx) + leader, err := client.GetLeader(ctx) re.NoError(err) // Transfer leader to another pd for _, member := range members.Members { if member.GetName() != leader.GetName() { - err = client.TransferLeader(env.ctx, member.GetName()) + err = client.TransferLeader(ctx, member.GetName()) re.NoError(err) break } } - newLeader := env.cluster.WaitLeader() + newLeader := suite.cluster.WaitLeader() re.NotEmpty(newLeader) re.NoError(err) re.NotEqual(leader.GetName(), newLeader) // Force to update the members info. 
testutil.Eventually(re, func() bool { - leader, err = client.GetLeader(env.ctx) + leader, err = client.GetLeader(ctx) re.NoError(err) return newLeader == leader.GetName() }) - members, err = client.GetMembers(env.ctx) + members, err = client.GetMembers(ctx) re.NoError(err) re.Len(members.Members, 2) re.Equal(leader.GetName(), members.Leader.GetName()) } func (suite *httpClientTestSuite) TestVersion() { - suite.RunTestInTwoModes(suite.checkVersion) -} - -func (suite *httpClientTestSuite) checkVersion(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - ver, err := client.GetPDVersion(env.ctx) + ver, err := suite.client.GetPDVersion(suite.ctx) re.NoError(err) re.Equal(versioninfo.PDReleaseVersion, ver) } func (suite *httpClientTestSuite) TestStatus() { - suite.RunTestInTwoModes(suite.checkStatus) -} - -func (suite *httpClientTestSuite) checkStatus(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - status, err := client.GetStatus(env.ctx) + status, err := suite.client.GetStatus(suite.ctx) re.NoError(err) re.Equal(versioninfo.PDReleaseVersion, status.Version) re.Equal(versioninfo.PDGitHash, status.GitHash) @@ -683,48 +643,41 @@ func (suite *httpClientTestSuite) checkStatus(mode mode, client pd.Client) { } func (suite *httpClientTestSuite) TestAdmin() { - suite.RunTestInTwoModes(suite.checkAdmin) -} - -func (suite *httpClientTestSuite) checkAdmin(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - - err := client.SetSnapshotRecoveringMark(env.ctx) + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + err := client.SetSnapshotRecoveringMark(ctx) re.NoError(err) - err = client.ResetTS(env.ctx, 123, true) + err = client.ResetTS(ctx, 123, true) re.NoError(err) - err = client.ResetBaseAllocID(env.ctx, 456) + err = client.ResetBaseAllocID(ctx, 456) re.NoError(err) - err = client.DeleteSnapshotRecoveringMark(env.ctx) + err = client.DeleteSnapshotRecoveringMark(ctx) re.NoError(err) } func (suite *httpClientTestSuite) TestWithBackoffer() { - suite.RunTestInTwoModes(suite.checkWithBackoffer) -} - -func (suite *httpClientTestSuite) checkWithBackoffer(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] - + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() // Should return with 404 error without backoffer. - rule, err := client.GetPlacementRule(env.ctx, "non-exist-group", "non-exist-rule") + rule, err := client.GetPlacementRule(ctx, "non-exist-group", "non-exist-rule") re.ErrorContains(err, http.StatusText(http.StatusNotFound)) re.Nil(rule) // Should return with 404 error even with an infinite backoffer. rule, err = client. WithBackoffer(retry.InitialBackoffer(100*time.Millisecond, time.Second, 0)). 
- GetPlacementRule(env.ctx, "non-exist-group", "non-exist-rule") + GetPlacementRule(ctx, "non-exist-group", "non-exist-rule") re.ErrorContains(err, http.StatusText(http.StatusNotFound)) re.Nil(rule) } func (suite *httpClientTestSuite) TestRedirectWithMetrics() { re := suite.Require() - env := suite.env[defaultServiceDiscovery] - cli := setupCli(env.ctx, suite.Require(), env.endpoints) + cli := setupCli(suite.ctx, re, suite.endpoints) defer cli.Close() sd := cli.GetServiceDiscovery() @@ -785,12 +738,10 @@ func (suite *httpClientTestSuite) TestRedirectWithMetrics() { } func (suite *httpClientTestSuite) TestUpdateKeyspaceGCManagementType() { - suite.RunTestInTwoModes(suite.checkUpdateKeyspaceGCManagementType) -} - -func (suite *httpClientTestSuite) checkUpdateKeyspaceGCManagementType(mode mode, client pd.Client) { re := suite.Require() - env := suite.env[mode] + client := suite.client + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() keyspaceName := "DEFAULT" expectGCManagementType := "keyspace_level_gc" @@ -800,10 +751,10 @@ func (suite *httpClientTestSuite) checkUpdateKeyspaceGCManagementType(mode mode, GCManagementType: expectGCManagementType, }, } - err := client.UpdateKeyspaceGCManagementType(env.ctx, keyspaceName, &keyspaceSafePointVersionConfig) + err := client.UpdateKeyspaceGCManagementType(ctx, keyspaceName, &keyspaceSafePointVersionConfig) re.NoError(err) - keyspaceMetaRes, err := client.GetKeyspaceMetaByName(env.ctx, keyspaceName) + keyspaceMetaRes, err := client.GetKeyspaceMetaByName(ctx, keyspaceName) re.NoError(err) val, ok := keyspaceMetaRes.Config["gc_management_type"] @@ -811,3 +762,56 @@ func (suite *httpClientTestSuite) checkUpdateKeyspaceGCManagementType(mode mode, re.True(ok) re.Equal(expectGCManagementType, val) } + +func (suite *httpClientTestSuite) TestGetHealthStatus() { + re := suite.Require() + healths, err := suite.client.GetHealthStatus(suite.ctx) + re.NoError(err) + re.Len(healths, 2) + sort.Slice(healths, func(i, j int) bool { + return healths[i].Name < healths[j].Name + }) + re.Equal("pd1", healths[0].Name) + re.Equal("pd2", healths[1].Name) + re.True(healths[0].Health && healths[1].Health) +} + +func (suite *httpClientTestSuite) TestRetryOnLeaderChange() { + re := suite.Require() + ctx, cancel := context.WithCancel(suite.ctx) + defer cancel() + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + bo := retry.InitialBackoffer(100*time.Millisecond, time.Second, 0) + client := suite.client.WithBackoffer(bo) + for { + healths, err := client.GetHealthStatus(ctx) + if err != nil && strings.Contains(err.Error(), "context canceled") { + return + } + re.NoError(err) + re.Len(healths, 2) + select { + case <-ctx.Done(): + return + default: + } + } + }() + + leader := suite.cluster.GetLeaderServer() + re.NotNil(leader) + for i := 0; i < 3; i++ { + leader.ResignLeader() + re.NotEmpty(suite.cluster.WaitLeader()) + leader = suite.cluster.GetLeaderServer() + re.NotNil(leader) + } + + // Cancel the context to stop the goroutine. 
+ cancel() + wg.Wait() +} diff --git a/tests/integrations/mcs/scheduling/api_test.go b/tests/integrations/mcs/scheduling/api_test.go index cf2c6dd2508..365ab1ca493 100644 --- a/tests/integrations/mcs/scheduling/api_test.go +++ b/tests/integrations/mcs/scheduling/api_test.go @@ -498,19 +498,19 @@ func (suite *apiTestSuite) checkAdminRegionCacheForward(cluster *tests.TestClust apiServer := cluster.GetLeaderServer().GetServer() schedulingServer := cluster.GetSchedulingPrimaryServer() re.Equal(3, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) - re.Equal(3, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count) + re.Equal(3, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{})) addr := cluster.GetLeaderServer().GetAddr() urlPrefix := fmt.Sprintf("%s/pd/api/v1/admin/cache/region", addr) err := testutil.CheckDelete(tests.TestDialClient, fmt.Sprintf("%s/%s", urlPrefix, "30"), testutil.StatusOK(re)) re.NoError(err) re.Equal(2, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) - re.Equal(2, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count) + re.Equal(2, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{})) err = testutil.CheckDelete(tests.TestDialClient, urlPrefix+"s", testutil.StatusOK(re)) re.NoError(err) re.Equal(0, schedulingServer.GetCluster().GetRegionCount([]byte{}, []byte{})) - re.Equal(0, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count) + re.Equal(0, apiServer.GetRaftCluster().GetRegionCount([]byte{}, []byte{})) } func (suite *apiTestSuite) TestFollowerForward() { diff --git a/tests/integrations/mcs/scheduling/config_test.go b/tests/integrations/mcs/scheduling/config_test.go index d7883379731..54622d5c515 100644 --- a/tests/integrations/mcs/scheduling/config_test.go +++ b/tests/integrations/mcs/scheduling/config_test.go @@ -175,7 +175,7 @@ func (suite *configTestSuite) TestSchedulerConfigWatch() { }) assertEvictLeaderStoreIDs(re, storage, []uint64{1}) // Update the scheduler by adding a store. 
- err = suite.pdLeaderServer.GetServer().GetRaftCluster().PutStore( + err = suite.pdLeaderServer.GetServer().GetRaftCluster().PutMetaStore( &metapb.Store{ Id: 2, Address: "mock://2", diff --git a/tests/integrations/mcs/scheduling/meta_test.go b/tests/integrations/mcs/scheduling/meta_test.go index abc1efd9021..11782590ab9 100644 --- a/tests/integrations/mcs/scheduling/meta_test.go +++ b/tests/integrations/mcs/scheduling/meta_test.go @@ -79,7 +79,7 @@ func (suite *metaTestSuite) TestStoreWatch() { ) re.NoError(err) for i := uint64(1); i <= 4; i++ { - suite.pdLeaderServer.GetServer().GetRaftCluster().PutStore( + suite.pdLeaderServer.GetServer().GetRaftCluster().PutMetaStore( &metapb.Store{Id: i, Address: fmt.Sprintf("mock-%d", i), State: metapb.StoreState_Up, NodeState: metapb.NodeState_Serving, LastHeartbeat: time.Now().UnixNano()}, ) } @@ -102,7 +102,7 @@ func (suite *metaTestSuite) TestStoreWatch() { }) // test synchronized store labels - suite.pdLeaderServer.GetServer().GetRaftCluster().PutStore( + suite.pdLeaderServer.GetServer().GetRaftCluster().PutMetaStore( &metapb.Store{Id: 5, Address: "mock-5", State: metapb.StoreState_Up, NodeState: metapb.NodeState_Serving, LastHeartbeat: time.Now().UnixNano(), Labels: []*metapb.StoreLabel{{Key: "zone", Value: "z1"}}}, ) testutil.Eventually(re, func() bool { diff --git a/tests/integrations/mcs/scheduling/server_test.go b/tests/integrations/mcs/scheduling/server_test.go index 38c1cc6a41b..82da47d18f3 100644 --- a/tests/integrations/mcs/scheduling/server_test.go +++ b/tests/integrations/mcs/scheduling/server_test.go @@ -310,7 +310,7 @@ func (suite *serverTestSuite) TestSchedulerSync() { checkEvictLeaderSchedulerExist(re, schedulersController, true) checkEvictLeaderStoreIDs(re, schedulersController, []uint64{1}) // Add a store_id to the evict-leader-scheduler through the API server. - err = suite.pdLeader.GetServer().GetRaftCluster().PutStore( + err = suite.pdLeader.GetServer().GetRaftCluster().PutMetaStore( &metapb.Store{ Id: 2, Address: "mock://2", diff --git a/tests/integrations/mcs/tso/keyspace_group_manager_test.go b/tests/integrations/mcs/tso/keyspace_group_manager_test.go index f7b892ce77d..6d861962d9b 100644 --- a/tests/integrations/mcs/tso/keyspace_group_manager_test.go +++ b/tests/integrations/mcs/tso/keyspace_group_manager_test.go @@ -28,6 +28,7 @@ import ( "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" pd "github.com/tikv/pd/client" + clierrs "github.com/tikv/pd/client/errs" "github.com/tikv/pd/pkg/election" "github.com/tikv/pd/pkg/errs" mcsutils "github.com/tikv/pd/pkg/mcs/utils" @@ -300,7 +301,7 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupSplit() { // Check the split TSO from keyspace group `newID` now. splitTS, err := suite.requestTSO(re, 222, newID) re.NoError(err) - re.Greater(tsoutil.CompareTimestamp(&splitTS, &ts), 0) + re.Positive(tsoutil.CompareTimestamp(&splitTS, &ts)) } func (suite *tsoKeyspaceGroupManagerTestSuite) requestTSO( @@ -467,8 +468,8 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) dispatchClient( errMsg := err.Error() // Ignore the errors caused by the split and context cancellation. 
if strings.Contains(errMsg, "context canceled") || - strings.Contains(errMsg, "not leader") || - strings.Contains(errMsg, "not served") || + strings.Contains(errMsg, clierrs.NotLeaderErr) || + strings.Contains(errMsg, clierrs.NotServedErr) || strings.Contains(errMsg, "ErrKeyspaceNotAssigned") || strings.Contains(errMsg, "ErrKeyspaceGroupIsMerging") { continue @@ -636,7 +637,7 @@ func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupMerge() { } return err == nil && tsoutil.CompareTimestamp(&mergedTS, &pdpb.Timestamp{}) > 0 }, testutil.WithTickInterval(5*time.Second), testutil.WithWaitFor(time.Minute)) - re.Greater(tsoutil.CompareTimestamp(&mergedTS, &ts), 0) + re.Positive(tsoutil.CompareTimestamp(&mergedTS, &ts)) } func (suite *tsoKeyspaceGroupManagerTestSuite) TestTSOKeyspaceGroupMergeClient() { diff --git a/tests/integrations/realcluster/deploy.sh b/tests/integrations/realcluster/deploy.sh index d6cd0b27f72..8cce60e8ee6 100755 --- a/tests/integrations/realcluster/deploy.sh +++ b/tests/integrations/realcluster/deploy.sh @@ -15,10 +15,12 @@ curl --proto '=https' --tlsv1.2 -sSf https://tiup-mirrors.pingcap.com/install.sh $TIUP_BIN_DIR update playground cd ../../.. -if [ ! -d "bin" ] || [ ! -e "bin/tikv-server" ] && [ ! -e "bin/tidb-server" ] && [ ! -e "bin/pd-server" ] && [ ! -e "bin/tiflash" ]; then +if [ ! -d "bin" ] || [ ! -e "bin/tikv-server" ] && [ ! -e "bin/tidb-server" ] && [ ! -e "bin/tiflash" ]; then color-green "downloading binaries..." color-green "this may take a few minutes, you can also download them manually and put them in the bin directory." + make pd-server WITH_RACE=1 $TIUP_BIN_DIR playground nightly --kv 3 --tiflash 1 --db 1 --pd 3 --without-monitor --tag pd_test \ + --pd.binpath ./bin/pd-server \ > $CUR_PATH/playground.log 2>&1 & else color-green "using existing binaries..." diff --git a/tests/server/api/api_test.go b/tests/server/api/api_test.go index 091d1488177..f59e85651f5 100644 --- a/tests/server/api/api_test.go +++ b/tests/server/api/api_test.go @@ -617,6 +617,24 @@ func (suite *redirectorTestSuite) TestRedirect() { re.Equal(h, header) } } + // Test redirect during leader election. + leader = suite.cluster.GetLeaderServer() + re.NotNil(leader) + err := leader.ResignLeader() + re.NoError(err) + for _, svr := range suite.cluster.GetServers() { + url := fmt.Sprintf("%s/pd/api/v1/version", svr.GetServer().GetAddr()) + testutil.Eventually(re, func() bool { + resp, err := tests.TestDialClient.Get(url) + re.NoError(err) + defer resp.Body.Close() + _, err = io.ReadAll(resp.Body) + re.NoError(err) + // Should not meet 503 since the retry logic ensure the request is sent to the new leader eventually. 
+ re.NotEqual(http.StatusServiceUnavailable, resp.StatusCode) + return resp.StatusCode == http.StatusOK + }) + } } func (suite *redirectorTestSuite) TestAllowFollowerHandle() { diff --git a/tests/server/api/region_test.go b/tests/server/api/region_test.go index 2ff0b5d4b86..23ebceaefd6 100644 --- a/tests/server/api/region_test.go +++ b/tests/server/api/region_test.go @@ -407,7 +407,7 @@ func (suite *regionTestSuite) checkRegionsReplicated(cluster *tests.TestCluster) func checkRegionCount(re *require.Assertions, cluster *tests.TestCluster, count uint64) { leader := cluster.GetLeaderServer() tu.Eventually(re, func() bool { - return leader.GetRaftCluster().GetRegionCount([]byte{}, []byte{}).Count == int(count) + return leader.GetRaftCluster().GetRegionCount([]byte{}, []byte{}) == int(count) }) if sche := cluster.GetSchedulingPrimaryServer(); sche != nil { tu.Eventually(re, func() bool { diff --git a/tests/server/cluster/cluster_test.go b/tests/server/cluster/cluster_test.go index 61a4561c55a..9e70a52d11d 100644 --- a/tests/server/cluster/cluster_test.go +++ b/tests/server/cluster/cluster_test.go @@ -601,7 +601,7 @@ func TestRaftClusterMultipleRestart(t *testing.T) { store := newMetaStore(storeID, "127.0.0.1:4", "2.1.0", metapb.StoreState_Offline, getTestDeployPath(storeID)) rc := leaderServer.GetRaftCluster() re.NotNil(rc) - err = rc.PutStore(store) + err = rc.PutMetaStore(store) re.NoError(err) re.NotNil(tc) rc.Stop() @@ -662,7 +662,7 @@ func TestNotLeader(t *testing.T) { grpcStatus, ok := status.FromError(err) re.True(ok) re.Equal(codes.Unavailable, grpcStatus.Code()) - re.Equal("not leader", grpcStatus.Message()) + re.ErrorContains(server.ErrNotLeader, grpcStatus.Message()) } func TestStoreVersionChange(t *testing.T) { diff --git a/tests/server/member/member_test.go b/tests/server/member/member_test.go index c581eb39390..edff14a3b98 100644 --- a/tests/server/member/member_test.go +++ b/tests/server/member/member_test.go @@ -328,20 +328,26 @@ func TestCampaignLeaderFrequently(t *testing.T) { re := require.New(t) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - cluster, err := tests.NewTestCluster(ctx, 5) + cluster, err := tests.NewTestCluster(ctx, 3) defer cluster.Destroy() re.NoError(err) err = cluster.RunInitialServers() re.NoError(err) + // Campaign leader for the 1st time. cluster.WaitLeader() leader := cluster.GetLeader() re.NotEmpty(cluster.GetLeader()) - for i := 0; i < 3; i++ { + // Need to prevent campaigning leader 3 times (including the 1st time above) within 5 minutes. + for i := 0; i < 2; i++ { cluster.GetLeaderServer().ResetPDLeader() cluster.WaitLeader() + re.Equal(leader, cluster.GetLeader()) } + // Check the 4th time. + cluster.GetLeaderServer().ResetPDLeader() + cluster.WaitLeader() // PD leader should be different from before because etcd leader changed.
re.NotEmpty(cluster.GetLeader()) re.NotEqual(leader, cluster.GetLeader()) diff --git a/tools/pd-ctl/pdctl/command/global.go b/tools/pd-ctl/pdctl/command/global.go index fa77df6a101..b29e2b63278 100644 --- a/tools/pd-ctl/pdctl/command/global.go +++ b/tools/pd-ctl/pdctl/command/global.go @@ -33,7 +33,7 @@ import ( ) const ( - pdControlCallerID = "pd-ctl" + PDControlCallerID = "pd-ctl" clusterPrefix = "pd/api/v1/cluster" ) @@ -55,23 +55,15 @@ var PDCli pd.Client func requirePDClient(cmd *cobra.Command, _ []string) error { var ( - caPath string - err error + tlsConfig *tls.Config + err error ) - caPath, err = cmd.Flags().GetString("cacert") - if err == nil && len(caPath) != 0 { - var certPath, keyPath string - certPath, err = cmd.Flags().GetString("cert") - if err != nil { - return err - } - keyPath, err = cmd.Flags().GetString("key") - if err != nil { - return err - } - return initNewPDClientWithTLS(cmd, caPath, certPath, keyPath) + tlsConfig, err = parseTLSConfig(cmd) + if err != nil { + return err } - return initNewPDClient(cmd) + + return initNewPDClient(cmd, pd.WithTLSConfig(tlsConfig)) } // shouldInitPDClient checks whether we should create a new PD client according to the cluster information. @@ -107,53 +99,45 @@ func initNewPDClient(cmd *cobra.Command, opts ...pd.ClientOption) error { if PDCli != nil { PDCli.Close() } - PDCli = pd.NewClient(pdControlCallerID, getEndpoints(cmd), opts...) - return nil -} - -func initNewPDClientWithTLS(cmd *cobra.Command, caPath, certPath, keyPath string) error { - tlsConfig, err := initTLSConfig(caPath, certPath, keyPath) - if err != nil { - return err - } - initNewPDClient(cmd, pd.WithTLSConfig(tlsConfig)) + PDCli = pd.NewClient(PDControlCallerID, getEndpoints(cmd), opts...).WithCallerID(PDControlCallerID) return nil } // TODO: replace dialClient with the PD HTTP client completely. 
var dialClient = &http.Client{ - Transport: apiutil.NewCallerIDRoundTripper(http.DefaultTransport, pdControlCallerID), + Transport: apiutil.NewCallerIDRoundTripper(http.DefaultTransport, PDControlCallerID), } -// RequireHTTPSClient creates a HTTPS client if the related flags are set -func RequireHTTPSClient(cmd *cobra.Command, _ []string) error { +func parseTLSConfig(cmd *cobra.Command) (*tls.Config, error) { caPath, err := cmd.Flags().GetString("cacert") - if err == nil && len(caPath) != 0 { - certPath, err := cmd.Flags().GetString("cert") - if err != nil { - return err - } - keyPath, err := cmd.Flags().GetString("key") - if err != nil { - return err - } - err = initHTTPSClient(caPath, certPath, keyPath) - if err != nil { - cmd.Println(err) - return err - } + if err != nil || len(caPath) == 0 { + return nil, err + } + certPath, err := cmd.Flags().GetString("cert") + if err != nil { + return nil, err + } + keyPath, err := cmd.Flags().GetString("key") + if err != nil { + return nil, err } - return nil -} - -func initHTTPSClient(caPath, certPath, keyPath string) error { tlsConfig, err := initTLSConfig(caPath, certPath, keyPath) if err != nil { + return nil, err + } + + return tlsConfig, nil +} + +// RequireHTTPSClient creates a HTTPS client if the related flags are set +func RequireHTTPSClient(cmd *cobra.Command, _ []string) error { + tlsConfig, err := parseTLSConfig(cmd) + if err != nil || tlsConfig == nil { return err } dialClient = &http.Client{ Transport: apiutil.NewCallerIDRoundTripper( - &http.Transport{TLSClientConfig: tlsConfig}, pdControlCallerID), + &http.Transport{TLSClientConfig: tlsConfig}, PDControlCallerID), } return nil } diff --git a/tools/pd-ctl/pdctl/command/global_test.go b/tools/pd-ctl/pdctl/command/global_test.go new file mode 100644 index 00000000000..86eb4366d04 --- /dev/null +++ b/tools/pd-ctl/pdctl/command/global_test.go @@ -0,0 +1,58 @@ +// Copyright 2024 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+package command + +import ( + "os" + "os/exec" + "testing" + + "github.com/spf13/cobra" + "github.com/stretchr/testify/require" +) + +func TestParseTLSConfig(t *testing.T) { + re := require.New(t) + + rootCmd := &cobra.Command{ + Use: "pd-ctl", + Short: "Placement Driver control", + SilenceErrors: true, + } + certPath := "../../tests/cert" + rootCmd.Flags().String("cacert", certPath+"/ca.pem", "path of file that contains list of trusted SSL CAs") + rootCmd.Flags().String("cert", certPath+"/client.pem", "path of file that contains X509 certificate in PEM format") + rootCmd.Flags().String("key", certPath+"/client-key.pem", "path of file that contains X509 key in PEM format") + + // generate certs + if err := os.Mkdir(certPath, 0755); err != nil { + t.Fatal(err) + } + certScript := "../../tests/cert_opt.sh" + if err := exec.Command(certScript, "generate", certPath).Run(); err != nil { + t.Fatal(err) + } + defer func() { + if err := exec.Command(certScript, "cleanup", certPath).Run(); err != nil { + t.Fatal(err) + } + if err := os.RemoveAll(certPath); err != nil { + t.Fatal(err) + } + }() + + tlsConfig, err := parseTLSConfig(rootCmd) + re.NoError(err) + re.NotNil(tlsConfig) +} diff --git a/tools/pd-ctl/pdctl/command/health_command.go b/tools/pd-ctl/pdctl/command/health_command.go index 50ac7763d28..a10ee118397 100644 --- a/tools/pd-ctl/pdctl/command/health_command.go +++ b/tools/pd-ctl/pdctl/command/health_command.go @@ -15,30 +15,25 @@ package command import ( - "net/http" - "github.com/spf13/cobra" ) -var ( - healthPrefix = "pd/api/v1/health" -) - // NewHealthCommand return a health subcommand of rootCmd func NewHealthCommand() *cobra.Command { m := &cobra.Command{ - Use: "health", - Short: "show all node's health information of the pd cluster", - Run: showHealthCommandFunc, + Use: "health", + Short: "show all nodes' health information of the PD cluster", + PersistentPreRunE: requirePDClient, + Run: showHealthCommandFunc, } return m } func showHealthCommandFunc(cmd *cobra.Command, _ []string) { - r, err := doRequest(cmd, healthPrefix, http.MethodGet, http.Header{}) + health, err := PDCli.GetHealthStatus(cmd.Context()) if err != nil { cmd.Println(err) return } - cmd.Println(r) + jsonPrint(cmd, health) } diff --git a/tools/pd-ctl/pdctl/ctl.go b/tools/pd-ctl/pdctl/ctl.go index f8eaff5e76e..fbacd65dc53 100644 --- a/tools/pd-ctl/pdctl/ctl.go +++ b/tools/pd-ctl/pdctl/ctl.go @@ -30,6 +30,7 @@ import ( func init() { cobra.EnablePrefixMatching = true + cobra.EnableTraverseRunHooks = true } // GetRootCmd is exposed for integration tests. But it can be embedded into another suite, too.
diff --git a/tools/pd-ctl/tests/global_test.go b/tools/pd-ctl/tests/global_test.go index f4f55e2af89..766e357088e 100644 --- a/tools/pd-ctl/tests/global_test.go +++ b/tools/pd-ctl/tests/global_test.go @@ -16,33 +16,39 @@ package tests import ( "context" + "encoding/json" "fmt" "net/http" "testing" - "github.com/pingcap/log" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/stretchr/testify/require" "github.com/tikv/pd/pkg/utils/apiutil" "github.com/tikv/pd/pkg/utils/assertutil" "github.com/tikv/pd/pkg/utils/testutil" "github.com/tikv/pd/server" cmd "github.com/tikv/pd/tools/pd-ctl/pdctl" - "go.uber.org/zap" + "github.com/tikv/pd/tools/pd-ctl/pdctl/command" ) -const pdControlCallerID = "pd-ctl" - func TestSendAndGetComponent(t *testing.T) { re := require.New(t) handler := func(context.Context, *server.Server) (http.Handler, apiutil.APIServiceGroup, error) { mux := http.NewServeMux() - mux.HandleFunc("/pd/api/v1/health", func(w http.ResponseWriter, r *http.Request) { + // check pd http sdk api + mux.HandleFunc("/pd/api/v1/cluster", func(w http.ResponseWriter, r *http.Request) { + callerID := apiutil.GetCallerIDOnHTTP(r) + re.Equal(command.PDControlCallerID, callerID) + cluster := &metapb.Cluster{Id: 1} + clusterBytes, err := json.Marshal(cluster) + re.NoError(err) + w.Write(clusterBytes) + }) + // check http client api + // TODO: remove this comment after replacing dialClient with the PD HTTP client completely. + mux.HandleFunc("/pd/api/v1/stores", func(w http.ResponseWriter, r *http.Request) { callerID := apiutil.GetCallerIDOnHTTP(r) - for k := range r.Header { - log.Info("header", zap.String("key", k)) - } - log.Info("caller id", zap.String("caller-id", callerID)) - re.Equal(pdControlCallerID, callerID) + re.Equal(command.PDControlCallerID, callerID) fmt.Fprint(w, callerID) }) info := apiutil.APIServiceGroup{ @@ -64,8 +70,15 @@ func TestSendAndGetComponent(t *testing.T) { }() cmd := cmd.GetRootCmd() - args := []string{"-u", pdAddr, "health"} + args := []string{"-u", pdAddr, "cluster"} output, err := ExecuteCommand(cmd, args...) re.NoError(err) - re.Equal(fmt.Sprintf("%s\n", pdControlCallerID), string(output)) + re.Equal(fmt.Sprintf("%s\n", `{ + "id": 1 +}`), string(output)) + + args = []string{"-u", pdAddr, "store"} + output, err = ExecuteCommand(cmd, args...) 
+ re.NoError(err) + re.Equal(fmt.Sprintf("%s\n", command.PDControlCallerID), string(output)) } diff --git a/tools/pd-ctl/tests/health/health_test.go b/tools/pd-ctl/tests/health/health_test.go index 9150a56c91b..f1d3c7cfbf1 100644 --- a/tools/pd-ctl/tests/health/health_test.go +++ b/tools/pd-ctl/tests/health/health_test.go @@ -17,14 +17,21 @@ package health_test import ( "context" "encoding/json" + "os" + "os/exec" + "path/filepath" + "strings" "testing" "github.com/stretchr/testify/require" + "github.com/tikv/pd/pkg/utils/grpcutil" "github.com/tikv/pd/server/api" "github.com/tikv/pd/server/cluster" + "github.com/tikv/pd/server/config" pdTests "github.com/tikv/pd/tests" ctl "github.com/tikv/pd/tools/pd-ctl/pdctl" "github.com/tikv/pd/tools/pd-ctl/tests" + "go.etcd.io/etcd/pkg/transport" ) func TestHealth(t *testing.T) { @@ -68,3 +75,80 @@ func TestHealth(t *testing.T) { re.NoError(json.Unmarshal(output, &h)) re.Equal(healths, h) } + +func TestHealthTLS(t *testing.T) { + re := require.New(t) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + certPath := "../cert" + certScript := "../cert_opt.sh" + // generate certs + if err := os.Mkdir(certPath, 0755); err != nil { + t.Fatal(err) + } + if err := exec.Command(certScript, "generate", certPath).Run(); err != nil { + t.Fatal(err) + } + defer func() { + if err := exec.Command(certScript, "cleanup", certPath).Run(); err != nil { + t.Fatal(err) + } + if err := os.RemoveAll(certPath); err != nil { + t.Fatal(err) + } + }() + + tlsInfo := transport.TLSInfo{ + KeyFile: filepath.Join(certPath, "pd-server-key.pem"), + CertFile: filepath.Join(certPath, "pd-server.pem"), + TrustedCAFile: filepath.Join(certPath, "ca.pem"), + } + tc, err := pdTests.NewTestCluster(ctx, 1, func(conf *config.Config, _ string) { + conf.Security.TLSConfig = grpcutil.TLSConfig{ + KeyPath: tlsInfo.KeyFile, + CertPath: tlsInfo.CertFile, + CAPath: tlsInfo.TrustedCAFile, + } + conf.AdvertiseClientUrls = strings.ReplaceAll(conf.AdvertiseClientUrls, "http", "https") + conf.ClientUrls = strings.ReplaceAll(conf.ClientUrls, "http", "https") + conf.AdvertisePeerUrls = strings.ReplaceAll(conf.AdvertisePeerUrls, "http", "https") + conf.PeerUrls = strings.ReplaceAll(conf.PeerUrls, "http", "https") + conf.InitialCluster = strings.ReplaceAll(conf.InitialCluster, "http", "https") + }) + re.NoError(err) + defer tc.Destroy() + err = tc.RunInitialServers() + re.NoError(err) + tc.WaitLeader() + cmd := ctl.GetRootCmd() + + client := tc.GetEtcdClient() + members, err := cluster.GetMembers(client) + re.NoError(err) + healthMembers := cluster.CheckHealth(tc.GetHTTPClient(), members) + healths := []api.Health{} + for _, member := range members { + h := api.Health{ + Name: member.Name, + MemberID: member.MemberId, + ClientUrls: member.ClientUrls, + Health: false, + } + if _, ok := healthMembers[member.GetMemberId()]; ok { + h.Health = true + } + healths = append(healths, h) + } + + pdAddr := tc.GetConfig().GetClientURL() + pdAddr = strings.ReplaceAll(pdAddr, "http", "https") + args := []string{"-u", pdAddr, "health", + "--cacert=../cert/ca.pem", + "--cert=../cert/client.pem", + "--key=../cert/client-key.pem"} + output, err := tests.ExecuteCommand(cmd, args...) 
+ re.NoError(err) + h := make([]api.Health, len(healths)) + re.NoError(json.Unmarshal(output, &h)) + re.Equal(healths, h) +} diff --git a/tools/pd-ctl/tests/hot/hot_test.go b/tools/pd-ctl/tests/hot/hot_test.go index 7661704aa41..f65b811b36a 100644 --- a/tools/pd-ctl/tests/hot/hot_test.go +++ b/tools/pd-ctl/tests/hot/hot_test.go @@ -188,11 +188,10 @@ func (suite *hotTestSuite) checkHot(cluster *pdTests.TestCluster) { Id: 100 + regionIDCounter, StoreId: hotStoreID, } - peerInfo := core.NewPeerInfo(leader, loads, reportInterval) region := core.NewRegionInfo(&metapb.Region{ Id: hotRegionID, }, leader) - hotStat.CheckReadAsync(statistics.NewCheckPeerTask(peerInfo, region)) + hotStat.CheckReadAsync(statistics.NewCheckReadPeerTask(region, []*metapb.Peer{leader}, loads, reportInterval)) testutil.Eventually(re, func() bool { hotPeerStat := getHotPeerStat(utils.Read, hotRegionID, hotStoreID) return hotPeerStat != nil diff --git a/tools/pd-simulator/main.go b/tools/pd-simulator/main.go index 73f4a0bba12..05763cc93b8 100644 --- a/tools/pd-simulator/main.go +++ b/tools/pd-simulator/main.go @@ -17,8 +17,6 @@ package main import ( "context" "fmt" - "net/http" - "net/http/pprof" "os" "os/signal" "syscall" @@ -26,8 +24,8 @@ import ( "github.com/BurntSushi/toml" "github.com/pingcap/log" - "github.com/prometheus/client_golang/prometheus/promhttp" flag "github.com/spf13/pflag" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/schedule/schedulers" "github.com/tikv/pd/pkg/statistics" "github.com/tikv/pd/pkg/utils/logutil" @@ -38,21 +36,19 @@ import ( "github.com/tikv/pd/tools/pd-analysis/analysis" "github.com/tikv/pd/tools/pd-simulator/simulator" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) var ( - pdAddr = flag.String("pd-endpoints", "", "pd address") - configFile = flag.String("config", "conf/simconfig.toml", "config file") - caseName = flag.String("case", "", "case name") - serverLogLevel = flag.String("serverLog", "info", "pd server log level") - simLogLevel = flag.String("simLog", "info", "simulator log level") - simLogFile = flag.String("log-file", "", "simulator log file") - regionNum = flag.Int("regionNum", 0, "regionNum of one store") - storeNum = flag.Int("storeNum", 0, "storeNum") - enableTransferRegionCounter = flag.Bool("enableTransferRegionCounter", false, "enableTransferRegionCounter") - statusAddress = flag.String("status-addr", "0.0.0.0:20180", "status address") + pdAddr = flag.String("pd-endpoints", "", "pd address") + configFile = flag.String("config", "conf/simconfig.toml", "config file") + caseName = flag.String("case", "", "case name") + serverLogLevel = flag.String("serverLog", "info", "pd server log level") + simLogLevel = flag.String("simLog", "info", "simulator log level") + simLogFile = flag.String("log-file", "", "simulator log file") + statusAddress = flag.String("status-addr", "0.0.0.0:20180", "status address") ) func main() { @@ -63,14 +59,12 @@ func main() { flag.Parse() simutil.InitLogger(*simLogLevel, *simLogFile) - simutil.InitCaseConfig(*storeNum, *regionNum, *enableTransferRegionCounter) statistics.Denoising = false - if simutil.CaseConfigure.EnableTransferRegionCounter { - analysis.GetTransferCounter().Init(simutil.CaseConfigure.StoreNum, simutil.CaseConfigure.RegionNum) - } - schedulers.Register() // register schedulers, which is needed by simConfig.Adjust - simConfig := 
simulator.NewSimConfig(*serverLogLevel) + simConfig := sc.NewSimConfig(*serverLogLevel) + if simConfig.EnableTransferRegionCounter { + analysis.GetTransferCounter().Init(simConfig.TotalStore, simConfig.TotalRegion) + } var meta toml.MetaData var err error if *configFile != "" { @@ -97,10 +91,10 @@ func main() { } } -func run(simCase string, simConfig *simulator.SimConfig) { +func run(simCase string, simConfig *sc.SimConfig) { if *pdAddr != "" { - go runHTTPServer() - simStart(*pdAddr, simCase, simConfig) + simulator.PDHTTPClient = pdHttp.NewClient("pd-simulator", []string{*pdAddr}) + simStart(*pdAddr, *statusAddress, simCase, simConfig) } else { local, clean := NewSingleServer(context.Background(), simConfig) err := local.Run() @@ -113,30 +107,13 @@ func run(simCase string, simConfig *simulator.SimConfig) { } time.Sleep(100 * time.Millisecond) } - simStart(local.GetAddr(), simCase, simConfig, clean) + simulator.PDHTTPClient = pdHttp.NewClient("pd-simulator", []string{local.GetAddr()}) + simStart(local.GetAddr(), "", simCase, simConfig, clean) } } -func runHTTPServer() { - http.Handle("/metrics", promhttp.Handler()) - // profile API - http.HandleFunc("/pprof/profile", pprof.Profile) - http.HandleFunc("/pprof/trace", pprof.Trace) - http.HandleFunc("/pprof/symbol", pprof.Symbol) - http.Handle("/pprof/heap", pprof.Handler("heap")) - http.Handle("/pprof/mutex", pprof.Handler("mutex")) - http.Handle("/pprof/allocs", pprof.Handler("allocs")) - http.Handle("/pprof/block", pprof.Handler("block")) - http.Handle("/pprof/goroutine", pprof.Handler("goroutine")) - server := &http.Server{ - Addr: *statusAddress, - ReadHeaderTimeout: 3 * time.Second, - } - server.ListenAndServe() -} - // NewSingleServer creates a pd server for simulator. -func NewSingleServer(ctx context.Context, simConfig *simulator.SimConfig) (*server.Server, testutil.CleanupFunc) { +func NewSingleServer(ctx context.Context, simConfig *sc.SimConfig) (*server.Server, testutil.CleanupFunc) { err := logutil.SetupLogger(simConfig.ServerConfig.Log, &simConfig.ServerConfig.Logger, &simConfig.ServerConfig.LogProps) if err == nil { log.ReplaceGlobals(simConfig.ServerConfig.Logger, simConfig.ServerConfig.LogProps) @@ -161,9 +138,9 @@ func cleanServer(cfg *config.Config) { os.RemoveAll(cfg.DataDir) } -func simStart(pdAddr string, simCase string, simConfig *simulator.SimConfig, clean ...testutil.CleanupFunc) { +func simStart(pdAddr, statusAddress string, simCase string, simConfig *sc.SimConfig, clean ...testutil.CleanupFunc) { start := time.Now() - driver, err := simulator.NewDriver(pdAddr, simCase, simConfig) + driver, err := simulator.NewDriver(pdAddr, statusAddress, simCase, simConfig) if err != nil { simutil.Logger.Fatal("create driver error", zap.Error(err)) } @@ -177,6 +154,8 @@ func simStart(pdAddr string, simCase string, simConfig *simulator.SimConfig, cle tick := time.NewTicker(tickInterval) defer tick.Stop() sc := make(chan os.Signal, 1) + // halt scheduling + simulator.ChooseToHaltPDSchedule(true) signal.Notify(sc, syscall.SIGHUP, syscall.SIGINT, @@ -209,6 +188,9 @@ EXIT: analysis.GetTransferCounter().PrintResult() } + if simulator.PDHTTPClient != nil { + simulator.PDHTTPClient.Close() + } if simResult != "OK" { os.Exit(1) } diff --git a/tools/pd-simulator/simulator/cases/add_nodes.go b/tools/pd-simulator/simulator/cases/add_nodes.go deleted file mode 100644 index 241b34a9473..00000000000 --- a/tools/pd-simulator/simulator/cases/add_nodes.go +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2017 TiKV Project Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package cases - -import ( - "math/rand" - - "github.com/docker/go-units" - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core" - "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" -) - -func newAddNodes() *Case { - var simCase Case - - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyRatio := rand.Float64() // the ratio of noEmpty store to total store - noEmptyStoreNum := getNoEmptyStoreNum(storeNum, noEmptyRatio) - - for i := 1; i <= storeNum; i++ { - simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), - Status: metapb.StoreState_Up, - }) - } - - for i := 0; i < regionNum*storeNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+1)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+2)%noEmptyStoreNum + 1}, - } - simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), - Peers: peers, - Leader: peers[0], - Size: 96 * units.MiB, - Keys: 960000, - }) - } - - threshold := 0.05 - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := true - leaderCounts := make([]int, 0, storeNum) - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { - leaderCount := regions.GetStoreLeaderCount(uint64(i)) - regionCount := regions.GetStoreRegionCount(uint64(i)) - leaderCounts = append(leaderCounts, leaderCount) - regionCounts = append(regionCounts, regionCount) - res = res && leaderAndRegionIsUniform(leaderCount, regionCount, regionNum, threshold) - } - - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts), zap.Ints("region", regionCounts)) - return res - } - return &simCase -} diff --git a/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go b/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go deleted file mode 100644 index 59b0b54e1ca..00000000000 --- a/tools/pd-simulator/simulator/cases/add_nodes_dynamic.go +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2018 TiKV Project Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package cases - -import ( - "math/rand" - - "github.com/docker/go-units" - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core" - "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" -) - -func newAddNodesDynamic() *Case { - var simCase Case - - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyRatio := rand.Float64() // the ratio of noEmpty store to total store - noEmptyStoreNum := getNoEmptyStoreNum(storeNum, noEmptyRatio) - - for i := 1; i <= int(noEmptyStoreNum); i++ { - simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), - Status: metapb.StoreState_Up, - }) - } - - var ids []uint64 - for i := 1; i <= storeNum-int(noEmptyStoreNum); i++ { - ids = append(ids, IDAllocator.nextID()) - } - - for i := 0; i < regionNum*storeNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+1)%noEmptyStoreNum + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64(i+2)%noEmptyStoreNum + 1}, - } - simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), - Peers: peers, - Leader: peers[0], - Size: 96 * units.MiB, - Keys: 960000, - }) - } - - numNodes := int(noEmptyStoreNum) - e := &AddNodesDescriptor{} - e.Step = func(tick int64) uint64 { - if tick%100 == 0 && numNodes < storeNum { - numNodes++ - nodeID := ids[0] - ids = append(ids[:0], ids[1:]...) - return nodeID - } - return 0 - } - simCase.Events = []EventDescriptor{e} - - threshold := 0.05 - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := numNodes == storeNum - leaderCounts := make([]int, 0, numNodes) - regionCounts := make([]int, 0, numNodes) - for i := 1; i <= numNodes; i++ { - leaderCount := regions.GetStoreLeaderCount(uint64(i)) - regionCount := regions.GetStoreRegionCount(uint64(i)) - leaderCounts = append(leaderCounts, leaderCount) - regionCounts = append(regionCounts, regionCount) - res = res && leaderAndRegionIsUniform(leaderCount, regionCount, regionNum, threshold) - } - - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts), zap.Ints("region", regionCounts)) - return res - } - return &simCase -} diff --git a/tools/pd-simulator/simulator/cases/balance_leader.go b/tools/pd-simulator/simulator/cases/balance_leader.go index bbc7ce97f68..fd9028bc91a 100644 --- a/tools/pd-simulator/simulator/cases/balance_leader.go +++ b/tools/pd-simulator/simulator/cases/balance_leader.go @@ -18,31 +18,39 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newBalanceLeader() *Case { +func newBalanceLeader(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeNum)}, - {Id: 
IDAllocator.nextID(), StoreId: uint64((i+1)%(storeNum-1)) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%(storeNum-1)) + 1}, + leaderStoreID := simCase.Stores[totalStore-1].ID + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + peers = append(peers, &metapb.Peer{ + Id: simutil.IDAllocator.NextID(), + StoreId: leaderStoreID, + }) + for j := 1; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: simutil.IDAllocator.NextID(), + StoreId: uint64((i+j)%(totalStore-1) + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 96 * units.MiB, @@ -50,17 +58,14 @@ func newBalanceLeader() *Case { }) } - threshold := 0.05 simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := true - leaderCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { + for i := 1; i <= totalStore; i++ { leaderCount := regions.GetStoreLeaderCount(uint64(i)) - leaderCounts = append(leaderCounts, leaderCount) - res = res && isUniform(leaderCount, regionNum/3, threshold) + if !isUniform(leaderCount, totalRegion/totalStore) { + return false + } } - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/balance_region.go b/tools/pd-simulator/simulator/cases/balance_region.go index 3b0c46f1670..82a7ac2d704 100644 --- a/tools/pd-simulator/simulator/cases/balance_region.go +++ b/tools/pd-simulator/simulator/cases/balance_region.go @@ -19,23 +19,21 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newRedundantBalanceRegion() *Case { +func newRedundantBalanceRegion(config *sc.SimConfig) *Case { var simCase Case - storeNum := simutil.CaseConfigure.StoreNum - regionNum := simutil.CaseConfigure.RegionNum - if storeNum == 0 || regionNum == 0 { - storeNum, regionNum = 6, 4000 - } + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) - for i := 0; i < storeNum; i++ { + for i := 0; i < totalStore; i++ { s := &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, } if i%2 == 1 { @@ -44,43 +42,41 @@ func newRedundantBalanceRegion() *Case { simCase.Stores = append(simCase.Stores, s) } - for i := 0; i < regionNum; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i%storeNum + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%storeNum + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%storeNum + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: simutil.IDAllocator.NextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], }) } - storesLastUpdateTime := make([]int64, storeNum+1) - storeLastAvailable := make([]uint64, storeNum+1) + storesLastUpdateTime := make([]int64, totalStore+1) + storeLastAvailable := make([]uint64, totalStore+1) simCase.Checker = func(_ *core.RegionsInfo, stats 
[]info.StoreStats) bool { - res := true curTime := time.Now().Unix() - storesAvailable := make([]uint64, 0, storeNum+1) - for i := 1; i <= storeNum; i++ { + for i := 1; i <= totalStore; i++ { available := stats[i].GetAvailable() - storesAvailable = append(storesAvailable, available) if curTime-storesLastUpdateTime[i] > 60 { if storeLastAvailable[i] != available { - res = false + return false } if stats[i].ToCompactionSize != 0 { - res = false + return false } storesLastUpdateTime[i] = curTime storeLastAvailable[i] = available } else { - res = false + return false } } - simutil.Logger.Info("current counts", zap.Uint64s("storesAvailable", storesAvailable)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/cases.go b/tools/pd-simulator/simulator/cases/cases.go index 0a8967a8d86..c4e2f999978 100644 --- a/tools/pd-simulator/simulator/cases/cases.go +++ b/tools/pd-simulator/simulator/cases/cases.go @@ -16,11 +16,11 @@ package cases import ( "github.com/pingcap/kvproto/pkg/metapb" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" - "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" ) // Store is used to simulate tikv. @@ -57,7 +57,7 @@ type Case struct { TableNumber int Checker CheckerFunc // To check the schedule is finished. - Rules []*placement.Rule + Rules []*pdHttp.Rule Labels typeutil.StringSlice } @@ -86,12 +86,9 @@ func (a *idAllocator) GetID() uint64 { var IDAllocator idAllocator // CaseMap is a mapping of the cases to the their corresponding initialize functions. -var CaseMap = map[string]func() *Case{ +var CaseMap = map[string]func(*config.SimConfig) *Case{ "balance-leader": newBalanceLeader, "redundant-balance-region": newRedundantBalanceRegion, - "add-nodes": newAddNodes, - "add-nodes-dynamic": newAddNodesDynamic, - "delete-nodes": newDeleteNodes, "region-split": newRegionSplit, "region-merge": newRegionMerge, "hot-read": newHotRead, @@ -106,43 +103,16 @@ var CaseMap = map[string]func() *Case{ } // NewCase creates a new case. 
-func NewCase(name string) *Case { +func NewCase(name string, simConfig *config.SimConfig) *Case { if f, ok := CaseMap[name]; ok { - return f() + return f(simConfig) } return nil } -func leaderAndRegionIsUniform(leaderCount, regionCount, regionNum int, threshold float64) bool { - return isUniform(leaderCount, regionNum/3, threshold) && isUniform(regionCount, regionNum, threshold) -} - -func isUniform(count, meanCount int, threshold float64) bool { +func isUniform(count, meanCount int) bool { + threshold := 0.05 maxCount := int((1.0 + threshold) * float64(meanCount)) minCount := int((1.0 - threshold) * float64(meanCount)) return minCount <= count && count <= maxCount } - -func getStoreNum() int { - storeNum := simutil.CaseConfigure.StoreNum - if storeNum < 3 { - simutil.Logger.Fatal("store num should be larger than or equal to 3") - } - return storeNum -} - -func getRegionNum() int { - regionNum := simutil.CaseConfigure.RegionNum - if regionNum <= 0 { - simutil.Logger.Fatal("region num should be larger than 0") - } - return regionNum -} - -func getNoEmptyStoreNum(storeNum int, noEmptyRatio float64) uint64 { - noEmptyStoreNum := uint64(float64(storeNum) * noEmptyRatio) - if noEmptyStoreNum < 3 || noEmptyStoreNum == uint64(storeNum) { - noEmptyStoreNum = 3 - } - return noEmptyStoreNum -} diff --git a/tools/pd-simulator/simulator/cases/delete_nodes.go b/tools/pd-simulator/simulator/cases/delete_nodes.go deleted file mode 100644 index 4ba8e5064a4..00000000000 --- a/tools/pd-simulator/simulator/cases/delete_nodes.go +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2018 TiKV Project Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package cases - -import ( - "math/rand" - - "github.com/docker/go-units" - "github.com/pingcap/kvproto/pkg/metapb" - "github.com/tikv/pd/pkg/core" - "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" -) - -func newDeleteNodes() *Case { - var simCase Case - - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyStoreNum := storeNum - 1 - for i := 1; i <= storeNum; i++ { - simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), - Status: metapb.StoreState_Up, - }) - } - - for i := 0; i < regionNum*storeNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(i%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%storeNum) + 1}, - } - simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), - Peers: peers, - Leader: peers[0], - Size: 96 * units.MiB, - Keys: 960000, - }) - } - - ids := make([]uint64, 0, len(simCase.Stores)) - for _, store := range simCase.Stores { - ids = append(ids, store.ID) - } - - numNodes := storeNum - e := &DeleteNodesDescriptor{} - e.Step = func(tick int64) uint64 { - if numNodes > noEmptyStoreNum && tick%100 == 0 { - idx := rand.Intn(numNodes) - numNodes-- - nodeID := ids[idx] - ids = append(ids[:idx], ids[idx+1:]...) - return nodeID - } - return 0 - } - simCase.Events = []EventDescriptor{e} - - threshold := 0.05 - simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := numNodes == noEmptyStoreNum - leaderCounts := make([]int, 0, numNodes) - regionCounts := make([]int, 0, numNodes) - for _, i := range ids { - leaderCount := regions.GetStoreLeaderCount(i) - regionCount := regions.GetStoreRegionCount(i) - leaderCounts = append(leaderCounts, leaderCount) - regionCounts = append(regionCounts, regionCount) - res = res && leaderAndRegionIsUniform(leaderCount, regionCount, regionNum*storeNum/noEmptyStoreNum, threshold) - } - - simutil.Logger.Info("current counts", zap.Ints("leader", leaderCounts), zap.Ints("region", regionCounts)) - return res - } - return &simCase -} diff --git a/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go b/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go index 7fa50e56197..09037136608 100644 --- a/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go +++ b/tools/pd-simulator/simulator/cases/diagnose_label_isolation.go @@ -21,12 +21,13 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) -func newLabelNotMatch1() *Case { +func newLabelNotMatch1(_ *sc.SimConfig) *Case { var simCase Case simCase.Labels = []string{"host"} @@ -88,7 +89,7 @@ func newLabelNotMatch1() *Case { return &simCase } -func newLabelIsolation1() *Case { +func newLabelIsolation1(_ *sc.SimConfig) *Case { var simCase Case simCase.Labels = []string{"host"} @@ -154,7 +155,7 @@ func newLabelIsolation1() *Case { return &simCase } -func newLabelIsolation2() *Case { +func newLabelIsolation2(_ *sc.SimConfig) *Case { var simCase Case simCase.Labels = []string{"dc", "zone", "host"} diff --git a/tools/pd-simulator/simulator/cases/diagnose_rule.go b/tools/pd-simulator/simulator/cases/diagnose_rule.go index 15c5942d810..2cd11b9624a 100644 --- 
a/tools/pd-simulator/simulator/cases/diagnose_rule.go +++ b/tools/pd-simulator/simulator/cases/diagnose_rule.go @@ -19,25 +19,27 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/schedule/placement" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) -func newRule1() *Case { +func newRule1(_ *sc.SimConfig) *Case { var simCase Case - simCase.Rules = make([]*placement.Rule, 0) - simCase.Rules = append(simCase.Rules, &placement.Rule{ + simCase.Rules = make([]*pdHttp.Rule, 0) + simCase.Rules = append(simCase.Rules, &pdHttp.Rule{ GroupID: "test1", ID: "test1", StartKeyHex: "", EndKeyHex: "", - Role: placement.Learner, + Role: pdHttp.Learner, Count: 1, - LabelConstraints: []placement.LabelConstraint{ + LabelConstraints: []pdHttp.LabelConstraint{ { Key: "region", Op: "in", @@ -45,14 +47,14 @@ func newRule1() *Case { }, }, LocationLabels: []string{"host"}, - }, &placement.Rule{ + }, &pdHttp.Rule{ GroupID: placement.DefaultGroupID, ID: placement.DefaultRuleID, StartKeyHex: "", EndKeyHex: "", - Role: placement.Voter, + Role: pdHttp.Voter, Count: 5, - LabelConstraints: []placement.LabelConstraint{ + LabelConstraints: []pdHttp.LabelConstraint{ { Key: "region", Op: "in", @@ -126,19 +128,19 @@ func newRule1() *Case { return &simCase } -func newRule2() *Case { +func newRule2(_ *sc.SimConfig) *Case { var simCase Case - simCase.Rules = make([]*placement.Rule, 0) + simCase.Rules = make([]*pdHttp.Rule, 0) simCase.Rules = append(simCase.Rules, - &placement.Rule{ + &pdHttp.Rule{ GroupID: "test1", ID: "test1", StartKeyHex: "", EndKeyHex: "", - Role: placement.Leader, + Role: pdHttp.Leader, Count: 1, - LabelConstraints: []placement.LabelConstraint{ + LabelConstraints: []pdHttp.LabelConstraint{ { Key: "region", Op: "in", diff --git a/tools/pd-simulator/simulator/cases/hot_read.go b/tools/pd-simulator/simulator/cases/hot_read.go index d4ec6831d95..d154886b0a4 100644 --- a/tools/pd-simulator/simulator/cases/hot_read.go +++ b/tools/pd-simulator/simulator/cases/hot_read.go @@ -15,38 +15,38 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newHotRead() *Case { +func newHotRead(config *sc.SimConfig) *Case { var simCase Case - - storeNum, regionNum := getStoreNum(), getRegionNum() + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) // Initialize the cluster - for i := 1; i <= storeNum; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - storeIDs := rand.Perm(storeNum) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = 
append(peers, &metapb.Peer{ + Id: simutil.IDAllocator.NextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 96 * units.MiB, @@ -56,7 +56,7 @@ func newHotRead() *Case { // Events description // select regions on store 1 as hot read regions. - selectRegionNum := 4 * storeNum + selectRegionNum := 4 * totalStore readFlow := make(map[uint64]int64, selectRegionNum) for _, r := range simCase.Regions { if r.Leader.GetStoreId() == 1 { @@ -73,12 +73,11 @@ func newHotRead() *Case { simCase.Events = []EventDescriptor{e} // Checker description simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - leaderCount := make([]int, storeNum) + leaderCount := make([]int, totalStore) for id := range readFlow { leaderStore := regions.GetRegion(id).GetLeader().GetStoreId() leaderCount[int(leaderStore-1)]++ } - simutil.Logger.Info("current hot region counts", zap.Reflect("hot-region", leaderCount)) // check count diff < 2. var min, max int diff --git a/tools/pd-simulator/simulator/cases/hot_write.go b/tools/pd-simulator/simulator/cases/hot_write.go index 8428afa75b5..e73ca6f3ce3 100644 --- a/tools/pd-simulator/simulator/cases/hot_write.go +++ b/tools/pd-simulator/simulator/cases/hot_write.go @@ -15,37 +15,38 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newHotWrite() *Case { +func newHotWrite(config *sc.SimConfig) *Case { var simCase Case + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) - storeNum, regionNum := getStoreNum(), getRegionNum() // Initialize the cluster - for i := 1; i <= storeNum; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - storeIDs := rand.Perm(storeNum) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: simutil.IDAllocator.NextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 96 * units.MiB, @@ -55,7 +56,7 @@ func newHotWrite() *Case { // Events description // select regions on store 1 as hot write regions. 
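The hot-read and hot-write generators above (and the other refactored cases below) drop rand.Perm in favor of a deterministic layout: peer j of region i is placed on store ((i+j) mod totalStore) + 1, with peers[0] as the leader. A minimal, self-contained sketch of that layout, using assumed values of 3 replicas over 3 stores:

    package main

    import "fmt"

    // peerStores mirrors the (i+j)%totalStore+1 placement used by the refactored cases:
    // peers are spread evenly across stores and region i always leads on store (i%totalStore)+1.
    func peerStores(regionIdx, replica, totalStore int) []uint64 {
        stores := make([]uint64, 0, replica)
        for j := 0; j < replica; j++ {
            stores = append(stores, uint64((regionIdx+j)%totalStore+1))
        }
        return stores
    }

    func main() {
        // region 0 -> [1 2 3], region 1 -> [2 3 1], region 2 -> [3 1 2], region 3 -> [1 2 3]
        for i := 0; i < 4; i++ {
            fmt.Printf("region %d -> %v\n", i, peerStores(i, 3, 3))
        }
    }
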
- selectStoreNum := storeNum + selectStoreNum := totalStore writeFlow := make(map[uint64]int64, selectStoreNum) for _, r := range simCase.Regions { if r.Leader.GetStoreId() == 1 { @@ -74,8 +75,8 @@ func newHotWrite() *Case { // Checker description simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - leaderCount := make([]int, storeNum) - peerCount := make([]int, storeNum) + leaderCount := make([]int, totalStore) + peerCount := make([]int, totalStore) for id := range writeFlow { region := regions.GetRegion(id) leaderCount[int(region.GetLeader().GetStoreId()-1)]++ @@ -83,7 +84,6 @@ func newHotWrite() *Case { peerCount[int(p.GetStoreId()-1)]++ } } - simutil.Logger.Info("current hot region counts", zap.Reflect("leader", leaderCount), zap.Reflect("peer", peerCount)) // check count diff <= 2. var minLeader, maxLeader, minPeer, maxPeer int diff --git a/tools/pd-simulator/simulator/cases/import_data.go b/tools/pd-simulator/simulator/cases/import_data.go index 6cf3b79a736..b9f448a6cf6 100644 --- a/tools/pd-simulator/simulator/cases/import_data.go +++ b/tools/pd-simulator/simulator/cases/import_data.go @@ -17,7 +17,6 @@ package cases import ( "bytes" "fmt" - "math/rand" "os" "github.com/docker/go-units" @@ -26,27 +25,33 @@ import ( "github.com/pingcap/log" "github.com/tikv/pd/pkg/codec" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) -func newImportData() *Case { +func newImportData(config *sc.SimConfig) *Case { var simCase Case + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + // Initialize the cluster - for i := 1; i <= 10; i++ { + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: IDAllocator.nextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < getRegionNum(); i++ { - storeIDs := rand.Perm(10) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: IDAllocator.nextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ ID: IDAllocator.nextID(), @@ -65,7 +70,7 @@ func newImportData() *Case { table12 := string(codec.EncodeBytes(codec.GenerateTableKey(12))) table13 := string(codec.EncodeBytes(codec.GenerateTableKey(13))) e.Step = func(tick int64) map[string]int64 { - if tick > int64(getRegionNum())/10 { + if tick > int64(totalRegion)/10 { return nil } return map[string]int64{ @@ -141,14 +146,14 @@ func newImportData() *Case { if dev > 0.02 { simutil.Logger.Warn("Not balanced, change scheduler or store limit", zap.Float64("dev score", dev)) } - if checkCount > uint64(getRegionNum())/5 { + if checkCount > uint64(totalRegion)/5 { isEnd = true - } else if checkCount > uint64(getRegionNum())/10 { + } else if checkCount > uint64(totalRegion)/10 { isEnd = dev < 0.01 } if isEnd { - renderPlot("new_region.html", newRegionCount, int(checkCount), 0, getRegionNum()/10) - renderPlot("all_region.html", allRegionCount, int(checkCount), 28*getRegionNum()/100, getRegionNum()/3) + renderPlot("new_region.html", newRegionCount, 
int(checkCount), 0, totalRegion/10) + renderPlot("all_region.html", allRegionCount, int(checkCount), 28*totalRegion/100, totalRegion/3) } return isEnd } diff --git a/tools/pd-simulator/simulator/cases/makeup_down_replica.go b/tools/pd-simulator/simulator/cases/makeup_down_replica.go index 86c9b4cac1d..a5ee63e71a0 100644 --- a/tools/pd-simulator/simulator/cases/makeup_down_replica.go +++ b/tools/pd-simulator/simulator/cases/makeup_down_replica.go @@ -18,30 +18,35 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newMakeupDownReplicas() *Case { +func newMakeupDownReplicas(config *sc.SimConfig) *Case { var simCase Case - storeNum, regionNum := getStoreNum(), getRegionNum() - noEmptyStoreNum := storeNum - 1 - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + + noEmptyStoreNum := totalStore - 1 + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64((i)%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+1)%storeNum) + 1}, - {Id: IDAllocator.nextID(), StoreId: uint64((i+2)%storeNum) + 1}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: simutil.IDAllocator.NextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 96 * units.MiB, @@ -49,7 +54,7 @@ func newMakeupDownReplicas() *Case { }) } - numNodes := storeNum + numNodes := totalStore down := false e := &DeleteNodesDescriptor{} e.Step = func(tick int64) uint64 { @@ -65,31 +70,16 @@ func newMakeupDownReplicas() *Case { simCase.Events = []EventDescriptor{e} simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - sum := 0 - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { - regionCount := regions.GetStoreRegionCount(uint64(i)) - regionCounts = append(regionCounts, regionCount) - sum += regionCount - } - simutil.Logger.Info("current region counts", zap.Ints("region", regionCounts)) - - if down && sum < storeNum*regionNum { - // only need to print once - down = false - simutil.Logger.Error("making up replicas don't start immediately") + if !down { return false } - - res := true - threshold := 0.05 - for index, regionCount := range regionCounts { - if index == 0 { // storeId == 1 - continue + for i := 1; i <= totalStore; i++ { + peerCount := regions.GetStoreRegionCount(uint64(i)) + if isUniform(peerCount, replica*totalRegion/noEmptyStoreNum) { + return false } - res = res && isUniform(regionCount, storeNum*regionNum/noEmptyStoreNum, threshold) } - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/region_merge.go b/tools/pd-simulator/simulator/cases/region_merge.go index 3d5d57f804f..8097565d1a7 100644 --- a/tools/pd-simulator/simulator/cases/region_merge.go +++ 
b/tools/pd-simulator/simulator/cases/region_merge.go @@ -15,36 +15,37 @@ package cases import ( - "math/rand" - "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newRegionMerge() *Case { +func newRegionMerge(config *sc.SimConfig) *Case { var simCase Case - // Initialize the cluster - storeNum, regionNum := getStoreNum(), getRegionNum() - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + totalRegion := config.TotalRegion + replica := int(config.ServerConfig.Replication.MaxReplicas) + + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Status: metapb.StoreState_Up, }) } - for i := 0; i < storeNum*regionNum/3; i++ { - storeIDs := rand.Perm(storeNum) - peers := []*metapb.Peer{ - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[0] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[1] + 1)}, - {Id: IDAllocator.nextID(), StoreId: uint64(storeIDs[2] + 1)}, + for i := 0; i < totalRegion; i++ { + peers := make([]*metapb.Peer, 0, replica) + for j := 0; j < replica; j++ { + peers = append(peers, &metapb.Peer{ + Id: simutil.IDAllocator.NextID(), + StoreId: uint64((i+j)%totalStore + 1), + }) } simCase.Regions = append(simCase.Regions, Region{ - ID: IDAllocator.nextID(), + ID: simutil.IDAllocator.NextID(), Peers: peers, Leader: peers[0], Size: 10 * units.MiB, @@ -52,18 +53,13 @@ func newRegionMerge() *Case { }) } // Checker description - threshold := 0.05 mergeRatio := 4 // when max-merge-region-size is 20, per region will reach 40MB simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - sum := 0 - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { - regionCount := regions.GetStoreRegionCount(uint64(i)) - regionCounts = append(regionCounts, regionCount) - sum += regionCount + currentPeerCount := 0 + for i := 1; i <= totalStore; i++ { + currentPeerCount += regions.GetStoreRegionCount(uint64(i)) } - simutil.Logger.Info("current counts", zap.Ints("region", regionCounts), zap.Int64("average region size", regions.GetAverageRegionSize())) - return isUniform(sum, storeNum*regionNum/mergeRatio, threshold) + return isUniform(currentPeerCount, totalRegion*replica/mergeRatio) } return &simCase } diff --git a/tools/pd-simulator/simulator/cases/region_split.go b/tools/pd-simulator/simulator/cases/region_split.go index b85cd319494..7b712f4dc48 100644 --- a/tools/pd-simulator/simulator/cases/region_split.go +++ b/tools/pd-simulator/simulator/cases/region_split.go @@ -18,16 +18,15 @@ import ( "github.com/docker/go-units" "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/pkg/core" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" - "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" - "go.uber.org/zap" ) -func newRegionSplit() *Case { +func newRegionSplit(config *sc.SimConfig) *Case { var simCase Case - // Initialize the cluster - storeNum := getStoreNum() - for i := 1; i <= storeNum; i++ { + totalStore := config.TotalStore + + for i := 0; i < totalStore; i++ { simCase.Stores = append(simCase.Stores, &Store{ ID: uint64(i), Status: metapb.StoreState_Up, @@ -57,15 +56,13 @@ func newRegionSplit() *Case { // Checker 
description simCase.Checker = func(regions *core.RegionsInfo, _ []info.StoreStats) bool { - res := true - regionCounts := make([]int, 0, storeNum) - for i := 1; i <= storeNum; i++ { - regionCount := regions.GetStoreRegionCount(uint64(i)) - regionCounts = append(regionCounts, regionCount) - res = res && regionCount > 5 + for i := 1; i <= totalStore; i++ { + peerCount := regions.GetStoreRegionCount(uint64(i)) + if peerCount < 5 { + return false + } } - simutil.Logger.Info("current counts", zap.Ints("region", regionCounts)) - return res + return true } return &simCase } diff --git a/tools/pd-simulator/simulator/client.go b/tools/pd-simulator/simulator/client.go index 808c991e97f..0bbbebe4602 100644 --- a/tools/pd-simulator/simulator/client.go +++ b/tools/pd-simulator/simulator/client.go @@ -15,11 +15,8 @@ package simulator import ( - "bytes" "context" - "encoding/json" - "fmt" - "net/http" + "strconv" "strings" "sync" "time" @@ -27,9 +24,10 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" + pdHttp "github.com/tikv/pd/client/http" "github.com/tikv/pd/pkg/core" - "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/typeutil" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" "google.golang.org/grpc" @@ -45,7 +43,7 @@ type Client interface { PutStore(ctx context.Context, store *metapb.Store) error StoreHeartbeat(ctx context.Context, stats *pdpb.StoreStats) error RegionHeartbeat(ctx context.Context, region *core.RegionInfo) error - PutPDConfig(*PDConfig) error + PutPDConfig(*sc.PDConfig) error Close() } @@ -53,12 +51,12 @@ type Client interface { const ( pdTimeout = time.Second maxInitClusterRetries = 100 - httpPrefix = "pd/api/v1" ) var ( // errFailInitClusterID is returned when failed to load clusterID from all supplied PD addresses. 
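The simplified checkers in the cases above now call a two-argument isUniform; the helper's new body is outside this diff, but folding the old 0.05 tolerance into it would look roughly like the sketch below (an assumption, not the patch's actual code):

    // isUniform reports whether count is within about ±5% of meanCount.
    func isUniform(count, meanCount int) bool {
        threshold := 0.05
        maxCount := int((1.0 + threshold) * float64(meanCount))
        minCount := int((1.0 - threshold) * float64(meanCount))
        return minCount <= count && count <= maxCount
    }
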
errFailInitClusterID = errors.New("[pd] failed to get cluster id") + PDHTTPClient pdHttp.Client ) type client struct { @@ -66,7 +64,6 @@ type client struct { tag string clusterID uint64 clientConn *grpc.ClientConn - httpClient *http.Client reportRegionHeartbeatCh chan *core.RegionInfo receiveRegionHeartbeatCh chan *pdpb.RegionHeartbeatResponse @@ -87,7 +84,6 @@ func NewClient(pdAddr string, tag string) (Client, <-chan *pdpb.RegionHeartbeatR ctx: ctx, cancel: cancel, tag: tag, - httpClient: &http.Client{}, } cc, err := c.createConn() if err != nil { @@ -316,48 +312,29 @@ func (c *client) PutStore(ctx context.Context, store *metapb.Store) error { return nil } -func (c *client) PutPDConfig(config *PDConfig) error { +func (c *client) PutPDConfig(config *sc.PDConfig) error { if len(config.PlacementRules) > 0 { - path := fmt.Sprintf("%s/%s/config/rules/batch", c.url, httpPrefix) - ruleOps := make([]*placement.RuleOp, 0) + ruleOps := make([]*pdHttp.RuleOp, 0) for _, rule := range config.PlacementRules { - ruleOps = append(ruleOps, &placement.RuleOp{ + ruleOps = append(ruleOps, &pdHttp.RuleOp{ Rule: rule, - Action: placement.RuleOpAdd, + Action: pdHttp.RuleOpAdd, }) } - content, _ := json.Marshal(ruleOps) - req, err := http.NewRequest(http.MethodPost, path, bytes.NewBuffer(content)) - req.Header.Add("Content-Type", "application/json") + err := PDHTTPClient.SetPlacementRuleInBatch(c.ctx, ruleOps) if err != nil { return err } - res, err := c.httpClient.Do(req) - if err != nil { - return err - } - defer res.Body.Close() - simutil.Logger.Info("add placement rule success", zap.String("rules", string(content))) + simutil.Logger.Info("add placement rule success", zap.Any("rules", config.PlacementRules)) } if len(config.LocationLabels) > 0 { - path := fmt.Sprintf("%s/%s/config", c.url, httpPrefix) data := make(map[string]any) data["location-labels"] = config.LocationLabels - content, err := json.Marshal(data) - if err != nil { - return err - } - req, err := http.NewRequest(http.MethodPost, path, bytes.NewBuffer(content)) - req.Header.Add("Content-Type", "application/json") + err := PDHTTPClient.SetConfig(c.ctx, data) if err != nil { return err } - res, err := c.httpClient.Do(req) - if err != nil { - return err - } - defer res.Body.Close() - simutil.Logger.Info("add location labels success", zap.String("labels", string(content))) + simutil.Logger.Info("add location labels success", zap.Any("labels", config.LocationLabels)) } return nil } @@ -390,3 +367,9 @@ func (c *client) requestHeader() *pdpb.RequestHeader { ClusterId: c.clusterID, } } + +func ChooseToHaltPDSchedule(halt bool) { + PDHTTPClient.SetConfig(context.Background(), map[string]any{ + "schedule.halt-scheduling": strconv.FormatBool(halt), + }) +} diff --git a/tools/pd-simulator/simulator/config.go b/tools/pd-simulator/simulator/config/config.go similarity index 84% rename from tools/pd-simulator/simulator/config.go rename to tools/pd-simulator/simulator/config/config.go index 4f197fb83c2..6598cf35c0f 100644 --- a/tools/pd-simulator/simulator/config.go +++ b/tools/pd-simulator/simulator/config/config.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
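PutPDConfig and ChooseToHaltPDSchedule above both go through the package-level PDHTTPClient, which this hunk declares but never initializes. A hedged sketch of how a caller (presumably the simulator entry point, outside this diff) might wire it up; the NewClient signature is assumed from the pd client/http package and the address is a placeholder:

    package main

    import (
        pdHttp "github.com/tikv/pd/client/http"
        "github.com/tikv/pd/tools/pd-simulator/simulator"
    )

    func main() {
        // Hypothetical wiring, not shown in this patch: build the shared HTTP client once
        // and hand it to the simulator package before any case starts to run.
        simulator.PDHTTPClient = pdHttp.NewClient("pd-simulator", []string{"http://127.0.0.1:2379"})
        defer simulator.PDHTTPClient.Close()

        // The helpers above can then be called directly, e.g. to pause scheduling up front.
        simulator.ChooseToHaltPDSchedule(true)
    }
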
-package simulator +package config import ( "fmt" @@ -21,8 +21,8 @@ import ( "github.com/BurntSushi/toml" "github.com/docker/go-units" + pdHttp "github.com/tikv/pd/client/http" sc "github.com/tikv/pd/pkg/schedule/config" - "github.com/tikv/pd/pkg/schedule/placement" "github.com/tikv/pd/pkg/utils/configutil" "github.com/tikv/pd/pkg/utils/tempurl" "github.com/tikv/pd/pkg/utils/typeutil" @@ -31,8 +31,11 @@ import ( ) const ( - // tick - defaultSimTickInterval = 100 * time.Millisecond + // simulator + defaultSimTickInterval = 100 * time.Millisecond + defaultTotalStore = 3 + defaultTotalRegion = 1000 + defaultEnableTransferRegionCounter = false // store defaultStoreIOMBPerSecond = 40 defaultStoreHeartbeat = 10 * time.Second @@ -53,9 +56,12 @@ const ( // SimConfig is the simulator configuration. type SimConfig struct { - // tick - CaseName string `toml:"case-name"` - SimTickInterval typeutil.Duration `toml:"sim-tick-interval"` + // Simulator + CaseName string `toml:"case-name"` + TotalStore int `toml:"total-store"` + TotalRegion int `toml:"total-region"` + EnableTransferRegionCounter bool `toml:"enable-transfer-region-counter"` + SimTickInterval typeutil.Duration `toml:"sim-tick-interval"` // store StoreIOMBPerSecond int64 `toml:"store-io-per-second"` StoreVersion string `toml:"store-version"` @@ -99,6 +105,9 @@ func NewSimConfig(serverLogLevel string) *SimConfig { // Adjust is used to adjust configurations func (sc *SimConfig) Adjust(meta *toml.MetaData) error { configutil.AdjustDuration(&sc.SimTickInterval, defaultSimTickInterval) + configutil.AdjustInt(&sc.TotalStore, defaultTotalStore) + configutil.AdjustInt(&sc.TotalRegion, defaultTotalRegion) + configutil.AdjustBool(&sc.EnableTransferRegionCounter, defaultEnableTransferRegionCounter) configutil.AdjustInt64(&sc.StoreIOMBPerSecond, defaultStoreIOMBPerSecond) configutil.AdjustString(&sc.StoreVersion, versioninfo.PDReleaseVersion) configutil.AdjustDuration(&sc.RaftStore.RegionHeartBeatInterval, defaultRegionHeartbeat) @@ -118,12 +127,12 @@ func (sc *SimConfig) Adjust(meta *toml.MetaData) error { return sc.ServerConfig.Adjust(meta, false) } -func (sc *SimConfig) speed() uint64 { +func (sc *SimConfig) Speed() uint64 { return uint64(time.Second / sc.SimTickInterval.Duration) } // PDConfig saves some config which may be changed in PD. type PDConfig struct { - PlacementRules []*placement.Rule + PlacementRules []*pdHttp.Rule LocationLabels typeutil.StringSlice } diff --git a/tools/pd-simulator/simulator/conn.go b/tools/pd-simulator/simulator/conn.go index 588fec246d4..4be8a2b76dc 100644 --- a/tools/pd-simulator/simulator/conn.go +++ b/tools/pd-simulator/simulator/conn.go @@ -17,6 +17,7 @@ package simulator import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" ) // Connection records the information of connection among nodes. @@ -26,7 +27,7 @@ type Connection struct { } // NewConnection creates nodes according to the configuration and returns the connection among nodes. 
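Speed() above converts the simulation tick interval into ticks per second, and node.go below multiplies StoreIOMBPerSecond by that ratio to size the per-node I/O budget. A small worked example with the defaults defined above (100ms tick, 40 MB/s):

    package main

    import (
        "fmt"
        "time"

        "github.com/docker/go-units"
    )

    func main() {
        simTickInterval := 100 * time.Millisecond // defaultSimTickInterval
        storeIOMBPerSecond := int64(40)           // defaultStoreIOMBPerSecond

        ratio := uint64(time.Second / simTickInterval)          // Speed() == 10 ticks per second
        speed := storeIOMBPerSecond * units.MiB * int64(ratio)  // bytes scaled by the tick ratio

        fmt.Println(ratio, speed) // 10 419430400
    }
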
-func NewConnection(simCase *cases.Case, pdAddr string, storeConfig *SimConfig) (*Connection, error) { +func NewConnection(simCase *cases.Case, pdAddr string, storeConfig *config.SimConfig) (*Connection, error) { conn := &Connection{ pdAddr: pdAddr, Nodes: make(map[uint64]*Node), @@ -51,3 +52,13 @@ func (c *Connection) nodeHealth(storeID uint64) bool { return n.GetNodeState() == metapb.NodeState_Preparing || n.GetNodeState() == metapb.NodeState_Serving } + +func (c *Connection) getNodes() []*Node { + var nodes []*Node + for _, n := range c.Nodes { + if n.GetNodeState() != metapb.NodeState_Removed { + nodes = append(nodes, n) + } + } + return nodes +} diff --git a/tools/pd-simulator/simulator/drive.go b/tools/pd-simulator/simulator/drive.go index c7f64324c19..700dd58f87a 100644 --- a/tools/pd-simulator/simulator/drive.go +++ b/tools/pd-simulator/simulator/drive.go @@ -16,6 +16,8 @@ package simulator import ( "context" + "net/http" + "net/http/pprof" "path" "strconv" "sync" @@ -23,9 +25,11 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/kvproto/pkg/metapb" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/typeutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.etcd.io/etcd/clientv3" @@ -34,32 +38,34 @@ import ( // Driver promotes the cluster status change. type Driver struct { - wg sync.WaitGroup - pdAddr string - simCase *cases.Case - client Client - tickCount int64 - eventRunner *EventRunner - raftEngine *RaftEngine - conn *Connection - simConfig *SimConfig - pdConfig *PDConfig + wg sync.WaitGroup + pdAddr string + statusAddress string + simCase *cases.Case + client Client + tickCount int64 + eventRunner *EventRunner + raftEngine *RaftEngine + conn *Connection + simConfig *config.SimConfig + pdConfig *config.PDConfig } // NewDriver returns a driver. -func NewDriver(pdAddr string, caseName string, simConfig *SimConfig) (*Driver, error) { - simCase := cases.NewCase(caseName) +func NewDriver(pdAddr, statusAddress, caseName string, simConfig *config.SimConfig) (*Driver, error) { + simCase := cases.NewCase(caseName, simConfig) if simCase == nil { return nil, errors.Errorf("failed to create case %s", caseName) } - pdConfig := &PDConfig{} + pdConfig := &config.PDConfig{} pdConfig.PlacementRules = simCase.Rules pdConfig.LocationLabels = simCase.Labels return &Driver{ - pdAddr: pdAddr, - simCase: simCase, - simConfig: simConfig, - pdConfig: pdConfig, + pdAddr: pdAddr, + statusAddress: statusAddress, + simCase: simCase, + simConfig: simConfig, + pdConfig: pdConfig, }, nil } @@ -76,6 +82,9 @@ func (d *Driver) Prepare() error { d.updateNodeAvailable() + if d.statusAddress != "" { + go d.runHTTPServer() + } // Bootstrap. store, region, err := d.GetBootstrapInfo(d.raftEngine) if err != nil { @@ -94,7 +103,7 @@ func (d *Driver) Prepare() error { // Setup alloc id. // TODO: This is a hack way. Once we have reset alloc ID API, we need to replace it. 
- maxID := cases.IDAllocator.GetID() + maxID := simutil.IDAllocator.GetID() requestTimeout := 10 * time.Second etcdTimeout := 3 * time.Second etcdClient, err := clientv3.New(clientv3.Config{ @@ -122,7 +131,7 @@ func (d *Driver) Prepare() error { return errors.WithStack(err) } if id > maxID { - cases.IDAllocator.ResetID() + simutil.IDAllocator.ResetID() break } } @@ -225,3 +234,20 @@ func (d *Driver) updateNodeAvailable() { } } } + +func (d *Driver) runHTTPServer() { + http.Handle("/metrics", promhttp.Handler()) + // profile API + http.HandleFunc("/pprof/profile", pprof.Profile) + http.HandleFunc("/pprof/trace", pprof.Trace) + http.HandleFunc("/pprof/symbol", pprof.Symbol) + http.Handle("/pprof/heap", pprof.Handler("heap")) + http.Handle("/pprof/mutex", pprof.Handler("mutex")) + http.Handle("/pprof/allocs", pprof.Handler("allocs")) + http.Handle("/pprof/block", pprof.Handler("block")) + http.Handle("/pprof/goroutine", pprof.Handler("goroutine")) + eventHandler := newEventHandler(d.eventRunner) + http.HandleFunc("/event", eventHandler.createEvent) + // nolint + http.ListenAndServe(d.statusAddress, nil) +} diff --git a/tools/pd-simulator/simulator/event.go b/tools/pd-simulator/simulator/event.go index 04ad10a0db8..20c75b58384 100644 --- a/tools/pd-simulator/simulator/event.go +++ b/tools/pd-simulator/simulator/event.go @@ -15,6 +15,12 @@ package simulator import ( + "context" + "fmt" + "math/rand" + "net/http" + "sync" + "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" "github.com/tikv/pd/pkg/core" @@ -30,6 +36,7 @@ type Event interface { // EventRunner includes all events. type EventRunner struct { + sync.RWMutex events []Event raftEngine *RaftEngine } @@ -46,6 +53,33 @@ func NewEventRunner(events []cases.EventDescriptor, raftEngine *RaftEngine) *Eve return er } +type eventHandler struct { + er *EventRunner +} + +func newEventHandler(er *EventRunner) *eventHandler { + return &eventHandler{ + er: er, + } +} + +func (e *eventHandler) createEvent(w http.ResponseWriter, r *http.Request) { + event := r.URL.Query().Get("event") + if len(event) < 1 { + fmt.Fprintf(w, "no given event") + return + } + switch event { + case "add-node": + e.er.addEvent(&AddNode{}) + return + case "down-node": + e.er.addEvent(&DownNode{}) + return + default: + } +} + func parserEvent(e cases.EventDescriptor) Event { switch t := e.(type) { case *cases.WriteFlowOnSpotDescriptor: @@ -54,16 +88,20 @@ func parserEvent(e cases.EventDescriptor) Event { return &WriteFlowOnRegion{descriptor: t} case *cases.ReadFlowOnRegionDescriptor: return &ReadFlowOnRegion{descriptor: t} - case *cases.AddNodesDescriptor: - return &AddNodes{descriptor: t} - case *cases.DeleteNodesDescriptor: - return &DeleteNodes{descriptor: t} } return nil } +func (er *EventRunner) addEvent(e Event) { + er.Lock() + defer er.Unlock() + er.events = append(er.events, e) +} + // Tick ticks the event run func (er *EventRunner) Tick(tickCount int64) { + er.Lock() + defer er.Unlock() var finishedIndex int for i, e := range er.events { isFinished := e.Run(er.raftEngine, tickCount) @@ -126,24 +164,18 @@ func (e *ReadFlowOnRegion) Run(raft *RaftEngine, tickCount int64) bool { return false } -// AddNodes adds nodes. -type AddNodes struct { - descriptor *cases.AddNodesDescriptor -} +// AddNode adds nodes. +type AddNode struct{} // Run implements the event interface. 
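runHTTPServer above exposes Prometheus metrics, pprof handlers, and an /event endpoint on the driver's status address, and createEvent accepts add-node and down-node. A hedged usage sketch; the listen address is an assumption, since the flag wiring is outside this hunk:

    package main

    import (
        "fmt"
        "net/http"
    )

    func main() {
        base := "http://127.0.0.1:20180" // assumed status address of a running simulator

        // Inject a down-node event; createEvent only inspects the "event" query parameter.
        if resp, err := http.Get(base + "/event?event=down-node"); err == nil {
            resp.Body.Close()
        }

        // The same server also serves the Prometheus metrics registered above.
        resp, err := http.Get(base + "/metrics")
        if err != nil {
            fmt.Println("metrics not reachable:", err)
            return
        }
        defer resp.Body.Close()
        fmt.Println("metrics status:", resp.Status)
    }
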
-func (e *AddNodes) Run(raft *RaftEngine, tickCount int64) bool { - id := e.descriptor.Step(tickCount) - if id == 0 { - return false - } - - if _, ok := raft.conn.Nodes[id]; ok { - simutil.Logger.Info("node has already existed", zap.Uint64("node-id", id)) +func (*AddNode) Run(raft *RaftEngine, _ int64) bool { + config := raft.storeConfig + nodes := raft.conn.getNodes() + id, err := nodes[0].client.AllocID(context.TODO()) + if err != nil { + simutil.Logger.Error("alloc node id failed", zap.Error(err)) return false } - - config := raft.storeConfig s := &cases.Store{ ID: id, Status: metapb.StoreState_Up, @@ -152,49 +184,57 @@ func (e *AddNodes) Run(raft *RaftEngine, tickCount int64) bool { } n, err := NewNode(s, raft.conn.pdAddr, config) if err != nil { - simutil.Logger.Error("add node failed", zap.Uint64("node-id", id), zap.Error(err)) + simutil.Logger.Error("create node failed", zap.Error(err)) return false } - raft.conn.Nodes[id] = n + + raft.conn.Nodes[s.ID] = n n.raftEngine = raft err = n.Start() if err != nil { - simutil.Logger.Error("start node failed", zap.Uint64("node-id", id), zap.Error(err)) + delete(raft.conn.Nodes, s.ID) + simutil.Logger.Error("start node failed", zap.Uint64("node-id", s.ID), zap.Error(err)) + return false } - return false + return true } -// DeleteNodes deletes nodes. -type DeleteNodes struct { - descriptor *cases.DeleteNodesDescriptor -} +// DownNode deletes nodes. +type DownNode struct{} // Run implements the event interface. -func (e *DeleteNodes) Run(raft *RaftEngine, tickCount int64) bool { - id := e.descriptor.Step(tickCount) - if id == 0 { +func (*DownNode) Run(raft *RaftEngine, _ int64) bool { + nodes := raft.conn.getNodes() + if len(nodes) == 0 { + simutil.Logger.Error("can not find any node") return false } - - node := raft.conn.Nodes[id] + i := rand.Intn(len(nodes)) + node := nodes[i] if node == nil { - simutil.Logger.Error("node is not existed", zap.Uint64("node-id", id)) + simutil.Logger.Error("node is not existed", zap.Uint64("node-id", node.Id)) + return false + } + delete(raft.conn.Nodes, node.Id) + // delete store + err := PDHTTPClient.DeleteStore(context.Background(), node.Id) + if err != nil { + simutil.Logger.Error("put store failed", zap.Uint64("node-id", node.Id), zap.Error(err)) return false } - delete(raft.conn.Nodes, id) node.Stop() regions := raft.GetRegions() for _, region := range regions { storeIDs := region.GetStoreIDs() - if _, ok := storeIDs[id]; ok { + if _, ok := storeIDs[node.Id]; ok { downPeer := &pdpb.PeerStats{ - Peer: region.GetStorePeer(id), + Peer: region.GetStorePeer(node.Id), DownSeconds: 24 * 60 * 60, } region = region.Clone(core.WithDownPeers(append(region.GetDownPeers(), downPeer))) raft.SetRegion(region) } } - return false + return true } diff --git a/tools/pd-simulator/simulator/node.go b/tools/pd-simulator/simulator/node.go index 68a10a8638e..fe8dc74a944 100644 --- a/tools/pd-simulator/simulator/node.go +++ b/tools/pd-simulator/simulator/node.go @@ -27,6 +27,7 @@ import ( "github.com/tikv/pd/pkg/ratelimit" "github.com/tikv/pd/pkg/utils/syncutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + sc "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/info" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" @@ -57,7 +58,7 @@ type Node struct { } // NewNode returns a Node. 
-func NewNode(s *cases.Store, pdAddr string, config *SimConfig) (*Node, error) { +func NewNode(s *cases.Store, pdAddr string, config *sc.SimConfig) (*Node, error) { ctx, cancel := context.WithCancel(context.Background()) store := &metapb.Store{ Id: s.ID, @@ -71,6 +72,7 @@ func NewNode(s *cases.Store, pdAddr string, config *SimConfig) (*Node, error) { StoreId: s.ID, Capacity: uint64(config.RaftStore.Capacity), StartTime: uint32(time.Now().Unix()), + Available: uint64(config.RaftStore.Capacity), }, } tag := fmt.Sprintf("store %d", s.ID) @@ -93,7 +95,7 @@ func NewNode(s *cases.Store, pdAddr string, config *SimConfig) (*Node, error) { cancel() return nil, err } - ratio := config.speed() + ratio := config.Speed() speed := config.StoreIOMBPerSecond * units.MiB * int64(ratio) return &Node{ Store: store, @@ -170,6 +172,8 @@ func (n *Node) stepTask() { } } +var schedulerCheck sync.Once + func (n *Node) stepHeartBeat() { config := n.raftEngine.storeConfig @@ -180,6 +184,7 @@ func (n *Node) stepHeartBeat() { period = uint64(config.RaftStore.RegionHeartBeatInterval.Duration / config.SimTickInterval.Duration) if n.tick%period == 0 { n.regionHeartBeat() + schedulerCheck.Do(func() { ChooseToHaltPDSchedule(false) }) } } diff --git a/tools/pd-simulator/simulator/raft.go b/tools/pd-simulator/simulator/raft.go index fccf75781d3..d416f69ff80 100644 --- a/tools/pd-simulator/simulator/raft.go +++ b/tools/pd-simulator/simulator/raft.go @@ -22,6 +22,7 @@ import ( "github.com/tikv/pd/pkg/core" "github.com/tikv/pd/pkg/utils/syncutil" "github.com/tikv/pd/tools/pd-simulator/simulator/cases" + "github.com/tikv/pd/tools/pd-simulator/simulator/config" "github.com/tikv/pd/tools/pd-simulator/simulator/simutil" "go.uber.org/zap" ) @@ -34,12 +35,12 @@ type RaftEngine struct { regionChange map[uint64][]uint64 regionSplitSize int64 regionSplitKeys int64 - storeConfig *SimConfig + storeConfig *config.SimConfig useTiDBEncodedKey bool } // NewRaftEngine creates the initialized raft with the configuration. -func NewRaftEngine(conf *cases.Case, conn *Connection, storeConfig *SimConfig) *RaftEngine { +func NewRaftEngine(conf *cases.Case, conn *Connection, storeConfig *config.SimConfig) *RaftEngine { r := &RaftEngine{ regionsInfo: core.NewRegionsInfo(), conn: conn, diff --git a/tools/pd-simulator/simulator/simutil/case_config.go b/tools/pd-simulator/simulator/simutil/id.go similarity index 50% rename from tools/pd-simulator/simulator/simutil/case_config.go rename to tools/pd-simulator/simulator/simutil/id.go index a34035c15aa..8badddff3f1 100644 --- a/tools/pd-simulator/simulator/simutil/case_config.go +++ b/tools/pd-simulator/simulator/simutil/id.go @@ -1,4 +1,4 @@ -// Copyright 2019 TiKV Project Authors. +// Copyright 2024 TiKV Project Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,21 +14,26 @@ package simutil -// CaseConfig is to save flags -type CaseConfig struct { - StoreNum int - RegionNum int - EnableTransferRegionCounter bool +// IDAllocator is used to alloc unique ID. +type idAllocator struct { + id uint64 } -// CaseConfigure is an global instance for CaseConfig -var CaseConfigure *CaseConfig +// NextID gets the next unique ID. +func (a *idAllocator) NextID() uint64 { + a.id++ + return a.id +} + +// ResetID resets the IDAllocator. 
+func (a *idAllocator) ResetID() { + a.id = 0 +} -// InitCaseConfig is to init caseConfigure -func InitCaseConfig(storeNum, regionNum int, enableTransferRegionCounter bool) { - CaseConfigure = &CaseConfig{ - StoreNum: storeNum, - RegionNum: regionNum, - EnableTransferRegionCounter: enableTransferRegionCounter, - } +// GetID gets the current ID. +func (a *idAllocator) GetID() uint64 { + return a.id } + +// IDAllocator is used to alloc unique ID. +var IDAllocator idAllocator diff --git a/tools/pd-simulator/simulator/task.go b/tools/pd-simulator/simulator/task.go index a19854b53ba..c0bfa1e691b 100644 --- a/tools/pd-simulator/simulator/task.go +++ b/tools/pd-simulator/simulator/task.go @@ -415,7 +415,7 @@ func (a *addPeer) tick(engine *RaftEngine, region *core.RegionInfo) (newRegion * pendingPeers := append(region.GetPendingPeers(), a.peer) return region.Clone(core.WithAddPeer(a.peer), core.WithIncConfVer(), core.WithPendingPeers(pendingPeers)), false } - speed := engine.storeConfig.speed() + speed := engine.storeConfig.Speed() // Step 2: Process Snapshot if !processSnapshot(sendNode, a.sendingStat, speed) { return nil, false
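The schedulerCheck sync.Once added to stepHeartBeat in node.go above re-enables PD scheduling exactly once, on the first region heartbeat after the cluster is prepared. A minimal, self-contained sketch of that resume-once pattern, with chooseToHalt standing in for the real ChooseToHaltPDSchedule:

    package main

    import (
        "fmt"
        "sync"
    )

    var schedulerCheck sync.Once

    // chooseToHalt is a stand-in for ChooseToHaltPDSchedule, which posts
    // schedule.halt-scheduling to PD via the shared HTTP client.
    func chooseToHalt(halt bool) { fmt.Println("schedule.halt-scheduling =", halt) }

    func regionHeartbeat() {
        // Only the first heartbeat flips scheduling back on; later ticks are no-ops here.
        schedulerCheck.Do(func() { chooseToHalt(false) })
    }

    func main() {
        for i := 0; i < 3; i++ {
            regionHeartbeat() // prints only once
        }
    }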