From 5928a0d6b0263e1219c1f82407476bc2a18ccfaa Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Mon, 9 Oct 2023 17:40:54 +0800 Subject: [PATCH] mcs: add service role (#7175) ref tikv/pd#5839 Signed-off-by: Ryan Leung --- metrics/grafana/pd.json | 314 ++++++++---------- pkg/mcs/resourcemanager/server/server.go | 8 +- pkg/mcs/scheduling/server/server.go | 12 +- pkg/mcs/tso/server/server.go | 6 +- pkg/member/metrics.go | 32 ++ pkg/schedule/coordinator.go | 18 +- pkg/tso/global_allocator.go | 4 + .../unsafe_recovery_controller.go | 9 +- server/server.go | 4 +- 9 files changed, 217 insertions(+), 190 deletions(-) create mode 100644 pkg/member/metrics.go diff --git a/metrics/grafana/pd.json b/metrics/grafana/pd.json index 16131b4b8d2..5568dbc13ae 100644 --- a/metrics/grafana/pd.json +++ b/metrics/grafana/pd.json @@ -77,14 +77,17 @@ "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", - "description": "It indicates whether the current PD is the leader or a follower.", - "format": "none", + "decimals": null, + "description": "The total capacity size of the cluster", + "editable": true, + "error": false, + "format": "decbytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, - "thresholdMarkers": true + "thresholdMarkers": false }, "gridPos": { "h": 6, @@ -92,7 +95,7 @@ "x": 0, "y": 0 }, - "id": 55, + "id": 10, "interval": null, "links": [], "mappingType": 1, @@ -107,7 +110,7 @@ } ], "maxDataPoints": 100, - "nullPointMode": "connected", + "nullPointMode": "null", "nullText": null, "postfix": "", "postfixFontSize": "50%", @@ -115,54 +118,36 @@ "prefixFontSize": "50%", "rangeMaps": [ { - "from": "1", - "text": "Leader", - "to": "100000" - }, - { - "from": "0", - "text": "Follower", - "to": "1" + "from": "null", + "text": "N/A", + "to": "null" } ], "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", + "fillColor": "rgba(77, 135, 25, 0.18)", + "full": true, + "lineColor": "rgb(21, 179, 65)", "show": false }, "tableColumn": "", "targets": [ { - "expr": "pd_tso_role{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", dc=\"global\"}", + "expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",type=\"storage_capacity\"})", "format": "time_series", - "instant": true, "intervalFactor": 2, - "legendFormat": "", - "metric": "pd_server_tso", "refId": "A", "step": 40 } ], "thresholds": "", - "title": "PD role", + "title": "Storage capacity", "type": "singlestat", - "valueFontSize": "50%", + "valueFontSize": "80%", "valueMaps": [ { "op": "=", - "text": "Unknown", + "text": "N/A", "value": "null" - }, - { - "op": "=", - "text": "Follower", - "value": "0" - }, - { - "op": "=", - "text": "Leader", - "value": "1" } ], "valueName": "current" @@ -177,8 +162,8 @@ "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", - "decimals": null, - "description": "The total capacity size of the cluster", + "decimals": 1, + "description": "The current storage size of the cluster", "editable": true, "error": false, "format": "decbytes", @@ -187,7 +172,7 @@ "minValue": 0, "show": false, "thresholdLabels": false, - "thresholdMarkers": false + "thresholdMarkers": true }, "gridPos": { "h": 6, @@ -195,7 +180,8 @@ "x": 4, "y": 0 }, - "id": 10, + "hideTimeOverride": false, + "id": 38, "interval": null, "links": [], "mappingType": 1, @@ -224,15 +210,15 @@ } ], "sparkline": { - "fillColor": "rgba(77, 135, 25, 0.18)", + "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, - "lineColor": "rgb(21, 179, 65)", + "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { - "expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",type=\"storage_capacity\"})", + "expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",type=\"storage_size\"})", "format": "time_series", "intervalFactor": 2, "refId": "A", @@ -240,7 +226,7 @@ } ], "thresholds": "", - "title": "Storage capacity", + "title": "Current storage size", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -255,22 +241,21 @@ { "cacheTimeout": null, "colorBackground": false, - "colorValue": false, + "colorValue": true, "colors": [ - "rgba(245, 54, 54, 0.9)", + "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" + "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_TEST-CLUSTER}", - "decimals": 1, - "description": "The current storage size of the cluster", + "description": "The current storage size and used ratio of the cluster", "editable": true, "error": false, - "format": "decbytes", + "format": "percentunit", "gauge": { - "maxValue": 100, + "maxValue": 1, "minValue": 0, - "show": false, + "show": true, "thresholdLabels": false, "thresholdMarkers": true }, @@ -281,7 +266,7 @@ "y": 0 }, "hideTimeOverride": false, - "id": 38, + "id": 37, "interval": null, "links": [], "mappingType": 1, @@ -318,15 +303,15 @@ "tableColumn": "", "targets": [ { - "expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",type=\"storage_size\"})", + "expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",type=\"storage_size\"}) / sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",type=\"storage_capacity\"})", "format": "time_series", "intervalFactor": 2, "refId": "A", "step": 40 } ], - "thresholds": "", - "title": "Current storage size", + "thresholds": "0.01,0.5", + "title": "Current storage used", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -341,21 +326,19 @@ { "cacheTimeout": null, "colorBackground": false, - "colorValue": true, + "colorValue": false, "colors": [ - "rgba(50, 172, 45, 0.97)", + "#d44a3a", "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" + "#299c46" ], "datasource": "${DS_TEST-CLUSTER}", - "description": "The current storage size and used ratio of the cluster", - "editable": true, - "error": false, - "format": "percentunit", + "description": "The count of healthy stores", + "format": "none", "gauge": { - "maxValue": 1, + "maxValue": 100, "minValue": 0, - "show": true, + "show": false, "thresholdLabels": false, "thresholdMarkers": true }, @@ -365,8 +348,7 @@ "x": 12, "y": 0 }, - "hideTimeOverride": false, - "id": 37, + "id": 97, "interval": null, "links": [], "mappingType": 1, @@ -381,7 +363,7 @@ } ], "maxDataPoints": 100, - "nullPointMode": "null", + "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", @@ -396,22 +378,24 @@ ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", - "full": true, + "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { - "expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",type=\"storage_size\"}) / sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",type=\"storage_capacity\"})", + "exemplar": true, + "expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"store_up_count\"})", "format": "time_series", + "interval": "", "intervalFactor": 2, - "refId": "A", - "step": 40 + "legendFormat": "", + "refId": "A" } ], - "thresholds": "0.01,0.5", - "title": "Current storage used", + "thresholds": "0,1", + "title": "Normal stores", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -428,19 +412,21 @@ "colorBackground": false, "colorValue": false, "colors": [ - "#d44a3a", + "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", - "#299c46" + "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", - "description": "The count of healthy stores", + "description": "The total number of Regions without replicas", + "editable": true, + "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, - "thresholdMarkers": true + "thresholdMarkers": false }, "gridPos": { "h": 6, @@ -448,7 +434,7 @@ "x": 16, "y": 0 }, - "id": 97, + "id": 21, "interval": null, "links": [], "mappingType": 1, @@ -463,7 +449,7 @@ } ], "maxDataPoints": 100, - "nullPointMode": "connected", + "nullPointMode": "null", "nullText": null, "postfix": "", "postfixFontSize": "50%", @@ -478,23 +464,24 @@ ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, + "full": true, "lineColor": "rgb(31, 120, 193)", - "show": false + "show": true }, "tableColumn": "", "targets": [ { - "expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"store_up_count\"})", + "expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",type=\"leader_count\"})", "format": "time_series", "intervalFactor": 2, - "refId": "A" + "refId": "A", + "step": 40 } ], - "thresholds": "0,1", - "title": "Normal stores", + "thresholds": "", + "title": "Number of Regions", "type": "singlestat", - "valueFontSize": "100%", + "valueFontSize": "80%", "valueMaps": [ { "op": "=", @@ -514,7 +501,7 @@ "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", - "description": "The total number of Regions without replicas", + "description": "The current peer count of the cluster", "editable": true, "error": false, "format": "none", @@ -568,15 +555,16 @@ "tableColumn": "", "targets": [ { - "expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\",type=\"leader_count\"})", + "expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"region_count\"})", "format": "time_series", "intervalFactor": 2, + "legendFormat": "count", "refId": "A", - "step": 40 + "step": 4 } ], "thresholds": "", - "title": "Number of Regions", + "title": "Current peer count", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -804,8 +792,7 @@ "fill": true, "line": true, "op": "gt", - "value": 100, - "yaxis": "left" + "value": 100 } ], "timeFrom": null, @@ -849,99 +836,85 @@ } }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, + "cacheTimeout": null, "datasource": "${DS_TEST-CLUSTER}", - "decimals": null, - "description": "The current peer count of the cluster", - "editable": true, - "error": false, - "fill": 0, - "fillGradient": 0, - "grid": {}, + "description": "It indicates the current leader/primary of services", "gridPos": { "h": 7, "w": 8, "x": 16, "y": 6 }, - "hiddenSeries": false, - "id": 18, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 3, + "id": 55, + "interval": null, "links": [], - "nullPointMode": "null", - "paceLength": 10, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": true, + "maxDataPoints": 100, + "options": { + "showHeader": true + }, + "tableColumn": "instance", "targets": [ { - "expr": "sum(pd_cluster_status{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=\"$instance\", type=\"region_count\"})", - "format": "time_series", + "exemplar": true, + "expr": "service_member_role{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}", + "format": "table", + "instant": true, + "interval": "", "intervalFactor": 2, - "legendFormat": "count", + "legendFormat": "", + "metric": "pd_server_tso", "refId": "A", - "step": 4 + "step": 40 } ], - "thresholds": [], "timeFrom": null, - "timeRegions": [], "timeShift": null, - "title": "Current peer count", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ + "title": "Leader/Primary", + "transformations": [ { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "id": "filterByValue", + "options": { + "filters": [ + { + "fieldName": "Value", + "config": { + "id": "notEqual", + "options": { + "value": "0" + } + } + } + ], + "type": "include", + "match": "all" + } }, { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "instance", + "service" + ] + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": {}, + "indexByName": { + "instance": 1, + "service": 0 + }, + "renameByName": { + "instance": "" + } + } } ], - "yaxis": { - "align": false, - "alignLevel": null - } + "type": "table" }, { "datasource": "${DS_TEST-CLUSTER}", @@ -2417,7 +2390,6 @@ }, "yaxes": [ { - "$$hashKey": "object:533", "format": "short", "label": null, "logBase": 1, @@ -2426,7 +2398,6 @@ "show": true }, { - "$$hashKey": "object:534", "format": "short", "label": null, "logBase": 1, @@ -8822,8 +8793,16 @@ "format": "time_series", "hide": false, "intervalFactor": 1, - "legendFormat": "{{instance}}", + "legendFormat": "{{instance}}-physically-allocated", "refId": "A" + }, + { + "expr": "etcd_mvcc_db_total_size_in_use_in_bytes{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", job=\"pd\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}-logically-in-use", + "refId": "B" } ], "thresholds": [], @@ -10569,7 +10548,6 @@ "renderer": "flot", "seriesOverrides": [ { - "$$hashKey": "object:363", "alias": "/pending.*/", "yaxis": 2 } @@ -10616,7 +10594,6 @@ }, "yaxes": [ { - "$$hashKey": "object:307", "format": "ops", "label": null, "logBase": 1, @@ -10625,7 +10602,6 @@ "show": true }, { - "$$hashKey": "object:308", "format": "short", "label": null, "logBase": 1, @@ -12294,4 +12270,4 @@ "title": "Test-Cluster-PD", "uid": "Q6RuHYIWk", "version": 1 -} +} \ No newline at end of file diff --git a/pkg/mcs/resourcemanager/server/server.go b/pkg/mcs/resourcemanager/server/server.go index 9b9bd91c6eb..47248208c8a 100644 --- a/pkg/mcs/resourcemanager/server/server.go +++ b/pkg/mcs/resourcemanager/server/server.go @@ -52,6 +52,8 @@ import ( var _ bs.Server = (*Server)(nil) +const serviceName = "Resource Manager" + // Server is the resource manager server, and it implements bs.Server. type Server struct { *server.BaseServer @@ -168,6 +170,7 @@ func (s *Server) campaignLeader() { defer resetLeaderOnce.Do(func() { cancel() s.participant.ResetLeader() + member.ServiceMemberGauge.WithLabelValues(serviceName).Set(0) }) // maintain the leadership, after this, Resource Manager could be ready to provide service. @@ -180,6 +183,7 @@ func (s *Server) campaignLeader() { } s.participant.EnableLeader() + member.ServiceMemberGauge.WithLabelValues(serviceName).Set(1) log.Info("resource manager primary is ready to serve", zap.String("resource-manager-primary-name", s.participant.Name())) leaderTicker := time.NewTicker(utils.LeaderTickInterval) @@ -382,8 +386,8 @@ func CreateServerWrapper(cmd *cobra.Command, args []string) { // Flushing any buffered log entries defer log.Sync() - versioninfo.Log("Resource Manager") - log.Info("Resource Manager config", zap.Reflect("config", cfg)) + versioninfo.Log(serviceName) + log.Info("resource manager config", zap.Reflect("config", cfg)) grpcprometheus.EnableHandlingTimeHistogram() metricutil.Push(&cfg.Metric) diff --git a/pkg/mcs/scheduling/server/server.go b/pkg/mcs/scheduling/server/server.go index 62a04599e8f..32788284b70 100644 --- a/pkg/mcs/scheduling/server/server.go +++ b/pkg/mcs/scheduling/server/server.go @@ -62,7 +62,11 @@ import ( var _ bs.Server = (*Server)(nil) -const memberUpdateInterval = time.Minute +const ( + serviceName = "Scheduling Service" + + memberUpdateInterval = time.Minute +) // Server is the scheduling server, and it implements bs.Server. type Server struct { @@ -255,6 +259,7 @@ func (s *Server) campaignLeader() { defer resetLeaderOnce.Do(func() { cancel() s.participant.ResetLeader() + member.ServiceMemberGauge.WithLabelValues(serviceName).Set(0) }) // maintain the leadership, after this, Scheduling could be ready to provide service. @@ -274,6 +279,7 @@ func (s *Server) campaignLeader() { } }() s.participant.EnableLeader() + member.ServiceMemberGauge.WithLabelValues(serviceName).Set(1) log.Info("scheduling primary is ready to serve", zap.String("scheduling-primary-name", s.participant.Name())) leaderTicker := time.NewTicker(utils.LeaderTickInterval) @@ -533,8 +539,8 @@ func CreateServerWrapper(cmd *cobra.Command, args []string) { // Flushing any buffered log entries defer log.Sync() - versioninfo.Log("Scheduling") - log.Info("Scheduling config", zap.Reflect("config", cfg)) + versioninfo.Log(serviceName) + log.Info("scheduling service config", zap.Reflect("config", cfg)) grpcprometheus.EnableHandlingTimeHistogram() metricutil.Push(&cfg.Metric) diff --git a/pkg/mcs/tso/server/server.go b/pkg/mcs/tso/server/server.go index 133f87b78f3..16ef3216c62 100644 --- a/pkg/mcs/tso/server/server.go +++ b/pkg/mcs/tso/server/server.go @@ -58,6 +58,8 @@ import ( var _ bs.Server = (*Server)(nil) var _ tso.ElectionMember = (*member.Participant)(nil) +const serviceName = "TSO Service" + // Server is the TSO server, and it implements bs.Server. type Server struct { *server.BaseServer @@ -450,8 +452,8 @@ func CreateServerWrapper(cmd *cobra.Command, args []string) { // Flushing any buffered log entries defer log.Sync() - versioninfo.Log("TSO") - log.Info("TSO config", zap.Reflect("config", cfg)) + versioninfo.Log(serviceName) + log.Info("TSO service config", zap.Reflect("config", cfg)) grpcprometheus.EnableHandlingTimeHistogram() metricutil.Push(&cfg.Metric) diff --git a/pkg/member/metrics.go b/pkg/member/metrics.go new file mode 100644 index 00000000000..d2b99f0cf93 --- /dev/null +++ b/pkg/member/metrics.go @@ -0,0 +1,32 @@ +// Copyright 2023 TiKV Project Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package member + +import "github.com/prometheus/client_golang/prometheus" + +var ( + // ServiceMemberGauge is used to record the leader/primary of services. + ServiceMemberGauge = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "service", + Subsystem: "member", + Name: "role", + Help: "The leader/primary of services", + }, []string{"service"}) +) + +func init() { + prometheus.MustRegister(ServiceMemberGauge) +} diff --git a/pkg/schedule/coordinator.go b/pkg/schedule/coordinator.go index f6b82c706c4..571a0e1d258 100644 --- a/pkg/schedule/coordinator.go +++ b/pkg/schedule/coordinator.go @@ -144,7 +144,7 @@ func (c *Coordinator) PatrolRegions() { ticker := time.NewTicker(c.cluster.GetCheckerConfig().GetPatrolRegionInterval()) defer ticker.Stop() - log.Info("Coordinator starts patrol regions") + log.Info("coordinator starts patrol regions") start := time.Now() var ( key []byte @@ -252,7 +252,7 @@ func (c *Coordinator) checkPriorityRegions() { func (c *Coordinator) checkSuspectRanges() { defer logutil.LogPanic() defer c.wg.Done() - log.Info("Coordinator begins to check suspect key ranges") + log.Info("coordinator begins to check suspect key ranges") ticker := time.NewTicker(checkSuspectRangesInterval) defer ticker.Stop() for { @@ -316,7 +316,7 @@ func (c *Coordinator) drivePushOperator() { defer logutil.LogPanic() defer c.wg.Done() - log.Info("Coordinator begins to actively drive push operator") + log.Info("coordinator begins to actively drive push operator") ticker := time.NewTicker(pushOperatorTickInterval) defer ticker.Stop() for { @@ -334,10 +334,10 @@ func (c *Coordinator) drivePushOperator() { func (c *Coordinator) RunUntilStop() { c.Run() <-c.ctx.Done() - log.Info("Coordinator is stopping") + log.Info("coordinator is stopping") c.GetSchedulersController().Wait() c.wg.Wait() - log.Info("Coordinator has been stopped") + log.Info("coordinator has been stopped") } // Run starts coordinator. @@ -347,20 +347,20 @@ func (c *Coordinator) Run() { ticker = time.NewTicker(100 * time.Millisecond) }) defer ticker.Stop() - log.Info("Coordinator starts to collect cluster information") + log.Info("coordinator starts to collect cluster information") for { if c.ShouldRun() { - log.Info("Coordinator has finished cluster information preparation") + log.Info("coordinator has finished cluster information preparation") break } select { case <-ticker.C: case <-c.ctx.Done(): - log.Info("Coordinator stops running") + log.Info("coordinator stops running") return } } - log.Info("Coordinator starts to run schedulers") + log.Info("coordinator starts to run schedulers") c.InitSchedulers(true) c.wg.Add(3) diff --git a/pkg/tso/global_allocator.go b/pkg/tso/global_allocator.go index c9bef397728..bb7c7d96965 100644 --- a/pkg/tso/global_allocator.go +++ b/pkg/tso/global_allocator.go @@ -28,6 +28,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/tikv/pd/pkg/errs" mcsutils "github.com/tikv/pd/pkg/mcs/utils" + "github.com/tikv/pd/pkg/member" "github.com/tikv/pd/pkg/slice" "github.com/tikv/pd/pkg/storage/endpoint" "github.com/tikv/pd/pkg/utils/logutil" @@ -614,10 +615,13 @@ func (gta *GlobalTSOAllocator) campaignLeader() { gta.am.ResetAllocatorGroup(GlobalDCLocation) }() + tsoLabel := fmt.Sprintf("TSO Service Group %d", gta.getGroupID()) gta.member.EnableLeader() + member.ServiceMemberGauge.WithLabelValues(tsoLabel).Set(1) defer resetLeaderOnce.Do(func() { cancel() gta.member.ResetLeader() + member.ServiceMemberGauge.WithLabelValues(tsoLabel).Set(0) }) // TODO: if enable-local-tso is true, check the cluster dc-location after the primary is elected diff --git a/pkg/unsaferecovery/unsafe_recovery_controller.go b/pkg/unsaferecovery/unsafe_recovery_controller.go index a0f6d997ac0..fab6623e204 100644 --- a/pkg/unsaferecovery/unsafe_recovery_controller.go +++ b/pkg/unsaferecovery/unsafe_recovery_controller.go @@ -463,7 +463,7 @@ func (u *Controller) CollectReport(heartbeat *pdpb.StoreHeartbeatRequest) (bool, } if heartbeat.StoreReport.GetStep() != u.step { - log.Info("Unsafe recovery receives invalid store report", + log.Info("unsafe recovery receives invalid store report", zap.Uint64("store-id", storeID), zap.Uint64("expected-step", u.step), zap.Uint64("obtained-step", heartbeat.StoreReport.GetStep())) // invalid store report, ignore return false, nil @@ -879,10 +879,11 @@ func (t *regionTree) insert(item *regionItem) (bool, error) { return false, errors.Errorf("region %v shouldn't be updated twice", item.Region().GetId()) } - for _, old := range overlaps { + for _, newer := range overlaps { + log.Info("unsafe recovery found overlap regions", logutil.ZapRedactStringer("newer-region-meta", core.RegionToHexMeta(newer.Region())), logutil.ZapRedactStringer("older-region-meta", core.RegionToHexMeta(item.Region()))) // it's ensured by the `buildUpFromReports` that peers are inserted in epoch descending order. - if old.IsEpochStale(item) { - return false, errors.Errorf("region %v's epoch shouldn't be staler than old ones %v", item, old) + if newer.IsEpochStale(item) { + return false, errors.Errorf("region %v's epoch shouldn't be staler than old ones %v", item, newer) } } if len(overlaps) != 0 { diff --git a/server/server.go b/server/server.go index 1f4aedfa31a..4b1c50bec67 100644 --- a/server/server.go +++ b/server/server.go @@ -101,7 +101,7 @@ const ( // PDMode represents that server is in PD mode. PDMode = "PD" // APIServiceMode represents that server is in API service mode. - APIServiceMode = "API service" + APIServiceMode = "API Service" // maxRetryTimesGetServicePrimary is the max retry times for getting primary addr. // Note: it need to be less than client.defaultPDTimeout @@ -1624,6 +1624,7 @@ func (s *Server) campaignLeader() { } // EnableLeader to accept the remaining service, such as GetStore, GetRegion. s.member.EnableLeader() + member.ServiceMemberGauge.WithLabelValues(s.mode).Set(1) if !s.IsAPIServiceMode() { // Check the cluster dc-location after the PD leader is elected. go s.tsoAllocatorManager.ClusterDCLocationChecker() @@ -1633,6 +1634,7 @@ func (s *Server) campaignLeader() { // to be new leader. cancel() s.member.ResetLeader() + member.ServiceMemberGauge.WithLabelValues(s.mode).Set(0) }) CheckPDVersion(s.persistOptions)