*: move tso to independent thread #8720
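Summary: this change moves start/stop of the global TSO allocator out of `campaignLeader` in `server.go` and into `RaftCluster`, whose new service-check loop re-evaluates leadership every `tsoServiceCheckInterval` (100ms) and starts or resets the TSO jobs accordingly.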
```diff
@@ -17,6 +17,7 @@
 import (
 	"context"
 	"encoding/json"
+	errorspkg "errors"
 	"fmt"
 	"io"
 	"math"
@@ -43,6 +44,7 @@
 	"github.com/tikv/pd/pkg/keyspace"
 	"github.com/tikv/pd/pkg/mcs/discovery"
 	"github.com/tikv/pd/pkg/mcs/utils/constant"
+	"github.com/tikv/pd/pkg/member"
 	"github.com/tikv/pd/pkg/memory"
 	"github.com/tikv/pd/pkg/progress"
 	"github.com/tikv/pd/pkg/ratelimit"
@@ -56,6 +58,7 @@
 	"github.com/tikv/pd/pkg/statistics/utils"
 	"github.com/tikv/pd/pkg/storage"
 	"github.com/tikv/pd/pkg/syncer"
+	"github.com/tikv/pd/pkg/tso"
 	"github.com/tikv/pd/pkg/unsaferecovery"
 	"github.com/tikv/pd/pkg/utils/etcdutil"
 	"github.com/tikv/pd/pkg/utils/keypath"
@@ -88,12 +91,13 @@
 	// nodeStateCheckJobInterval is the interval to run node state check job.
 	nodeStateCheckJobInterval = 10 * time.Second
 	// metricsCollectionJobInterval is the interval to run metrics collection job.
-	metricsCollectionJobInterval = 10 * time.Second
-	updateStoreStatsInterval     = 9 * time.Millisecond
-	clientTimeout                = 3 * time.Second
-	defaultChangedRegionsLimit   = 10000
-	gcTombstoneInterval          = 30 * 24 * time.Hour
-	serviceCheckInterval         = 10 * time.Second
+	metricsCollectionJobInterval   = 10 * time.Second
+	updateStoreStatsInterval       = 9 * time.Millisecond
+	clientTimeout                  = 3 * time.Second
+	defaultChangedRegionsLimit     = 10000
+	gcTombstoneInterval            = 30 * 24 * time.Hour
+	schedulingServiceCheckInterval = 10 * time.Second
+	tsoServiceCheckInterval        = 100 * time.Millisecond
 	// persistLimitRetryTimes is used to reduce the probability of the persistent error
 	// since the once the store is added or removed, we shouldn't return an error even if the store limit is failed to persist.
 	persistLimitRetryTimes = 5
@@ -144,6 +148,7 @@
 	cancel context.CancelFunc

 	*core.BasicCluster // cached cluster info
+	member *member.EmbeddedEtcdMember

 	etcdClient *clientv3.Client
 	httpClient *http.Client
@@ -174,6 +179,7 @@
 	keyspaceGroupManager *keyspace.GroupManager
 	independentServices  sync.Map
 	hbstreams            *hbstream.HeartbeatStreams
+	tsoAllocator         *tso.AllocatorManager

 	// heartbeatRunner is used to process the subtree update task asynchronously.
 	heartbeatRunner ratelimit.Runner
@@ -194,16 +200,18 @@
 }

 // NewRaftCluster create a new cluster.
-func NewRaftCluster(ctx context.Context, clusterID uint64, basicCluster *core.BasicCluster, storage storage.Storage, regionSyncer *syncer.RegionSyncer, etcdClient *clientv3.Client,
-	httpClient *http.Client) *RaftCluster {
+func NewRaftCluster(ctx context.Context, clusterID uint64, member *member.EmbeddedEtcdMember, basicCluster *core.BasicCluster, storage storage.Storage, regionSyncer *syncer.RegionSyncer, etcdClient *clientv3.Client,
+	httpClient *http.Client, tsoAllocator *tso.AllocatorManager) *RaftCluster {
 	return &RaftCluster{
 		serverCtx:    ctx,
 		clusterID:    clusterID,
+		member:       member,
 		regionSyncer: regionSyncer,
 		httpClient:   httpClient,
 		etcdClient:   etcdClient,
 		BasicCluster: basicCluster,
 		storage:      storage,
+		tsoAllocator: tsoAllocator,
 		heartbeatRunner: ratelimit.NewConcurrentRunner(heartbeatTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute),
 		miscRunner:      ratelimit.NewConcurrentRunner(miscTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute),
 		logRunner:       ratelimit.NewConcurrentRunner(logTaskRunner, ratelimit.NewConcurrencyLimiter(uint64(runtime.NumCPU()*2)), time.Minute),
@@ -314,11 +322,13 @@
 	if err != nil {
 		return err
 	}
+	c.checkTSOService()
 	cluster, err := c.LoadClusterInfo()
 	if err != nil {
 		return err
 	}
 	if cluster == nil {
+		log.Warn("cluster is not bootstrapped")
 		return nil
 	}

@@ -351,7 +361,7 @@
 			return err
 		}
 	}
-	c.checkServices()
+	c.checkSchedulingService()
 	c.wg.Add(9)
 	go c.runServiceCheckJob()
 	go c.runMetricsCollectionJob()
@@ -370,7 +380,7 @@
 	return nil
 }

-func (c *RaftCluster) checkServices() {
+func (c *RaftCluster) checkSchedulingService() {
 	if c.isAPIServiceMode {
 		servers, err := discovery.Discover(c.etcdClient, strconv.FormatUint(c.clusterID, 10), constant.SchedulingServiceName)
 		if c.opt.GetMicroServiceConfig().IsSchedulingFallbackEnabled() && (err != nil || len(servers) == 0) {
@@ -390,27 +400,90 @@
 	}
 }

+// checkTSOService checks the TSO service.
+func (c *RaftCluster) checkTSOService() {
```
**Review comment:** Seems we do not need this wrap function now. We can call it directly.

**Reply:** For the upcoming PR.
```diff
+	if c.isAPIServiceMode {
+		return
+	}
+	if c.member.IsLeader() {
+		if err := c.startTSOJobs(); err != nil {
+			// If there is an error, need to wait for the next check.
+			log.Error("failed to start TSO jobs", errs.ZapError(err))
+			return
+		}
+	} else {
+		// leader exits, reset the allocator group
+		if err := c.stopTSOJobs(); err != nil {
+			// If there is an error, need to wait for the next check.
+			log.Error("failed to stop TSO jobs", errs.ZapError(err))
+			return
+		}
+
+		failpoint.Inject("updateAfterResetTSO", func() {
+			allocator, _ := c.tsoAllocator.GetAllocator(tso.GlobalDCLocation)
+			if err := allocator.UpdateTSO(); !errorspkg.Is(err, errs.ErrUpdateTimestamp) {
+				log.Panic("the tso update after reset should return ErrUpdateTimestamp as expected", zap.Error(err))
+			}
+			if allocator.IsInitialize() {
+				log.Panic("the allocator should be uninitialized after reset")
+			}
+		})
+	}
+}
```
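The `failpoint.Inject("updateAfterResetTSO", ...)` hook above is inert unless a test arms it. A minimal sketch of how a test might do so with the pingcap/failpoint toolchain PD already uses; the exact failpoint path below is an assumption inferred from the package layout, not taken from this PR:

```go
package cluster_test

import (
	"testing"

	"github.com/pingcap/failpoint"
	"github.com/stretchr/testify/require"
)

func TestResetTSOAfterLeaderChange(t *testing.T) {
	re := require.New(t)
	// Assumed failpoint path: the import path of the package declaring
	// failpoint.Inject("updateAfterResetTSO", ...) plus the failpoint name.
	fpName := "github.com/tikv/pd/server/cluster/updateAfterResetTSO"
	re.NoError(failpoint.Enable(fpName, "return(true)"))
	defer func() {
		re.NoError(failpoint.Disable(fpName))
	}()
	// ... start a test cluster here, then resign the PD leadership; once
	// checkTSOService observes the change and resets the allocator, the
	// injected assertions panic if UpdateTSO does not return
	// ErrUpdateTimestamp or the allocator is still initialized.
}
```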
```diff
 func (c *RaftCluster) runServiceCheckJob() {
 	defer logutil.LogPanic()
 	defer c.wg.Done()

-	ticker := time.NewTicker(serviceCheckInterval)
+	schedulingTicker := time.NewTicker(schedulingServiceCheckInterval)
 	failpoint.Inject("highFrequencyClusterJobs", func() {
-		ticker.Reset(time.Millisecond)
+		schedulingTicker.Reset(time.Millisecond)
 	})
-	defer ticker.Stop()
+	defer schedulingTicker.Stop()
+	tsoTicker := time.NewTicker(tsoServiceCheckInterval)
+	defer tsoTicker.Stop()

 	for {
 		select {
 		case <-c.ctx.Done():
 			log.Info("service check job is stopped")
 			return
-		case <-ticker.C:
-			c.checkServices()
+		case <-schedulingTicker.C:
+			c.checkSchedulingService()
+		case <-tsoTicker.C:
+			c.checkTSOService()
 		}
 	}
 }
```
```diff
+func (c *RaftCluster) startTSOJobs() error {
+	allocator, err := c.tsoAllocator.GetAllocator(tso.GlobalDCLocation)
+	if err != nil {
+		log.Error("failed to get global TSO allocator", errs.ZapError(err))
+		return err
+	}
+	if !allocator.IsInitialize() {
+		log.Info("initializing the global TSO allocator")
+		if err := allocator.Initialize(0); err != nil {
+			log.Error("failed to initialize the global TSO allocator", errs.ZapError(err))
+			return err
+		}
+	}
+	return nil
+}
```
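Since `checkTSOService` fires every 100ms, `startTSOJobs` has to be cheap when there is nothing to do; the `IsInitialize()` guard above provides that. A hedged sketch of the same guard in isolation (generic names, not the PD API):

```go
package main

import "fmt"

// ensureStarted runs init only on the first call after a state change;
// subsequent ticks fall through without doing any work.
func ensureStarted(isInit func() bool, init func() error) error {
	if isInit() {
		return nil // already initialized: the common, cheap path
	}
	return init()
}

func main() {
	initialized := false
	for i := 0; i < 3; i++ {
		_ = ensureStarted(
			func() bool { return initialized },
			func() error {
				initialized = true
				fmt.Println("initializing the global TSO allocator")
				return nil
			},
		)
	}
	// Prints the message once despite three ticks.
}
```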
```diff
+func (c *RaftCluster) stopTSOJobs() error {
+	allocator, err := c.tsoAllocator.GetAllocator(tso.GlobalDCLocation)
+	if err != nil {
+		log.Error("failed to get global TSO allocator", errs.ZapError(err))
+		return err
+	}
+	if allocator.IsInitialize() {
+		c.tsoAllocator.ResetAllocatorGroup(tso.GlobalDCLocation, true)
+	}
+	return nil
+}
+
 // startGCTuner
 func (c *RaftCluster) startGCTuner() {
 	defer logutil.LogPanic()
@@ -757,6 +830,9 @@
 	if !c.IsServiceIndependent(constant.SchedulingServiceName) {
 		c.stopSchedulingJobs()
 	}
+	if err := c.stopTSOJobs(); err != nil {
+		log.Error("failed to stop tso jobs", errs.ZapError(err))
+	}
 	c.heartbeatRunner.Stop()
 	c.miscRunner.Stop()
 	c.logRunner.Stop()
```
```diff
@@ -17,7 +17,6 @@ package server
 import (
 	"bytes"
 	"context"
-	errorspkg "errors"
 	"fmt"
 	"math/rand"
 	"net/http"
@@ -490,7 +489,7 @@ func (s *Server) startServer(ctx context.Context) error {

 	s.gcSafePointManager = gc.NewSafePointManager(s.storage, s.cfg.PDServerCfg)
 	s.basicCluster = core.NewBasicCluster()
-	s.cluster = cluster.NewRaftCluster(ctx, clusterID, s.GetBasicCluster(), s.GetStorage(), syncer.NewRegionSyncer(s), s.client, s.httpClient)
+	s.cluster = cluster.NewRaftCluster(ctx, clusterID, s.GetMember(), s.GetBasicCluster(), s.GetStorage(), syncer.NewRegionSyncer(s), s.client, s.httpClient, s.tsoAllocatorManager)
 	keyspaceIDAllocator := id.NewAllocator(&id.AllocatorParams{
 		Client:   s.client,
 		RootPath: s.rootPath,
@@ -1715,29 +1714,6 @@ func (s *Server) campaignLeader() {
 	s.member.KeepLeader(ctx)
 	log.Info(fmt.Sprintf("campaign %s leader ok", s.mode), zap.String("campaign-leader-name", s.Name()))

-	if !s.IsAPIServiceMode() {
```
**Review comment:** Do we need to remove the comments? L1714, L1703-L1706

**Reply:** It still works.
```diff
-		allocator, err := s.tsoAllocatorManager.GetAllocator(tso.GlobalDCLocation)
-		if err != nil {
-			log.Error("failed to get the global TSO allocator", errs.ZapError(err))
-			return
-		}
-		log.Info("initializing the global TSO allocator")
-		if err := allocator.Initialize(0); err != nil {
-			log.Error("failed to initialize the global TSO allocator", errs.ZapError(err))
-			return
-		}
-		defer func() {
-			s.tsoAllocatorManager.ResetAllocatorGroup(tso.GlobalDCLocation, false)
-			failpoint.Inject("updateAfterResetTSO", func() {
-				if err = allocator.UpdateTSO(); !errorspkg.Is(err, errs.ErrUpdateTimestamp) {
-					log.Panic("the tso update after reset should return ErrUpdateTimestamp as expected", zap.Error(err))
-				}
-				if allocator.IsInitialize() {
-					log.Panic("the allocator should be uninitialized after reset")
-				}
-			})
-		}()
-	}
 	if err := s.reloadConfigFromKV(); err != nil {
 		log.Error("failed to reload configuration", errs.ZapError(err))
 		return
```
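Net effect of this removal: the allocator lifecycle is no longer tied to the `campaignLeader` goroutine and its deferred reset. Instead, `RaftCluster`'s service-check loop converges the allocator toward the current leadership state, so a lost leadership is observed within one `tsoServiceCheckInterval` tick (and `stopTSOJobs` is also invoked on cluster stop) rather than only when the campaign goroutine unwinds.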
**Review comment** (on the call site of `checkTSOService`)**:** Can we put this func next to `checkSchedulingService`?

**Reply:** Prefer to check it ASAP.