client/tso: double-check the contexts to prevent waiting for TSO requests in closed chan (#7962) #8090

Closed
11 changes: 11 additions & 0 deletions client/client.go
@@ -598,6 +598,17 @@ func (c *client) GetLocalTSAsync(ctx context.Context, dcLocation string) TSFuture
ctx = opentracing.ContextWithSpan(ctx, span)
}

req := c.getTSORequest(ctx, dcLocation)
if err := c.dispatchTSORequestWithRetry(req); err != nil {
req.tryDone(err)
}
return req
}

func (c *client) getTSORequest(ctx context.Context, dcLocation string) *tsoRequest {
req := tsoReqPool.Get().(*tsoRequest)
req.requestCtx = ctx
req.clientCtx = c.ctx
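The new GetLocalTSAsync body hands the request to dispatchTSORequestWithRetry, whose definition is not part of the hunks shown here. A minimal sketch of what such a wrapper could look like, assuming dispatchRequest now reports whether a failure is retryable; the function name, constants, and retry parameters below are illustrative, not the client's actual values:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// Illustrative retry parameters; the real client's values may differ.
const (
	dispatchRetryDelay = 50 * time.Millisecond
	dispatchRetryCount = 2
)

// dispatchWithRetry retries a dispatch only while the underlying call reports
// the failure as retryable, and returns the last error otherwise.
func dispatchWithRetry(dispatch func() (retryable bool, err error)) error {
	var (
		retryable bool
		err       error
	)
	for i := 0; i < dispatchRetryCount; i++ {
		// Do not delay before the first attempt.
		if i > 0 {
			time.Sleep(dispatchRetryDelay)
		}
		retryable, err = dispatch()
		if !retryable {
			break
		}
	}
	return err
}

func main() {
	attempts := 0
	err := dispatchWithRetry(func() (bool, error) {
		attempts++
		if attempts == 1 {
			// e.g. the tsoClient was reset during a service mode switch.
			return true, errors.New("tso client is closing")
		}
		return false, nil
	})
	fmt.Println(attempts, err) // 2 <nil>
}
```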
34 changes: 33 additions & 1 deletion client/tso_batch_controller.go
@@ -17,6 +17,14 @@ package pd
import (
"context"
"time"

"github.com/pingcap/errors"
"github.com/pingcap/log"
"github.com/tikv/pd/client/tsoutil"
"go.uber.org/zap"
)

type tsoBatchController struct {
@@ -130,9 +138,33 @@ func (tbc *tsoBatchController) adjustBestBatchSize() {
}
}

func (tbc *tsoBatchController) finishCollectedRequests(physical, firstLogical int64, suffixBits uint32, err error) {
for i := 0; i < tbc.collectedRequestCount; i++ {
tsoReq := tbc.collectedRequests[i]
tsoReq.physical, tsoReq.logical = physical, tsoutil.AddLogical(firstLogical, int64(i), suffixBits)
defer trace.StartRegion(tsoReq.requestCtx, "pdclient.tsoReqDequeue").End()
tsoReq.tryDone(err)
}
// Prevent the finished requests from being processed again.
tbc.collectedRequestCount = 0
}

func (tbc *tsoBatchController) revokePendingRequests(err error) {
for i := 0; i < len(tbc.tsoRequestCh); i++ {
req := <-tbc.tsoRequestCh
req.tryDone(err)
}
}

func (tbc *tsoBatchController) clear() {
log.Info("[pd] clear the tso batch controller",
zap.Int("max-batch-size", tbc.maxBatchSize), zap.Int("best-batch-size", tbc.bestBatchSize),
zap.Int("collected-request-count", tbc.collectedRequestCount), zap.Int("pending-request-count", len(tbc.tsoRequestCh)))
tsoErr := errors.WithStack(errClosing)
tbc.finishCollectedRequests(0, 0, 0, tsoErr)
tbc.revokePendingRequests(tsoErr)
}
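finishCollectedRequests assigns each collected request its own timestamp derived from the batch's first logical value via tsoutil.AddLogical. Based on how it is used here, a stand-in for that arithmetic (addLogical below is an assumption, not the tsoutil API; the suffix bits appear to be reserved for distinguishing TSO allocators):

```go
package main

import "fmt"

// addLogical is a stand-in for how the batch controller appears to spread a
// batch over distinct logical timestamps: the i-th collected request gets the
// first logical value plus i, shifted past the suffix bits. This mirrors the
// call site above and is not the actual tsoutil implementation.
func addLogical(firstLogical, i int64, suffixBits uint32) int64 {
	return firstLogical + i<<suffixBits
}

func main() {
	const suffixBits = 2 // assumed value, purely for illustration
	firstLogical := int64(100)
	for i := int64(0); i < 3; i++ {
		fmt.Println(addLogical(firstLogical, i, suffixBits))
	}
	// Prints 100, 104, 108: one distinct logical timestamp per request.
}
```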
12 changes: 11 additions & 1 deletion client/tso_client.go
@@ -21,7 +21,6 @@ import (
"sync"
"time"

"github.com/pingcap/errors"
"github.com/pingcap/log"
"github.com/tikv/pd/client/errs"
"go.uber.org/zap"
@@ -62,6 +61,13 @@ var tsoReqPool = sync.Pool{
},
}

func (req *tsoRequest) tryDone(err error) {
select {
case req.done <- err:
default:
}
}

type tsoClient struct {
ctx context.Context
cancel context.CancelFunc
@@ -140,9 +146,13 @@ func (c *tsoClient) Close() {
c.tsoDispatcher.Range(func(_, dispatcherInterface interface{}) bool {
if dispatcherInterface != nil {
dispatcher := dispatcherInterface.(*tsoDispatcher)
dispatcher.dispatcherCancel()
dispatcher.tsoBatchController.clear()
}
return true
})
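tryDone, defined above, is what lets both the finish and the revoke paths notify a request without ever blocking on a done channel that was already written to. A standalone demonstration with a stand-in request type (not the client's real one):

```go
package main

import (
	"errors"
	"fmt"
)

// tsoRequest here is a minimal stand-in; the real type carries contexts,
// timestamps, and more.
type tsoRequest struct {
	done chan error
}

// tryDone mirrors the non-blocking notification above: if the buffered done
// channel already holds a result, the extra notification is dropped instead
// of blocking the caller.
func (req *tsoRequest) tryDone(err error) {
	select {
	case req.done <- err:
	default:
	}
}

func main() {
	req := &tsoRequest{done: make(chan error, 1)}
	req.tryDone(errors.New("first"))
	req.tryDone(errors.New("second")) // a plain send here would block forever
	fmt.Println(<-req.done)           // prints "first"
}
```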
36 changes: 36 additions & 0 deletions client/tso_dispatcher.go
@@ -74,8 +74,42 @@ func (c *tsoClient) dispatchRequest(dcLocation string, request *tsoRequest) (bool, error) {
c.svcDiscovery.ScheduleCheckMemberChanged()
// The dispatcher may be created shortly after the member check, so treat this as retryable.
return true, err
}

defer trace.StartRegion(request.requestCtx, "pdclient.tsoReqEnqueue").End()
select {
case <-request.requestCtx.Done():
// Caller cancelled the request, no need to retry.
return false, request.requestCtx.Err()
case <-request.clientCtx.Done():
// Client is closed, no need to retry.
return false, request.clientCtx.Err()
case <-c.ctx.Done():
// tsoClient is closed due to the PD service mode switch, which is retryable.
return true, c.ctx.Err()
default:
// This failpoint will increase the possibility that the request is sent to a closed dispatcher.
failpoint.Inject("delayDispatchTSORequest", func() {
time.Sleep(time.Second)
})
dispatcher.(*tsoDispatcher).tsoBatchController.tsoRequestCh <- request
}
// Check the contexts again to make sure the request is not been sent to a closed dispatcher.
// Never retry on these conditions to prevent unexpected data race.
select {
case <-request.requestCtx.Done():
return false, request.requestCtx.Err()
case <-request.clientCtx.Done():
return false, request.clientCtx.Err()
case <-c.ctx.Done():
return false, c.ctx.Err()
default:
}
return false, nil
}

// TSFuture is a future which promises to return a TSO.
@@ -320,6 +354,8 @@ func (c *tsoClient) handleDispatcher(
cc.(*tsoConnectionContext).cancel()
return true
})
// Clear the tso batch controller.
tbc.clear()
c.wg.Done()
}()
// Call updateTSOConnectionCtxs once to init the connectionCtxs first.
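The double-check exists because a send into tsoRequestCh can succeed even after the owning dispatcher has stopped reading: the channel is buffered, so the enqueue does not fail, and a caller that then waits only on the request's done channel waits forever. A self-contained sketch of that hazard, using stand-in types rather than the client's real ones, where re-checking the dispatcher's context after the send lets the caller bail out:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// req is a stand-in for a TSO request; only the done channel matters here.
type req struct{ done chan error }

func main() {
	requestCh := make(chan *req, 8)
	ctx, cancel := context.WithCancel(context.Background())

	// A dispatcher that serves requests until its context is cancelled.
	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case r := <-requestCh:
				r.done <- nil
			}
		}
	}()

	cancel()                          // the dispatcher shuts down
	time.Sleep(10 * time.Millisecond) // let it observe the cancellation

	r := &req{done: make(chan error, 1)}
	requestCh <- r // the buffered send still succeeds, but nobody will read it

	// Waiting only on r.done would block forever; also watching the
	// dispatcher's context lets the caller detect the closed dispatcher.
	select {
	case <-ctx.Done():
		fmt.Println("dispatcher closed after enqueue:", ctx.Err())
	case err := <-r.done:
		fmt.Println("request served:", err)
	}
}
```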
50 changes: 50 additions & 0 deletions tests/integrations/tso/client_test.go
@@ -233,6 +233,56 @@ func (suite *tsoClientTestSuite) TestRandomShutdown() {
wg.Wait()
suite.TearDownSuite()
suite.SetupSuite()
re.NoError(failpoint.Disable("github.com/tikv/pd/pkg/tso/fastUpdatePhysicalInterval"))
}

func (suite *tsoClientTestSuite) TestGetTSWhileResettingTSOClient() {
re := suite.Require()
re.NoError(failpoint.Enable("github.com/tikv/pd/client/delayDispatchTSORequest", "return(true)"))
var (
clients []pd.Client
stopSignal atomic.Bool
wg sync.WaitGroup
)
// Create independent clients to prevent interfering with other tests.
if suite.legacy {
client, err := pd.NewClientWithContext(suite.ctx, suite.getBackendEndpoints(), pd.SecurityOption{}, pd.WithForwardingOption(true))
re.NoError(err)
clients = []pd.Client{client}
} else {
clients = mcs.WaitForMultiKeyspacesTSOAvailable(suite.ctx, re, suite.keyspaceIDs, suite.getBackendEndpoints())
}
wg.Add(tsoRequestConcurrencyNumber * len(clients))
for i := 0; i < tsoRequestConcurrencyNumber; i++ {
for _, client := range clients {
go func(client pd.Client) {
defer wg.Done()
var lastTS uint64
for !stopSignal.Load() {
physical, logical, err := client.GetTS(suite.ctx)
if err != nil {
re.ErrorContains(err, context.Canceled.Error())
} else {
ts := tsoutil.ComposeTS(physical, logical)
re.Less(lastTS, ts)
lastTS = ts
}
}
}(client)
}
}
// Reset the TSO clients while requesting TSO concurrently.
for i := 0; i < tsoRequestConcurrencyNumber; i++ {
for _, client := range clients {
client.(interface{ ResetTSOClient() }).ResetTSOClient()
}
}
stopSignal.Store(true)
wg.Wait()
re.NoError(failpoint.Disable("github.com/tikv/pd/client/delayDispatchTSORequest"))
}

// When we upgrade the PD cluster, there may be a period of time that the old and new PDs are running at the same time.
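The test calls ResetTSOClient through an assertion to an anonymous interface, which avoids naming the concrete client implementation. A minimal illustration of that idiom with a hypothetical fakeClient type:

```go
package main

import "fmt"

// fakeClient is hypothetical; any type with a ResetTSOClient method satisfies
// the ad-hoc interface below, which is how the test reaches the method
// without depending on the concrete pd client type.
type fakeClient struct{}

func (fakeClient) ResetTSOClient() { fmt.Println("TSO client reset") }

func main() {
	var c interface{} = fakeClient{}
	if r, ok := c.(interface{ ResetTSOClient() }); ok {
		r.ResetTSOClient()
	}
}
```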