From 4b5aecb1b15c3ec9b35d5dd706437ba09703e33e Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Mon, 18 Sep 2023 10:32:27 +0200 Subject: [PATCH] Allow to use max timeout for get status and allow to specify the max timeout --- fdbclient/admin_client.go | 61 +++++++++------------------------------ fdbclient/common.go | 11 ++++--- setup/setup.go | 3 ++ 3 files changed, 23 insertions(+), 52 deletions(-) diff --git a/fdbclient/admin_client.go b/fdbclient/admin_client.go index 7ca298356..b44c2abc9 100644 --- a/fdbclient/admin_client.go +++ b/fdbclient/admin_client.go @@ -111,17 +111,6 @@ func NewCliAdminClient(cluster *fdbv1beta2.FoundationDBCluster, _ client.Client, }, nil } -// getMaxTimeout returns the maximum timeout, this is either the default of 40 seconds or if the provided default timeout -// is higher it will be the default cli timeout. -func (client *cliAdminClient) getMaxTimeout() time.Duration { - // TODO (johscheuer): ALlow to specify the max timeout. - if DefaultCLITimeout > 40*time.Second { - return DefaultCLITimeout - } - - return 40 * time.Second -} - // cliCommand describes a command that we are running against FDB. type cliCommand struct { // binary is the binary to run. @@ -282,40 +271,11 @@ func (client *cliAdminClient) runCommand(command cliCommand) (string, error) { return outputString, nil } -// runCommandWithBackoff is a wrapper around runCommand which allows retrying commands if they hit a timeout. -func (client *cliAdminClient) runCommandWithBackoff(command string) (string, error) { - currentTimeout := DefaultCLITimeout - - var rawResult string - var err error - - // This method will be retrying to get the status if a timeout is seen. The timeout will be doubled everytime we try - // it with the default timeout of 10s we will try it 3 times with the following timeouts: 10s - 20s - 40s. We have - // seen that during upgrades of version incompatible version, when not all coordinators are properly restarted that - // the response time will be increased. - for currentTimeout <= client.getMaxTimeout() { - rawResult, err = client.runCommand(cliCommand{command: command, timeout: currentTimeout}) - if err == nil { - break - } - - var timoutError *fdbv1beta2.TimeoutError - if errors.As(err, &timoutError) { - client.log.Info("timeout issue will retry with higher timeout") - currentTimeout *= 2 - continue - } - - // If any error other than a timeout happens return this error and don't retry. - return "", err - } - - return rawResult, err -} - // getStatusFromCli uses the fdbcli to connect to the FDB cluster func (client *cliAdminClient) getStatusFromCli() (*fdbv1beta2.FoundationDBStatus, error) { - output, err := client.runCommandWithBackoff("status json") + // Always use the max timeout here. Otherwise we will retry multiple times with an increasing timeout. As the + // timeout is only the upper bound using directly the max timeout reduces the calls to a single call. + output, err := client.runCommand(cliCommand{command: "status json", timeout: MaxCliTimeout}) if err != nil { return nil, err } @@ -331,6 +291,10 @@ func (client *cliAdminClient) getStatusFromCli() (*fdbv1beta2.FoundationDBStatus return nil, err } + // TODO (johscheuer): Build a smarter retry mechanism here for timeouts that are not timeouts on the transaction + // level but rather on the get status itself, e.g. we should be checking for `status_incomplete_timeout`. We could specify + // a retry of X or we just force the operator to reconcile again. + return status, nil } @@ -343,8 +307,9 @@ func (client *cliAdminClient) getStatus() (*fdbv1beta2.FoundationDBStatus, error } // If the status doesn't contain any processes and we are doing an upgrade, we probably use the wrong fdbcli version - // and we have to fallback to the on specified in out spec.version. + // and we have to fallback to the one specified in our spec.version. if (status == nil || len(status.Cluster.Processes) == 0) && client.Cluster.IsBeingUpgradedWithVersionIncompatibleVersion() { + client.log.V(1).Info("retry fetching status with version specified in spec.Version", "error", err, "status", status) // Create a copy of the cluster and make use of the desired version instead of the last observed running version. clusterCopy := client.Cluster.DeepCopy() clusterCopy.Status.RunningVersion = clusterCopy.Spec.Version @@ -362,7 +327,7 @@ func (client *cliAdminClient) GetStatus() (*fdbv1beta2.FoundationDBStatus, error defer adminClientMutex.Unlock() // This will call directly the database and fetch the status information from the system key space. - status, err := getStatusFromDB(client.fdbLibClient) + status, err := getStatusFromDB(client.fdbLibClient, MaxCliTimeout) // There is a limitation in the multi version client if the cluster is only partially upgraded e.g. because not // all fdbserver processes are restarted, then the multi version client sometimes picks the wrong version // to connect to the cluster. This will result in an empty status only reporting the unreachable coordinators. @@ -443,7 +408,7 @@ func (client *cliAdminClient) ExcludeProcesses(addresses []fdbv1beta2.ProcessAdd excludeCommand.WriteString(fdbv1beta2.ProcessAddressesString(addresses, " ")) - _, err = client.runCommand(cliCommand{command: excludeCommand.String(), timeout: client.getMaxTimeout()}) + _, err = client.runCommand(cliCommand{command: excludeCommand.String(), timeout: MaxCliTimeout}) return err } @@ -494,7 +459,7 @@ func (client *cliAdminClient) KillProcesses(addresses []fdbv1beta2.ProcessAddres fdbv1beta2.ProcessAddressesStringWithoutFlags(addresses, " "), ) // Run the kill command once with the max timeout to reduce the risk of multiple recoveries happening. - _, err := client.runCommand(cliCommand{command: killCommand, timeout: client.getMaxTimeout()}) + _, err := client.runCommand(cliCommand{command: killCommand, timeout: MaxCliTimeout}) return err } @@ -537,7 +502,7 @@ func cleanConnectionStringOutput(input string) string { func (client *cliAdminClient) GetConnectionString() (string, error) { // This will call directly the database and fetch the connection string // from the system key space. - outputBytes, err := getConnectionStringFromDB(client.fdbLibClient) + outputBytes, err := getConnectionStringFromDB(client.fdbLibClient, DefaultCLITimeout) if err != nil { return "", err diff --git a/fdbclient/common.go b/fdbclient/common.go index a021b9b0f..3807c33e2 100644 --- a/fdbclient/common.go +++ b/fdbclient/common.go @@ -38,6 +38,9 @@ import ( // DefaultCLITimeout is the default timeout for CLI commands. var DefaultCLITimeout = 10 * time.Second +// MaxCliTimeout is the maximum CLI timeout that will be used for requests that might be slower to respond. +var MaxCliTimeout = 60 * time.Second + const ( defaultTransactionTimeout = 5 * time.Second ) @@ -89,13 +92,13 @@ func ensureClusterFileIsPresent(dir string, uid string, connectionString string) } // getConnectionStringFromDB gets the database's connection string directly from the system key -func getConnectionStringFromDB(libClient fdbLibClient) ([]byte, error) { - return libClient.getValueFromDBUsingKey("\xff/coordinators", DefaultCLITimeout) +func getConnectionStringFromDB(libClient fdbLibClient, timeout time.Duration) ([]byte, error) { + return libClient.getValueFromDBUsingKey("\xff/coordinators", timeout) } // getStatusFromDB gets the database's status directly from the system key -func getStatusFromDB(libClient fdbLibClient) (*fdbv1beta2.FoundationDBStatus, error) { - contents, err := libClient.getValueFromDBUsingKey("\xff\xff/status/json", DefaultCLITimeout) +func getStatusFromDB(libClient fdbLibClient, timeout time.Duration) (*fdbv1beta2.FoundationDBStatus, error) { + contents, err := libClient.getValueFromDBUsingKey("\xff\xff/status/json", timeout) if err != nil { return nil, err } diff --git a/setup/setup.go b/setup/setup.go index 55b2fd4da..7abedd99e 100644 --- a/setup/setup.go +++ b/setup/setup.go @@ -68,6 +68,7 @@ type Options struct { LabelSelector string WatchNamespace string CliTimeout int + MaxCliTimeout int MaxConcurrentReconciles int LogFileMaxSize int LogFileMaxAge int @@ -90,6 +91,7 @@ func (o *Options) BindFlags(fs *flag.FlagSet) { ) fs.StringVar(&o.LogFile, "log-file", "", "The path to a file to write logs to.") fs.IntVar(&o.CliTimeout, "cli-timeout", 10, "The timeout to use for CLI commands in seconds.") + fs.IntVar(&o.MaxCliTimeout, "max-cli-timeout", 60, "The maximum timeout to use for CLI commands in seconds. This timeout is used for CLI requests that are known to be potentially slow like get status or exclude.") fs.IntVar(&o.MaxConcurrentReconciles, "max-concurrent-reconciles", 1, "Defines the maximum number of concurrent reconciles for all controllers.") fs.BoolVar(&o.CleanUpOldLogFile, "cleanup-old-cli-logs", true, "Defines if the operator should delete old fdbcli log files.") fs.DurationVar(&o.LogFileMinAge, "log-file-min-age", 5*time.Minute, "Defines the minimum age of fdbcli log files before removing when \"--cleanup-old-cli-logs\" is set.") @@ -143,6 +145,7 @@ func StartManager( setupLog := logger.WithName("setup") fdbclient.DefaultCLITimeout = time.Duration(operatorOpts.CliTimeout) * time.Second + fdbclient.MaxCliTimeout = time.Duration(operatorOpts.MaxCliTimeout) * time.Second options := ctrl.Options{ Scheme: scheme,