From 20246c7f62e2b938413f897d3f03be49e893de3d Mon Sep 17 00:00:00 2001 From: "Johannes M. Scheuermann" Date: Mon, 18 Sep 2023 12:52:42 +0200 Subject: [PATCH] Move parse logic and printing of cluster messages into the same place --- controllers/cluster_controller.go | 10 +-------- fdbclient/admin_client.go | 14 ++---------- fdbclient/common.go | 36 ++++++++++++++++++++++++------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/controllers/cluster_controller.go b/controllers/cluster_controller.go index 6364cfc68..2bd098b65 100644 --- a/controllers/cluster_controller.go +++ b/controllers/cluster_controller.go @@ -136,7 +136,7 @@ func (r *FoundationDBClusterReconciler) Reconcile(ctx context.Context, request c clusterLog.Info("Fetch machine-readable status for reconcilitation loop", "cacheStatus", cacheStatus) status, err = r.getStatusFromClusterOrDummyStatus(clusterLog, cluster) if err != nil { - return ctrl.Result{Requeue: true}, err + return ctrl.Result{Requeue: true, RequeueAfter: 10 * time.Second}, err } } @@ -481,14 +481,6 @@ func (r *FoundationDBClusterReconciler) getStatusFromClusterOrDummyStatus(logger status, err := adminClient.GetStatus() if err == nil { - if len(status.Client.Messages) > 0 { - logger.Info("found client message(s) in the machine-readable status", "messages", status.Client.Messages) - } - - if len(status.Cluster.Messages) > 0 { - logger.Info("found cluster message(s) in the machine-readable status", "messages", status.Cluster.Messages) - } - return status, nil } diff --git a/fdbclient/admin_client.go b/fdbclient/admin_client.go index b44c2abc9..15c78eebe 100644 --- a/fdbclient/admin_client.go +++ b/fdbclient/admin_client.go @@ -285,17 +285,7 @@ func (client *cliAdminClient) getStatusFromCli() (*fdbv1beta2.FoundationDBStatus return nil, err } - status := &fdbv1beta2.FoundationDBStatus{} - err = json.Unmarshal(contents, status) - if err != nil { - return nil, err - } - - // TODO (johscheuer): Build a smarter retry mechanism here for timeouts that are not timeouts on the transaction - // level but rather on the get status itself, e.g. we should be checking for `status_incomplete_timeout`. We could specify - // a retry of X or we just force the operator to reconcile again. - - return status, nil + return parseMachineReadableStatus(client.log, contents) } // getStatus uses fdbcli to connect to the FDB cluster, if the cluster is upgraded and the initial version returns no processes @@ -327,7 +317,7 @@ func (client *cliAdminClient) GetStatus() (*fdbv1beta2.FoundationDBStatus, error defer adminClientMutex.Unlock() // This will call directly the database and fetch the status information from the system key space. - status, err := getStatusFromDB(client.fdbLibClient, MaxCliTimeout) + status, err := getStatusFromDB(client.fdbLibClient, client.log, MaxCliTimeout) // There is a limitation in the multi version client if the cluster is only partially upgraded e.g. because not // all fdbserver processes are restarted, then the multi version client sometimes picks the wrong version // to connect to the cluster. This will result in an empty status only reporting the unreachable coordinators. diff --git a/fdbclient/common.go b/fdbclient/common.go index 3807c33e2..c6d7cce0d 100644 --- a/fdbclient/common.go +++ b/fdbclient/common.go @@ -23,6 +23,7 @@ package fdbclient import ( "encoding/json" "errors" + "fmt" "io/fs" "os" "path" @@ -45,6 +46,31 @@ const ( defaultTransactionTimeout = 5 * time.Second ) +func parseMachineReadableStatus(logger logr.Logger, contents []byte) (*fdbv1beta2.FoundationDBStatus, error) { + status := &fdbv1beta2.FoundationDBStatus{} + err := json.Unmarshal(contents, status) + if err != nil { + return nil, err + } + + if len(status.Client.Messages) > 0 { + logger.Info("found client message(s) in the machine-readable status", "messages", status.Client.Messages) + } + + if len(status.Cluster.Messages) > 0 { + logger.Info("found cluster message(s) in the machine-readable status", "messages", status.Cluster.Messages) + + // If the status is incomplete because of a timeout, return an error. This will force a new reconciliation. + for _, message := range status.Cluster.Messages { + if message.Name == "status_incomplete_timeout" { + return nil, fdbv1beta2.TimeoutError{Err: fmt.Errorf("found \"status_incomplete_timeout\" in cluster messages")} + } + } + } + + return status, nil +} + // getFDBDatabase opens an FDB database. func getFDBDatabase(cluster *fdbv1beta2.FoundationDBCluster) (fdb.Database, error) { clusterFile, err := createClusterFile(cluster) @@ -97,19 +123,13 @@ func getConnectionStringFromDB(libClient fdbLibClient, timeout time.Duration) ([ } // getStatusFromDB gets the database's status directly from the system key -func getStatusFromDB(libClient fdbLibClient, timeout time.Duration) (*fdbv1beta2.FoundationDBStatus, error) { +func getStatusFromDB(libClient fdbLibClient, logger logr.Logger, timeout time.Duration) (*fdbv1beta2.FoundationDBStatus, error) { contents, err := libClient.getValueFromDBUsingKey("\xff\xff/status/json", timeout) if err != nil { return nil, err } - status := &fdbv1beta2.FoundationDBStatus{} - err = json.Unmarshal(contents, status) - if err != nil { - return nil, err - } - - return status, nil + return parseMachineReadableStatus(logger, contents) } type realDatabaseClientProvider struct {