Skip to content

Commit

Permalink
Sync from server repo (886a4000af)
Browse files Browse the repository at this point in the history
  • Loading branch information
Matt Spilchen committed Sep 19, 2023
1 parent 5d77c8b commit 29bcaeb
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 21 deletions.
14 changes: 13 additions & 1 deletion vclusterops/cluster_op.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
package vclusterops

import (
"errors"
"fmt"
"net"
"strings"

"github.com/vertica/vcluster/vclusterops/util"
Expand Down Expand Up @@ -103,7 +105,7 @@ func (hostResult *HostHTTPResult) IsSuccess() bool {
}

// check only password and certificate for start_db
func (hostResult *HostHTTPResult) IsPasswordandCertificateError() bool {
func (hostResult *HostHTTPResult) IsPasswordAndCertificateError() bool {
if !hostResult.IsUnauthorizedRequest() {
return false
}
Expand Down Expand Up @@ -140,6 +142,16 @@ func (hostResult *HostHTTPResult) isException() bool {
return hostResult.status == EXCEPTION
}

// isTimeout reports whether the error recorded on this host result is, or
// wraps, a net.Error whose Timeout method returns true.
// A result that carries no error is never considered a timeout.
func (hostResult *HostHTTPResult) isTimeout() bool {
	var timeoutErr net.Error
	// errors.As returns false for a nil error, so no separate nil check is needed.
	return errors.As(hostResult.err, &timeoutErr) && timeoutErr.Timeout()
}

// getStatusString converts ResultStatus to string
func (status ResultStatus) getStatusString() string {
if status == FAILURE {
Expand Down
54 changes: 34 additions & 20 deletions vclusterops/https_poll_node_state_op.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ import (
"github.com/vertica/vcluster/vclusterops/vlog"
)

// httpRequestTimeoutSeconds is the timeout, in seconds, for each GET /v1/nodes/{node} call.
// 30 seconds is long enough for a normal HTTP request.
// If this timeout is reached, it might imply that the target IP is unreachable.
const httpRequestTimeoutSeconds = 30

type HTTPSPollNodeStateOp struct {
OpBase
OpHTTPSBase
Expand Down Expand Up @@ -92,7 +97,8 @@ func (op *HTTPSPollNodeStateOp) setupClusterHTTPRequest(hosts []string) error {
for _, host := range hosts {
httpRequest := HostHTTPRequest{}
httpRequest.Method = GetMethod
httpRequest.BuildHTTPSEndpoint("nodes")
httpRequest.Timeout = httpRequestTimeoutSeconds
httpRequest.BuildHTTPSEndpoint("nodes/" + host)
if op.useHTTPPassword {
httpRequest.Password = op.httpsPassword
httpRequest.Username = op.userName
Expand Down Expand Up @@ -175,43 +181,51 @@ type NodesInfo struct {
func (op *HTTPSPollNodeStateOp) shouldStopPolling() (bool, error) {
for host, result := range op.clusterHTTPRequest.ResultCollection {
op.logResponse(host, result)

// when we get timeout error, we know that the host is unreachable/dead
if result.isTimeout() {
return true, fmt.Errorf("[%s] cannot connect to host %s, please check if the host is still alive", op.name, host)
}

// VER-88185 vcluster start_db - password related issues
// We don't need to wait until timeout to determine if all nodes are up or not.
// If we find the wrong password for the HTTPS service on any host, we should fail immediately.
if result.IsPasswordandCertificateError() {
vlog.LogPrintError("[%s] All nodes are UP, but the credentials are incorrect. Catalog sync failed.",
if result.IsPasswordAndCertificateError() {
vlog.LogPrintError("[%s] The credentials are incorrect. The following steps like 'Catalog Sync' will not be executed.",
op.name)
return false, fmt.Errorf("[%s] wrong password/certificate for https service on host %s",
return true, fmt.Errorf("[%s] wrong password/certificate for https service on host %s",
op.name, host)
}
if result.isPassing() {
// parse the /nodes endpoint response
// parse the /nodes/{node} endpoint response
nodesInfo := NodesInfo{}
err := op.parseAndCheckResponse(host, result.content, &nodesInfo)
if err != nil {
vlog.LogPrintError("[%s] fail to parse result on host %s, details: %s",
op.name, host, err)
return false, err
return true, err
}

// check whether all nodes are up
for _, n := range nodesInfo.NodeList {
if n.State == util.NodeUpState {
op.upHosts[n.Address] = struct{}{}
// check whether the node is up
// the node list should only have one node info
if len(nodesInfo.NodeList) == 1 {
nodeInfo := nodesInfo.NodeList[0]
if nodeInfo.State == util.NodeUpState {
continue
}
} else {
// if NMA endpoint cannot function well on any of the hosts, we do not want to retry polling
return true, fmt.Errorf("[%s] expect one node's information, but got %d nodes' information"+
" from NMA /v1/nodes/{node} endpoint on host %s",
op.name, len(nodesInfo.NodeList), host)
}

// the HTTPS /nodes endpoint will return the states of all nodes
// we only need to read info from one responding node
break
}
}

op.notUpHosts = util.MapKeyDiff(op.allHosts, op.upHosts)
if len(op.notUpHosts) == 0 {
vlog.LogPrintInfoln("All nodes are up")
return true, nil
// If we cannot get a correct response from the current node, we assume the node is not up and wait for the next poll.
// If the node is busy and cannot return a correct response in this poll, a following poll should get a correct response from it.
return false, nil
}

return false, nil
vlog.LogPrintInfoln("All nodes are up")
return true, nil
}
39 changes: 39 additions & 0 deletions vclusterops/https_poll_node_state_op_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*
(c) Copyright [2023] Open Text.
Licensed under the Apache License, Version 2.0 (the "License");
You may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package vclusterops

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestTimeoutCase(t *testing.T) {
var instructions []ClusterOp
// use a non-existing IP to test the timeout error
// 192.0.2.1 is one that is reserved for test purpose (by RFC 5737)
hosts := []string{"192.0.2.1"}
username := "testUser"
password := "testPwd"
httpsPollNodeStateOp, err := makeHTTPSPollNodeStateOp(hosts, true, username, &password)
assert.Nil(t, err)
instructions = append(instructions, &httpsPollNodeStateOp)

certs := HTTPSCerts{}
clusterOpEngine := MakeClusterOpEngine(instructions, &certs)
err = clusterOpEngine.Run()
assert.ErrorContains(t, err, "[HTTPSPollNodeStateOp] cannot connect to host 192.0.2.1, please check if the host is still alive")
}

0 comments on commit 29bcaeb

Please sign in to comment.