Skip to content

Commit

Permalink
Sync from server repo (6b462bb9bb9)
Browse files Browse the repository at this point in the history
  • Loading branch information
cchen-vertica committed Jun 3, 2024
1 parent 61fab78 commit c66c9e9
Show file tree
Hide file tree
Showing 6 changed files with 184 additions and 122 deletions.
93 changes: 93 additions & 0 deletions vclusterops/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -359,3 +359,96 @@ func validateHostMaps(hosts []string, maps ...map[string]string) error {
}
return allErrors
}

// reIP will do re-IP before sandboxing/unsandboxing/removing a subcluster if we find the
// catalog has stale node IPs.
// reIP will be called in three cases:
//  1. when sandboxing a subcluster, we will do re-ip in target sandbox since the node IPs in
//     the main cluster could be changed. For example, a pod in main cluster gets restarted in k8s
//     will cause inconsistent IPs between the sandbox and the main cluster. The target sandbox will
//     have a stale node IP so adding that pod to the sandbox will fail.
//  2. when unsandboxing a subcluster, we will do re-ip in the main cluster since the node IPs
//     in the sandbox could be changed. For example, a pod in a sandbox gets restarted in k8s will
//     cause inconsistent IPs between the sandbox and the main cluster. The main cluster will
//     have a stale node IP so moving that pod back to the main cluster will fail.
//  3. when removing a subcluster, we will do re-ip in the main cluster since the node IPs in
//     the subcluster could be changed. This is a special case in k8s online upgrade, when a pod in
//     a transient subcluster gets killed, we will not restart the pods in the subcluster. Instead,
//     we will remove the subcluster. At this time, the nodes inside the subcluster have different IPs
//     than the ones in the catalog, so removing subcluster will fail when deleting the catalog directories.
//     We cannot find the correct nodes to do the deletion.
//
// Parameters:
//   - options: database options; options.Hosts is temporarily replaced by primaryUpHost while
//     the node information is fetched and is always restored before reIP returns.
//   - scName: name of the subcluster being operated on; forwarded to doReIP.
//   - primaryUpHost: the single host used as the initiator to fetch node info and run re-ip.
//   - nodeNameAddressMap: expected address for each node name; nodes missing from this map
//     are left untouched.
//   - reloadSpread: forwarded to doReIP to decide whether spread is reloaded after re-ip.
//
// It returns nil when no node address differs from the expected one, otherwise the result
// of doReIP. An error is returned if the node information cannot be fetched.
func (vcc *VClusterCommands) reIP(options *DatabaseOptions, scName, primaryUpHost string,
	nodeNameAddressMap map[string]string, reloadSpread bool) error {
	reIPList := []ReIPInfo{}
	reIPHosts := []string{}
	vdb := makeVCoordinationDatabase()

	backupHosts := options.Hosts
	// only use one up node in the sandbox/main-cluster to retrieve nodes' info,
	// then we can get the latest node IPs in the sandbox/main-cluster.
	// When the operation is sandbox, the initiator will be a primary up node
	// from the target sandbox.
	// When the operation is unsandbox, the initiator will be a primary up node
	// from the main cluster.
	// When the operation is remove_subcluster, the initiator will be a primary
	// up node from the main cluster.
	initiator := []string{primaryUpHost}
	options.Hosts = initiator
	err := vcc.getVDBFromRunningDBIncludeSandbox(&vdb, options, AnySandbox)
	// restore the options.Hosts for later creating sandbox/unsandbox instructions.
	// This is done before the error check so that a failed fetch does not leave the
	// caller's options pointing at the initiator only.
	options.Hosts = backupHosts
	if err != nil {
		return fmt.Errorf("host %q in database is not available: %w", primaryUpHost, err)
	}

	// if the current node IPs don't match the expected ones, we need to do re-ip
	for _, vnode := range vdb.HostNodeMap {
		address, ok := nodeNameAddressMap[vnode.Name]
		if ok && address != vnode.Address {
			reIPList = append(reIPList, ReIPInfo{NodeName: vnode.Name, TargetAddress: address})
			reIPHosts = append(reIPHosts, address)
		}
	}
	if len(reIPList) > 0 {
		return vcc.doReIP(options, scName, initiator, reIPHosts, reIPList, reloadSpread)
	}
	return nil
}

// doReIP will call NMA and HTTPs endpoints to fix the IPs in the catalog.
// It will execute below steps:
// 1. collect network profile for the nodes that need to re-ip
// 2. execute re-ip on a primary up host, one operation per node in reIPList
// 3. reload spread on a primary up host if reloadSpread is true
//
// Parameters:
//   - options: database options that supply the credentials and certificates.
//   - scName: subcluster name, only used in the returned error message.
//   - initiator: the host(s) the re-ip and reload-spread operations are built with.
//   - reIPHosts: the addresses whose network profile is collected.
//   - reIPList: node name and target address for every node to re-ip.
//   - reloadSpread: whether to append a reload-spread operation after the re-ip operations.
//
// It returns an error if the password setup fails, if any operation cannot be built, or
// if running the instructions fails.
func (vcc *VClusterCommands) doReIP(options *DatabaseOptions, scName string,
	initiator, reIPHosts []string, reIPList []ReIPInfo, reloadSpread bool) error {
	var instructions []clusterOp
	nmaNetworkProfileOp := makeNMANetworkProfileOp(reIPHosts)
	err := options.setUsePassword(vcc.Log)
	if err != nil {
		return err
	}
	instructions = append(instructions, &nmaNetworkProfileOp)
	for _, reIPNode := range reIPList {
		httpsReIPOp, e := makeHTTPSReIPOpWithHosts(initiator, []string{reIPNode.NodeName},
			[]string{reIPNode.TargetAddress}, options.usePassword, options.UserName, options.Password)
		if e != nil {
			return e
		}
		instructions = append(instructions, &httpsReIPOp)
	}
	if reloadSpread {
		httpsReloadSpreadOp, e := makeHTTPSReloadSpreadOpWithInitiator(initiator, options.usePassword, options.UserName, options.Password)
		if e != nil {
			// return the error of this call; err is nil at this point, so returning it
			// would hide the failure from the caller
			return e
		}
		instructions = append(instructions, &httpsReloadSpreadOp)
	}
	certs := httpsCerts{key: options.Key, cert: options.Cert, caCert: options.CaCert}
	clusterOpEngine := makeClusterOpEngine(instructions, &certs)
	err = clusterOpEngine.run(vcc.Log)
	if err != nil {
		return fmt.Errorf("failed to re-ip nodes of subcluster %q: %w", scName, err)
	}

	return nil
}
79 changes: 43 additions & 36 deletions vclusterops/nma_vertica_version_op.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,51 +338,66 @@ func (op *nmaVerticaVersionOp) prepareHostNodeMap(execContext *opEngineExecConte
hostSCMap[host] = vnode.Subcluster.Name
scHostsMap[vnode.Subcluster.Name] = append(scHostsMap[vnode.Subcluster.Name], host)
}
// find subclusters that hold the target hosts
targetSCs := []string{}
for _, host := range op.targetNodeIPs {
sc, ok := hostSCMap[host]
if ok {
targetSCs = append(targetSCs, sc)
} else {
return hostNodeMap, fmt.Errorf("[%s] host %s does not exist in the database", op.name, host)
}
}
// find all hosts that are in the target subclusters
allHostsInTargetSCs := []string{}
for _, sc := range targetSCs {
hosts, ok := scHostsMap[sc]
if ok {
allHostsInTargetSCs = append(allHostsInTargetSCs, hosts...)
} else {
return hostNodeMap, fmt.Errorf("[%s] internal error: subcluster %s was lost when preparing the hosts", op.name, sc)
}
allHostsInTargetSCs, err := op.findHostsInTargetSubclusters(hostSCMap, scHostsMap)
if err != nil {
return hostNodeMap, err
}
// get host-node map for all hosts in target subclusters
hostNodeMap = util.FilterMapByKey(execContext.nmaVDatabase.HostNodeMap, allHostsInTargetSCs)
}
return hostNodeMap, nil
}

// prepareHostNodeMapWithVDB is a helper to make a host-node map for nodes in the main cluster
// or in a sandbox
// prepareHostNodeMapWithVDB is a helper to make a host-node map for all nodes in the
// subclusters of target nodes
func (op *nmaVerticaVersionOp) prepareHostNodeMapWithVDB() (vHostNodeMap, error) {
if len(op.targetNodeIPs) == 0 {
return op.vdb.HostNodeMap, nil
}
hostNodeMap := makeVHostNodeMap()
// we pass in the first host because we expect all of the
// target hosts to belong to the same cluster
sbName, err := op.getSandboxName(op.targetNodeIPs[0])
hostSCMap := make(map[string]string)
scHostsMap := make(map[string][]string)
for host, vnode := range op.vdb.HostNodeMap {
hostSCMap[host] = vnode.Subcluster
scHostsMap[vnode.Subcluster] = append(scHostsMap[vnode.Subcluster], host)
}
allHostsInTargetSCs, err := op.findHostsInTargetSubclusters(hostSCMap, scHostsMap)
if err != nil {
return hostNodeMap, err
}
for host, vnode := range op.vdb.HostNodeMap {
if vnode.Sandbox == sbName {
hostNodeMap[host] = vnode
// get host-node map for all hosts in target subclusters
hostNodeMap = util.FilterMapByKey(op.vdb.HostNodeMap, allHostsInTargetSCs)

return hostNodeMap, nil
}

// findHostsInTargetSubclusters is a helper function to get all hosts in the subclusters of
// the target nodes (op.targetNodeIPs). The parameters of this function are two maps:
// 1. hostSCMap: host-subcluster map for the entire database
// 2. scHostsMap: subcluster-hosts map for the entire database
// An error is returned when a target host is missing from hostSCMap, or when a subcluster
// found for a target host is missing from scHostsMap.
// Subclusters are not de-duplicated: a subcluster shared by several target hosts is
// recorded once per target host, so its hosts can appear more than once in the result.
func (op *nmaVerticaVersionOp) findHostsInTargetSubclusters(hostSCMap map[string]string,
scHostsMap map[string][]string) ([]string, error) {
allHostsInTargetSCs := []string{}
// find subclusters that hold the target hosts
targetSCs := []string{}
for _, host := range op.targetNodeIPs {
sc, ok := hostSCMap[host]
if ok {
targetSCs = append(targetSCs, sc)
} else {
return allHostsInTargetSCs, fmt.Errorf("[%s] host %s does not exist in the database", op.name, host)
}
}
// NOTE(review): the return below looks like a removed line from the diff view shown
// inline (hostNodeMap is not declared in this function) - confirm against the actual file.
return hostNodeMap, nil
// find all hosts that are in the target subclusters
for _, sc := range targetSCs {
hosts, ok := scHostsMap[sc]
if ok {
allHostsInTargetSCs = append(allHostsInTargetSCs, hosts...)
} else {
return allHostsInTargetSCs, fmt.Errorf("[%s] internal error: subcluster %s was lost when preparing the hosts", op.name, sc)
}
}
return allHostsInTargetSCs, nil
}

func (op *nmaVerticaVersionOp) buildHostVersionMapDefault() {
Expand Down Expand Up @@ -443,11 +458,3 @@ func (op *nmaVerticaVersionOp) buildHostVersionMapWithVDB(execContext *opEngineE
}
return nil
}

// getSandboxName looks up the given host in op.vdb.HostNodeMap and returns the sandbox
// recorded for that node. It returns an empty string and an error when the host is not
// present in the map.
func (op *nmaVerticaVersionOp) getSandboxName(host string) (string, error) {
	if node, found := op.vdb.HostNodeMap[host]; found {
		return node.Sandbox, nil
	}
	return "", fmt.Errorf("[%s] host %s does not exist in the database", op.name, host)
}
22 changes: 22 additions & 0 deletions vclusterops/remove_subcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ type VRemoveScOptions struct {
DatabaseOptions
SCName string // subcluster to remove from database
ForceDelete bool // whether force delete directories
// The expected node names with their IPs in the subcluster, the user of vclusterOps needs
// to make sure the provided values are correct. This option will be used to do re-ip in
// the cluster that contains the subcluster.
NodeNameAddressMap map[string]string
// A primary up host in another subcluster that belongs to same cluster as the target subcluster.
// This option will be used to do re-ip in the cluster.
PrimaryUpHost string
}

func VRemoveScOptionsFactory() VRemoveScOptions {
Expand Down Expand Up @@ -141,6 +148,21 @@ func (vcc VClusterCommands) VRemoveSubcluster(removeScOpt *VRemoveScOptions) (VC
return vdb, err
}

// If the users provide extra node information, we will check and do re-ip for the nodes in
// the subcluster if necessary. This is to address the case where catalog has stale IPs of the
// nodes in the subcluster, which would cause a node removal failure at delete-directory step.
if removeScOpt.PrimaryUpHost != "" && len(removeScOpt.NodeNameAddressMap) > 0 {
e := vcc.reIP(&removeScOpt.DatabaseOptions,
removeScOpt.SCName,
removeScOpt.PrimaryUpHost,
removeScOpt.NodeNameAddressMap,
// we will do reload spread in remove_node so we don't need to do reload spread here
false /*reload spread*/)
if e != nil {
return vdb, e
}
}

// pre-check: should not remove the default subcluster
vcc.PrintInfo("Performing remove_subcluster pre-checks")
hostsToRemove, err := vcc.removeScPreCheck(&vdb, removeScOpt)
Expand Down
86 changes: 1 addition & 85 deletions vclusterops/sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ func (options *VSandboxOptions) runCommand(vcc VClusterCommands) error {
// to provide some node information
if options.SandboxPrimaryUpHost != "" && len(options.NodeNameAddressMap) > 0 {
err := vcc.reIP(&options.DatabaseOptions, options.SCName, options.SandboxPrimaryUpHost,
options.NodeNameAddressMap)
options.NodeNameAddressMap, true /*reload spread*/)
if err != nil {
return err
}
Expand Down Expand Up @@ -242,87 +242,3 @@ func runSandboxCmd(vcc VClusterCommands, i sandboxInterface) error {

return i.runCommand(vcc)
}

// reIP will do re-IP before sandboxing/unsandboxing if we find the catalog has stale node IPs.
// reIP will be called in two cases:
//  1. when sandboxing a subcluster, we will do re-ip in target sandbox since the node IPs in
//     the main cluster could be changed. For example, a pod in main cluster gets restarted in k8s
//     will cause inconsistent IPs between the sandbox and the main cluster. The target sandbox will
//     have a stale node IP so adding that pod to the sandbox will fail.
//  2. when unsandboxing a subcluster, we will do re-ip in the main cluster since the node IPs
//     in the sandbox could be changed. For example, a pod in a sandbox gets restarted in k8s will
//     cause inconsistent IPs between the sandbox and the main cluster. The main cluster will
//     have a stale node IP so moving that pod back to the main cluster will fail.
//
// options.Hosts is temporarily replaced by primaryUpHost while the node information is
// fetched and is always restored before reIP returns. nodeNameAddressMap holds the expected
// address for each node name; nodes missing from the map are left untouched. reIP returns
// nil when no node address differs from the expected one, otherwise the result of doReIP.
func (vcc *VClusterCommands) reIP(options *DatabaseOptions, scName, primaryUpHost string,
	nodeNameAddressMap map[string]string) error {
	reIPList := []ReIPInfo{}
	reIPHosts := []string{}
	vdb := makeVCoordinationDatabase()

	backupHosts := options.Hosts
	// only use one up node in the sandbox/main-cluster to retrieve nodes' info,
	// then we can get the latest node IPs in the sandbox/main-cluster.
	// When the operation is sandbox, the initiator will be a primary up node
	// from the target sandbox. When the operation is unsandbox, the initiator
	// will be a primary up node from the main cluster.
	initiator := []string{primaryUpHost}
	options.Hosts = initiator
	err := vcc.getVDBFromRunningDBIncludeSandbox(&vdb, options, AnySandbox)
	// restore the options.Hosts for later creating sandbox/unsandbox instructions.
	// This is done before the error check so that a failed fetch does not leave the
	// caller's options pointing at the initiator only.
	options.Hosts = backupHosts
	if err != nil {
		return fmt.Errorf("host %q in database is not available: %w", primaryUpHost, err)
	}

	// if the current node IPs don't match the expected ones, we need to do re-ip
	for _, vnode := range vdb.HostNodeMap {
		address, ok := nodeNameAddressMap[vnode.Name]
		if ok && address != vnode.Address {
			reIPList = append(reIPList, ReIPInfo{NodeName: vnode.Name, TargetAddress: address})
			reIPHosts = append(reIPHosts, address)
		}
	}
	if len(reIPList) > 0 {
		return vcc.doReIP(options, scName, initiator, reIPHosts, reIPList)
	}
	return nil
}

// doReIP fixes the IPs in the catalog through NMA and HTTPS operations.
// The instructions are built and run in this order:
// 1. one network-profile operation for the hosts in reIPHosts
// 2. one re-ip operation per entry of reIPList, built with the initiator
// 3. one reload-spread operation, built with the initiator
// scName is only used in the error message returned when running the instructions fails.
func (vcc *VClusterCommands) doReIP(options *DatabaseOptions, scName string,
	initiator, reIPHosts []string, reIPList []ReIPInfo) error {
	networkProfileOp := makeNMANetworkProfileOp(reIPHosts)
	if pwErr := options.setUsePassword(vcc.Log); pwErr != nil {
		return pwErr
	}

	ops := []clusterOp{&networkProfileOp}
	for i := range reIPList {
		target := reIPList[i]
		reIPOp, opErr := makeHTTPSReIPOpWithHosts(initiator, []string{target.NodeName},
			[]string{target.TargetAddress}, options.usePassword, options.UserName, options.Password)
		if opErr != nil {
			return opErr
		}
		ops = append(ops, &reIPOp)
	}

	// spread is always reloaded after the re-ip operations
	reloadSpreadOp, spreadErr := makeHTTPSReloadSpreadOpWithInitiator(initiator, options.usePassword, options.UserName, options.Password)
	if spreadErr != nil {
		return spreadErr
	}
	ops = append(ops, &reloadSpreadOp)

	certs := httpsCerts{key: options.Key, cert: options.Cert, caCert: options.CaCert}
	engine := makeClusterOpEngine(ops, &certs)
	if runErr := engine.run(vcc.Log); runErr != nil {
		return fmt.Errorf("failed to re-ip nodes of subcluster %q: %w", scName, runErr)
	}
	return nil
}
24 changes: 24 additions & 0 deletions vclusterops/start_node.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,12 @@ func (vcc VClusterCommands) VStartNodes(options *VStartNodesOptions) error {
// - that don't need to re-ip
hostsNoNeedToReIP := options.separateHostsBasedOnReIPNeed(hostNodeNameMap, restartNodeInfo, &vdb, vcc.Log)

// check that the primary up node count is more than the number of nodes to re-ip, especially for sandboxes
err = options.checkQuorum(&vdb, restartNodeInfo)
if err != nil {
return err
}

// for the hosts that don't need to re-ip,
// if none of them is down and no other nodes to re-ip,
// we will early stop as there is no need to start them
Expand Down Expand Up @@ -246,6 +252,24 @@ func (vcc VClusterCommands) VStartNodes(options *VStartNodesOptions) error {
return nil
}

// checkQuorum counts the primary nodes in vdb that are up and belong to the same sandbox
// as restartNodeInfo, and compares that count with the number of nodes to re-ip.
// It returns a ReIPNoClusterQuorumError unless the count is strictly greater than
// len(restartNodeInfo.ReIPList). Primary up node details can vary in case of sandboxes,
// so this check ensures quorum is maintained even when a sandbox node is re-ip'ed.
func (options *VStartNodesOptions) checkQuorum(vdb *VCoordinationDatabase, restartNodeInfo *VStartNodesInfo) error {
	primaryUpCount := 0
	for _, node := range vdb.HostNodeMap {
		if !node.IsPrimary || node.State != util.NodeUpState || node.Sandbox != restartNodeInfo.Sandbox {
			continue
		}
		primaryUpCount++
	}

	reIPCount := len(restartNodeInfo.ReIPList)
	if primaryUpCount > reIPCount {
		return nil
	}
	return &ReIPNoClusterQuorumError{
		Detail: fmt.Sprintf("Quorum check failed: %d up node(s) is/are not enough to re-ip %d node(s)",
			primaryUpCount, reIPCount),
	}
}

// produceStartNodesInstructions will build a list of instructions to execute for
// the restart_node command.
//
Expand Down
2 changes: 1 addition & 1 deletion vclusterops/unsandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ func (options *VUnsandboxOptions) runCommand(vcc VClusterCommands) error {
// to provide some node information
if options.PrimaryUpHost != "" && len(options.NodeNameAddressMap) > 0 {
err := vcc.reIP(&options.DatabaseOptions, options.SCName, options.PrimaryUpHost,
options.NodeNameAddressMap)
options.NodeNameAddressMap, true /*reload spread*/)
if err != nil {
return err
}
Expand Down

0 comments on commit c66c9e9

Please sign in to comment.