Skip to content

Commit

Permalink
minor fix
Browse files Browse the repository at this point in the history
  • Loading branch information
YZ775 committed Nov 11, 2024
1 parent 3768397 commit 69b7b44
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 72 deletions.
127 changes: 73 additions & 54 deletions pkg/neco/cmd/non_graceful_shutdown.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

var (
cephClusters = []string{"ceph-canary-block", "ceph-dotcom-block-0", "ceph-poc", "ceph-ssd"}
)

var nonGracefulNodeShutdownCmd = &cobra.Command{
Use: "nonGracefulNodeShutdown IP_ADDRESS",
Short: "nonGracefulNodeShutdown related commands",
Expand Down Expand Up @@ -69,68 +73,41 @@ var nonGracefulNodeShutdownCmd = &cobra.Command{
powerCheckCmd.Stdout = &out
err = powerCheckCmd.Run()
if err != nil {
if kubernetesNode.Labels["node.cybozu.io/reserved-for"] == "rbd" {
fmt.Println("The node is dedicated. Skip the shutdown.")
} else {
return err
}
if strings.TrimSpace(out.String()) == "On" {
poweroffCmd := exec.Command("neco", "power", "stop", node)
err = poweroffCmd.Run()
if err != nil {
return err
}
} else {
if strings.TrimSpace(out.String()) == "On" {
poweroffCmd := exec.Command("neco", "power", "stop", node)
err = poweroffCmd.Run()
if err != nil {
return err
}
// wait for the node to be down
fmt.Printf("Waiting for the node %s to be down\n", node)
timeoutCtx, cancel := context.WithTimeout(ctx, 1*time.Minute)
defer cancel()
L:
for {
select {
case <-timeoutCtx.Done():
return errors.New("power check timeout")
default:
out.Reset()
powerCheckCmd := exec.Command("neco", "power", "status", node)
powerCheckCmd.Stdout = &out
err = powerCheckCmd.Run()
if err != nil {
return err
}
if strings.TrimSpace(out.String()) == "Off" {
break L
}
time.Sleep(5 * time.Second)
// wait for the node to be down
fmt.Printf("Waiting for the node %s to be down\n", node)
timeoutCtx, cancel := context.WithTimeout(ctx, 1*time.Minute)
defer cancel()
L:
for {
select {
case <-timeoutCtx.Done():
return errors.New("power check timeout")
default:
out.Reset()
powerCheckCmd := exec.Command("neco", "power", "status", node)
powerCheckCmd.Stdout = &out
err = powerCheckCmd.Run()
if err != nil {
return err
}
if strings.TrimSpace(out.String()) == "Off" {
break L
}
time.Sleep(5 * time.Second)
}
}
}

// Add taint to the node
fmt.Println("Adding taint to the node")
tainted := false
for _, taint := range kubernetesNode.Spec.Taints {
if taint.Key == "node.kubernetes.io/out-of-service" {
tainted = true
break
}
}
if !tainted {
kubernetesNode.Spec.Taints = append(kubernetesNode.Spec.Taints, corev1.Taint{
Key: "node.kubernetes.io/out-of-service",
Value: "nodeshutdown",
Effect: "NoExecute",
})
err = kubeClient.Update(ctx, kubernetesNode)
if err != nil {
return err
}
}

// Create NetworkFence for ceph clusters
fmt.Println("Creating NetworkFence for ceph clusters")
cephClusters := []string{"ceph-canary-block", "ceph-dotcom-block-0", "ceph-poc", "ceph-ssd"}
for _, cephCluster := range cephClusters {
// check that the cephCluster namespace exists
nameSpace := &corev1.Namespace{}
Expand All @@ -143,9 +120,10 @@ var nonGracefulNodeShutdownCmd = &cobra.Command{
return err
}
}
fenceName := cephCluster + "-" + strings.Replace(node, ".", "-", -1)
networkFence := csiaddonsv1alpha1.NetworkFence{
ObjectMeta: metav1.ObjectMeta{
Name: strings.Replace(node, ".", "-", -1) + "-" + cephCluster,
Name: fenceName,
Namespace: cephCluster,
},
Spec: csiaddonsv1alpha1.NetworkFenceSpec{
Expand All @@ -169,6 +147,47 @@ var nonGracefulNodeShutdownCmd = &cobra.Command{
return err
}
}
// wait for the fence operation of the NetworkFence to reach Succeeded
timeoutCtx, cancel := context.WithTimeout(ctx, 1*time.Minute)
defer cancel()
networkFence = csiaddonsv1alpha1.NetworkFence{}
L2:
for {
select {
case <-timeoutCtx.Done():
return errors.New("timeout waiting for networkfence to be fenced")
default:
err := kubeClient.Get(ctx, client.ObjectKey{Name: fenceName}, &networkFence)
if err != nil {
return err
}
if networkFence.Status.Result == csiaddonsv1alpha1.FencingOperationResultSucceeded {
break L2
}
time.Sleep(5 * time.Second)
}
}
}

// Add taint to the node
fmt.Println("Adding taint to the node")
tainted := false
for _, taint := range kubernetesNode.Spec.Taints {
if taint.Key == "node.kubernetes.io/out-of-service" {
tainted = true
break
}
}
if !tainted {
kubernetesNode.Spec.Taints = append(kubernetesNode.Spec.Taints, corev1.Taint{
Key: "node.kubernetes.io/out-of-service",
Value: "nodeshutdown",
Effect: "NoExecute",
})
err = kubeClient.Update(ctx, kubernetesNode)
if err != nil {
return err
}
}
return nil
},
Expand Down
32 changes: 14 additions & 18 deletions pkg/neco/cmd/non_graceful_shutdown_cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,9 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
clientgoscheme.AddToScheme(scheme)
csiaddonsv1alpha1.AddToScheme(scheme)

//get sabakan status
opt := sabakanMachinesGetOpts{}
opt.params = map[string]*string{
"ipv4": &node,
}
machines, err := sabakanMachinesGet(ctx, &opt)
if err != nil {
return err
}
sabakanStatus := machines[0].Status.State

//issue kubeconfig
issueCmd := exec.Command("sh", "-c", "ckecli kubernetes issue > /home/cybozu/.kube/shutdown-config")
err = issueCmd.Run()
err := issueCmd.Run()
if err != nil {
fmt.Println("Failed to issue kubeconfig")
os.Exit(1)
Expand All @@ -66,8 +55,18 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
return err
}

//get sabakan status
opt := sabakanMachinesGetOpts{}
opt.params = map[string]*string{
"ipv4": &node,
}
machines, err := sabakanMachinesGet(ctx, &opt)
if err != nil {
return err
}
sabakanStatus := machines[0].Status.State

// remove networkfence
cephClusters := []string{"ceph-canary-block", "ceph-dotcom-block-0", "ceph-poc", "ceph-ssd"}
for _, cephCluster := range cephClusters {
// check that the cephCluster namespace exists
nameSpace := &corev1.Namespace{}
Expand All @@ -80,8 +79,7 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
return err
}
}
fenceName := strings.Replace(node, ".", "-", -1) + "-" + cephCluster
//get networkfence
fenceName := cephCluster + "-" + strings.Replace(node, ".", "-", -1)
networkFence := &csiaddonsv1alpha1.NetworkFence{}
err = kubeClient.Get(ctx, client.ObjectKey{Name: fenceName}, networkFence)
if err != nil {
Expand All @@ -98,6 +96,7 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
if err != nil {
return err
}

// wait for the unfence operation of the NetworkFence to reach Succeeded
timeoutCtx, cancel := context.WithTimeout(ctx, 1*time.Minute)
defer cancel()
Expand All @@ -115,7 +114,6 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
break L
}
time.Sleep(5 * time.Second)
// break L
}
}
err = kubeClient.Delete(ctx, networkFence)
Expand All @@ -124,9 +122,7 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{
}
}

//run this command if node is Healthy
if sabakanStatus == sabakan.StateHealthy {
// remove out-of-service taint
kubernetesNode := &corev1.Node{}
err = kubeClient.Get(ctx, client.ObjectKey{Name: node}, kubernetesNode)
if err != nil {
Expand Down

0 comments on commit 69b7b44

Please sign in to comment.