diff --git a/pkg/neco/cmd/non_graceful_shutdown.go b/pkg/neco/cmd/non_graceful_shutdown.go index e9304c898..acbb0d0c0 100644 --- a/pkg/neco/cmd/non_graceful_shutdown.go +++ b/pkg/neco/cmd/non_graceful_shutdown.go @@ -22,6 +22,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" ) +var ( + cephClusters = []string{"ceph-canary-block", "ceph-dotcom-block-0", "ceph-poc", "ceph-ssd"} +) + var nonGracefulNodeShutdownCmd = &cobra.Command{ Use: "nonGracefulNodeShutdown IP_ADDRESS", Short: "nonGracefulNodeShutdown related commands", @@ -69,68 +73,41 @@ var nonGracefulNodeShutdownCmd = &cobra.Command{ powerCheckCmd.Stdout = &out err = powerCheckCmd.Run() if err != nil { - if kubernetesNode.Labels["node.cybozu.io/reserved-for"] == "rbd" { - fmt.Println("The node is dedicated. Skip the shutdown.") - } else { + return err + } + if strings.TrimSpace(out.String()) == "On" { + poweroffCmd := exec.Command("neco", "power", "stop", node) + err = poweroffCmd.Run() + if err != nil { return err } - } else { - if strings.TrimSpace(out.String()) == "On" { - poweroffCmd := exec.Command("neco", "power", "stop", node) - err = poweroffCmd.Run() - if err != nil { - return err - } - //wait for the node to be down - fmt.Printf("Waiting for the node %s to be down\n", node) - timeoutCtx, cancel := context.WithTimeout(ctx, 1*time.Minute) - defer cancel() - L: - for { - select { - case <-timeoutCtx.Done(): - return errors.New("power check timeout") - default: - out.Reset() - powerCheckCmd := exec.Command("neco", "power", "status", node) - powerCheckCmd.Stdout = &out - err = powerCheckCmd.Run() - if err != nil { - return err - } - if strings.TrimSpace(out.String()) == "Off" { - break L - } - time.Sleep(5 * time.Second) + //wait for the node to be down + fmt.Printf("Waiting for the node %s to be down\n", node) + timeoutCtx, cancel := context.WithTimeout(ctx, 1*time.Minute) + defer cancel() + L: + for { + select { + case <-timeoutCtx.Done(): + return errors.New("power check timeout") + default: + 
out.Reset() + powerCheckCmd := exec.Command("neco", "power", "status", node) + powerCheckCmd.Stdout = &out + err = powerCheckCmd.Run() + if err != nil { + return err } + if strings.TrimSpace(out.String()) == "Off" { + break L + } + time.Sleep(5 * time.Second) } } } - // Add taint to the node - fmt.Println("Adding taint to the node") - tainted := false - for _, taint := range kubernetesNode.Spec.Taints { - if taint.Key == "node.kubernetes.io/out-of-service" { - tainted = true - break - } - } - if !tainted { - kubernetesNode.Spec.Taints = append(kubernetesNode.Spec.Taints, corev1.Taint{ - Key: "node.kubernetes.io/out-of-service", - Value: "nodeshutdown", - Effect: "NoExecute", - }) - err = kubeClient.Update(ctx, kubernetesNode) - if err != nil { - return err - } - } - // Create NetworkFence for ceph clusters fmt.Println("Creating NetworkFence for ceph clusters") - cephClusters := []string{"ceph-canary-block", "ceph-dotcom-block-0", "ceph-poc", "ceph-ssd"} for _, cephCluster := range cephClusters { //check cephCluster exists nameSpace := &corev1.Namespace{} @@ -143,9 +120,10 @@ var nonGracefulNodeShutdownCmd = &cobra.Command{ return err } } + fenceName := cephCluster + "-" + strings.Replace(node, ".", "-", -1) networkFence := csiaddonsv1alpha1.NetworkFence{ ObjectMeta: metav1.ObjectMeta{ - Name: strings.Replace(node, ".", "-", -1) + "-" + cephCluster, + Name: fenceName, Namespace: cephCluster, }, Spec: csiaddonsv1alpha1.NetworkFenceSpec{ @@ -169,6 +147,47 @@ var nonGracefulNodeShutdownCmd = &cobra.Command{ return err } } + // wait for fence of networkfence to be Succeeded + timeoutCtx, cancel := context.WithTimeout(ctx, 1*time.Minute) + defer cancel() + networkFence = csiaddonsv1alpha1.NetworkFence{} + L2: + for { + select { + case <-timeoutCtx.Done(): + return errors.New("timeout waiting for networkfence to be fenced") + default: + err := kubeClient.Get(ctx, client.ObjectKey{Name: fenceName}, &networkFence) + if err != nil { + return err + } + if 
networkFence.Status.Result == csiaddonsv1alpha1.FencingOperationResultSucceeded { + break L2 + } + time.Sleep(5 * time.Second) + } + } + } + + // Add taint to the node + fmt.Println("Adding taint to the node") + tainted := false + for _, taint := range kubernetesNode.Spec.Taints { + if taint.Key == "node.kubernetes.io/out-of-service" { + tainted = true + break + } + } + if !tainted { + kubernetesNode.Spec.Taints = append(kubernetesNode.Spec.Taints, corev1.Taint{ + Key: "node.kubernetes.io/out-of-service", + Value: "nodeshutdown", + Effect: "NoExecute", + }) + err = kubeClient.Update(ctx, kubernetesNode) + if err != nil { + return err + } } return nil }, diff --git a/pkg/neco/cmd/non_graceful_shutdown_cleanup.go b/pkg/neco/cmd/non_graceful_shutdown_cleanup.go index c8e662043..5bdebc0f2 100644 --- a/pkg/neco/cmd/non_graceful_shutdown_cleanup.go +++ b/pkg/neco/cmd/non_graceful_shutdown_cleanup.go @@ -37,20 +37,9 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{ clientgoscheme.AddToScheme(scheme) csiaddonsv1alpha1.AddToScheme(scheme) - //get sabakan status - opt := sabakanMachinesGetOpts{} - opt.params = map[string]*string{ - "ipv4": &node, - } - machines, err := sabakanMachinesGet(ctx, &opt) - if err != nil { - return err - } - sabakanStatus := machines[0].Status.State - //issue kubeconfig issueCmd := exec.Command("sh", "-c", "ckecli kubernetes issue > /home/cybozu/.kube/shutdown-config") - err = issueCmd.Run() + err := issueCmd.Run() if err != nil { fmt.Println("Failed to issue kubeconfig") os.Exit(1) @@ -66,8 +55,18 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{ return err } + //get sabakan status + opt := sabakanMachinesGetOpts{} + opt.params = map[string]*string{ + "ipv4": &node, + } + machines, err := sabakanMachinesGet(ctx, &opt) + if err != nil { + return err + } + sabakanStatus := machines[0].Status.State + // remove networkfence - cephClusters := []string{"ceph-canary-block", "ceph-dotcom-block-0", "ceph-poc", "ceph-ssd"} for _, cephCluster := 
range cephClusters { //check cephCluster exists nameSpace := &corev1.Namespace{} @@ -80,8 +79,7 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{ return err } } - fenceName := strings.Replace(node, ".", "-", -1) + "-" + cephCluster - //get networkfence + fenceName := cephCluster + "-" + strings.Replace(node, ".", "-", -1) networkFence := &csiaddonsv1alpha1.NetworkFence{} err = kubeClient.Get(ctx, client.ObjectKey{Name: fenceName}, networkFence) if err != nil { @@ -98,6 +96,7 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{ if err != nil { return err } + // wait for unfence of networkfence to be Succeeded timeoutCtx, cancel := context.WithTimeout(ctx, 1*time.Minute) defer cancel() @@ -115,7 +114,6 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{ break L } time.Sleep(5 * time.Second) - // break L } } err = kubeClient.Delete(ctx, networkFence) @@ -124,9 +122,7 @@ var nonGracefulShutdownCleanupCmd = &cobra.Command{ } } - //run this command if node is Healthy if sabakanStatus == sabakan.StateHealthy { - // remove out-of-service taint kubernetesNode := &corev1.Node{} err = kubeClient.Get(ctx, client.ObjectKey{Name: node}, kubernetesNode) if err != nil {