Skip to content

Commit

Permalink
Reboot
Browse files Browse the repository at this point in the history
  • Loading branch information
YZ775 committed Jul 8, 2024
1 parent 4caaac3 commit 9a19c57
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 6 deletions.
11 changes: 11 additions & 0 deletions cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,16 @@ const DefaultRepairEvictionTimeoutSeconds = 600
const DefaultRepairHealthCheckCommandTimeoutSeconds = 30
const DefaultRepairCommandTimeoutSeconds = 30

type Retire struct {
ShutdownCommand []string `json:"shutdown_command"`
CheckCommand []string `json:"check_command"`
CommandTimeoutSeconds *int `json:"command_timeout_seconds,omitempty"`
CheckTimeoutSeconds *int `json:"check_timeout_seconds,omitempty"`
}

const DefaultRetireCommandTimeoutSeconds = 30
const DefaultRetireCheckTimeoutSeconds = 300

// Options is a set of optional parameters for k8s components.
type Options struct {
Etcd EtcdParams `json:"etcd"`
Expand All @@ -343,6 +353,7 @@ type Cluster struct {
DNSService string `json:"dns_service"`
Reboot Reboot `json:"reboot"`
Repair Repair `json:"repair"`
Retire Retire `json:"retire"`
Options Options `json:"options"`
}

Expand Down
5 changes: 5 additions & 0 deletions mtest/cke-cluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ repair:
need_drain: true
watch_seconds: 30
health_check_command: ["sh", "-c", "test -f /tmp/mtest-repair-$1 && echo true", "health_check"]
retire:
shutdown_command: ["true"]
check_command: ["bash", "-c", "echo 'Off'"]
command_timeout_seconds: 30
check_timeout_seconds: 300
options:
kube-api:
extra_binds:
Expand Down
87 changes: 82 additions & 5 deletions op/kube_node_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@ import (
"context"
"fmt"
"strings"
"time"

"github.com/cybozu-go/cke"
"github.com/cybozu-go/log"
"github.com/cybozu-go/well"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
Expand All @@ -16,12 +19,13 @@ import (
type kubeNodeRemove struct {
apiserver *cke.Node
nodes []*corev1.Node
config *cke.Retire
done bool
}

// KubeNodeRemoveOp removes k8s Node resources.
func KubeNodeRemoveOp(apiserver *cke.Node, nodes []*corev1.Node) cke.Operator {
return &kubeNodeRemove{apiserver: apiserver, nodes: nodes}
func KubeNodeRemoveOp(apiserver *cke.Node, nodes []*corev1.Node, config *cke.Retire) cke.Operator {
return &kubeNodeRemove{apiserver: apiserver, nodes: nodes, config: config}
}

func (o *kubeNodeRemove) Name() string {
Expand All @@ -34,7 +38,14 @@ func (o *kubeNodeRemove) NextCommand() cke.Commander {
}

o.done = true
return nodeRemoveCommand{o.apiserver, o.nodes}
return nodeRemoveCommand{
o.apiserver,
o.nodes,
o.config.ShutdownCommand,
o.config.CheckCommand,
o.config.CommandTimeoutSeconds,
o.config.CheckTimeoutSeconds,
}
}

func (o *kubeNodeRemove) Targets() []string {
Expand All @@ -44,8 +55,12 @@ func (o *kubeNodeRemove) Targets() []string {
}

type nodeRemoveCommand struct {
apiserver *cke.Node
nodes []*corev1.Node
apiserver *cke.Node
nodes []*corev1.Node
shutdownCommand []string
checkCommand []string
timeoutSeconds *int
checkTimeoutSeconds *int
}

func (c nodeRemoveCommand) Run(ctx context.Context, inf cke.Infrastructure, _ string) error {
Expand Down Expand Up @@ -77,6 +92,68 @@ func (c nodeRemoveCommand) Run(ctx context.Context, inf cke.Infrastructure, _ st
return fmt.Errorf("failed to patch node %s: %v", n.Name, err)
}
}
err := func() error {
ctx := ctx
timeout := cke.DefaultRetireCommandTimeoutSeconds
if c.timeoutSeconds != nil {
timeout = *c.timeoutSeconds
}
if timeout != 0 {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, time.Second*time.Duration(timeout))
defer cancel()
}
args := append(c.shutdownCommand[1:], n.Name)
command := well.CommandContext(ctx, c.shutdownCommand[0], args...)
return command.Run()
}()
if err != nil {
return fmt.Errorf("failed to shutdown node %s: %v", n.Name, err)
}

err = func() error {
ctx := ctx
checkTimeout := cke.DefaultRetireCheckTimeoutSeconds
if c.checkTimeoutSeconds != nil {
checkTimeout = *c.checkTimeoutSeconds
}
timeout := time.After(time.Duration(checkTimeout) * time.Second)
ticker := time.NewTicker(10 * time.Second)
for {
select {
case <-timeout:
return fmt.Errorf("timeout")
case <-ticker.C:
args := append(c.checkCommand[1:], n.Name)
command := well.CommandContext(ctx, c.checkCommand[0], args...)
stdout, err := command.Output()
if err != nil {
log.Warn("failed to check shutdown status of node", map[string]interface{}{
log.FnError: err,
"node": n.Name,
})
continue
}
if strings.TrimSuffix(string(stdout), "\n") == "Off" {
return nil
}
}
}
}()
if err != nil {
return fmt.Errorf("failed to check shutdown status of node %s: %v", n.Name, err)
}
shutdownTaint := corev1.Taint{
Key: "node.kubernetes.io/out-of-service",
Value: "nodeshutdown",
Effect: corev1.TaintEffectNoExecute,
}
n.Spec.Taints = append(n.Spec.Taints, shutdownTaint)
_, err = nodesAPI.Update(ctx, n, metav1.UpdateOptions{})
if err != nil {
return fmt.Errorf("failed to update node %s: %v", n.Name, err)
}

err = nodesAPI.Delete(ctx, n.Name, metav1.DeleteOptions{})
if err != nil {
return fmt.Errorf("failed to delete node %s: %v", n.Name, err)
Expand Down
2 changes: 1 addition & 1 deletion server/strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ OUTER_ETCD:
}

if nodes := nf.NonClusterNodes(); len(nodes) > 0 {
ops = append(ops, op.KubeNodeRemoveOp(apiServer, nodes))
ops = append(ops, op.KubeNodeRemoveOp(apiServer, nodes, &c.Retire))
}

return ops
Expand Down

0 comments on commit 9a19c57

Please sign in to comment.