From a245bbee0f92a2385cdb8deabdc06c925e6b0a1a Mon Sep 17 00:00:00 2001 From: morimoto-cybozu Date: Tue, 21 May 2024 13:47:31 +0000 Subject: [PATCH] Delay repair of a rebooting unreachable machine (backport of #732) Signed-off-by: morimoto-cybozu --- CHANGELOG.md | 2 ++ constraints.go | 2 ++ docs/ckecli.md | 1 + docs/constraints.md | 1 + docs/sabakan-triggered-repair.md | 6 ++++ pkg/ckecli/cmd/constraints_set.go | 5 +++ sabakan/integrate.go | 8 ++++- sabakan/repairer.go | 24 +++++++++++-- sabakan/repairer_test.go | 56 ++++++++++++++++++++++++++++--- 9 files changed, 97 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb666026..2cb8eedd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ This project employs a versioning scheme described in [RELEASE.md](RELEASE.md#ve ## [Unreleased] +- Delay repair of a rebooting unreachable machine in [#733](https://github.com/cybozu-go/cke/pull/733) + ## [1.27.11] ### Added diff --git a/constraints.go b/constraints.go index 89d4337c..03350c78 100644 --- a/constraints.go +++ b/constraints.go @@ -9,6 +9,7 @@ type Constraints struct { MaximumWorkers int `json:"maximum-workers"` RebootMaximumUnreachable int `json:"maximum-unreachable-nodes-for-reboot"` MaximumRepairs int `json:"maximum-repair-queue-entries"` + RepairRebootingSeconds int `json:"wait-seconds-to-repair-rebooting"` } // Check checks the cluster satisfies the constraints @@ -43,5 +44,6 @@ func DefaultConstraints() *Constraints { MaximumWorkers: 0, RebootMaximumUnreachable: 0, MaximumRepairs: 0, + RepairRebootingSeconds: 0, } } diff --git a/docs/ckecli.md b/docs/ckecli.md index 70afac5c..07210deb 100644 --- a/docs/ckecli.md +++ b/docs/ckecli.md @@ -97,6 +97,7 @@ Set a constraint on the cluster configuration. - `maximum-workers` - `maximum-unreachable-nodes-for-reboot` - `maximum-repair-queue-entries` +- `wait-seconds-to-repair-rebooting` ### `ckecli constraints show` diff --git a/docs/constraints.md b/docs/constraints.md index 89ee9c9d..c547eb83 100644 --- a/docs/constraints.md +++ b/docs/constraints.md @@ -13,3 +13,4 @@ Cluster should satisfy these constraints. | `maximum-workers` | int | 0 | The maximum number of worker nodes. 0 means unlimited. | | `maximum-unreachable-nodes-for-reboot` | int | 0 | The maximum number of unreachable nodes allowed for operating reboot. | | `maximum-repair-queue-entries` | int | 0 | The maximum number of repair queue entries | +| `wait-seconds-to-repair-rebooting` | int | 0 | The wait time in seconds to repair a rebooting machine | diff --git a/docs/sabakan-triggered-repair.md b/docs/sabakan-triggered-repair.md index 945a6f0f..8f2bce6b 100644 --- a/docs/sabakan-triggered-repair.md +++ b/docs/sabakan-triggered-repair.md @@ -81,6 +81,12 @@ The maximum number of recent repair queue entries and new failure reports is [co As stated above, CKE considers all persisting queue entries as "recent" for simplicity. +### Limiter for planned reboot + +A machine may become "UNREACHABLE" very quickly even if it is [being rebooted in a planned manner](reboot.md). +CKE should wait for a while before starting repair operations for a rebooting machine. + +A user can [configure the wait time](ckecli.md#ckecli-constraints-set-name-value) as a [constraint `wait-seconds-to-repair-rebooting`](constraints.md) [sabakan]: https://github.com/cybozu-go/sabakan [schema]: https://github.com/cybozu-go/sabakan/blob/main/gql/graph/schema.graphqls diff --git a/pkg/ckecli/cmd/constraints_set.go b/pkg/ckecli/cmd/constraints_set.go index 36b0d6b0..2532bd32 100644 --- a/pkg/ckecli/cmd/constraints_set.go +++ b/pkg/ckecli/cmd/constraints_set.go @@ -24,6 +24,7 @@ NAME is one of: maximum-workers maximum-unreachable-nodes-for-reboot maximum-repair-queue-entries + wait-seconds-to-repair-rebooting VALUE is an integer.`, @@ -58,6 +59,10 @@ VALUE is an integer.`, cstrSet = func(cstr *cke.Constraints) { cstr.MaximumRepairs = val } + case "wait-seconds-to-repair-rebooting": + cstrSet = func(cstr *cke.Constraints) { + cstr.RepairRebootingSeconds = val + } default: return errors.New("no such constraint: " + args[0]) } diff --git a/sabakan/integrate.go b/sabakan/integrate.go index 9e3c8b82..9ff7cf7a 100644 --- a/sabakan/integrate.go +++ b/sabakan/integrate.go @@ -174,12 +174,18 @@ func (ig integrator) runRepairer(ctx context.Context, clusterStatus *cke.Cluster return nil } + rebootEntries, err := st.GetRebootsEntries(ctx) + if err != nil { + return err + } + rebootEntries = cke.DedupRebootQueueEntries(rebootEntries) + constraints, err := st.GetConstraints(ctx) if err != nil { return err } - entries := Repairer(machines, clusterStatus.RepairQueue.Entries, constraints) + entries := Repairer(machines, clusterStatus.RepairQueue.Entries, rebootEntries, constraints, time.Now().UTC()) for _, entry := range entries { err := st.RegisterRepairsEntry(ctx, entry) diff --git a/sabakan/repairer.go b/sabakan/repairer.go index 40046026..687df772 100644 --- a/sabakan/repairer.go +++ b/sabakan/repairer.go @@ -2,18 +2,27 @@ package sabakan import ( "strings" + "time" "github.com/cybozu-go/cke" "github.com/cybozu-go/log" ) -func Repairer(machines []Machine, entries []*cke.RepairQueueEntry, constraints *cke.Constraints) []*cke.RepairQueueEntry { +func Repairer(machines []Machine, repairEntries []*cke.RepairQueueEntry, rebootEntries []*cke.RebootQueueEntry, constraints *cke.Constraints, now time.Time) []*cke.RepairQueueEntry { recent := make(map[string]bool) - for _, entry := range entries { + for _, entry := range repairEntries { // entry.Operation is ignored when checking duplication recent[entry.Address] = true } + rebootLimit := now.Add(time.Duration(-constraints.RepairRebootingSeconds) * time.Second) + rebootingSince := make(map[string]time.Time) + for _, entry := range rebootEntries { + if entry.Status == cke.RebootStatusRebooting { + rebootingSince[entry.Node] = entry.LastTransitionTime // entry.Node denotes IP address + } + } + newMachines := make([]Machine, 0, len(machines)) for _, machine := range machines { if len(machine.Spec.IPv4) == 0 { @@ -31,10 +40,19 @@ func Repairer(machines []Machine, entries []*cke.RepairQueueEntry, constraints * continue } + since, ok := rebootingSince[machine.Spec.IPv4[0]] + if ok && since.After(rebootLimit) && machine.Status.State == StateUnreachable { + log.Info("ignore rebooting unreachable machine", map[string]interface{}{ + "serial": machine.Spec.Serial, + "address": machine.Spec.IPv4[0], + }) + continue + } + newMachines = append(newMachines, machine) } - if len(entries)+len(newMachines) > constraints.MaximumRepairs { + if len(repairEntries)+len(newMachines) > constraints.MaximumRepairs { log.Warn("ignore too many repair requests", nil) return nil } diff --git a/sabakan/repairer_test.go b/sabakan/repairer_test.go index e7c40bb7..a52a4b10 100644 --- a/sabakan/repairer_test.go +++ b/sabakan/repairer_test.go @@ -2,6 +2,7 @@ package sabakan import ( "testing" + "time" "github.com/cybozu-go/cke" "github.com/google/go-cmp/cmp" @@ -10,12 +11,13 @@ import ( func TestRepairer(t *testing.T) { constraints := &cke.Constraints{ - MaximumRepairs: 3, + MaximumRepairs: 3, + RepairRebootingSeconds: 300, } machines := []Machine{ {Spec: MachineSpec{Serial: "0000"}, Status: MachineStatus{State: StateUnhealthy}}, - {Spec: MachineSpec{Serial: "1111", IPv4: []string{"1.1.1.1"}, BMC: BMC{Type: "type1"}}, Status: MachineStatus{State: StateUnhealthy}}, + {Spec: MachineSpec{Serial: "1111", IPv4: []string{"1.1.1.1"}, BMC: BMC{Type: "type1"}}, Status: MachineStatus{State: StateUnreachable}}, {Spec: MachineSpec{Serial: "2222", IPv4: []string{"2.2.2.2"}, BMC: BMC{Type: "type2"}}, Status: MachineStatus{State: StateUnhealthy}}, {Spec: MachineSpec{Serial: "3333", IPv4: []string{"3.3.3.3"}, BMC: BMC{Type: "type3"}}, Status: MachineStatus{State: StateUnreachable}}, {Spec: MachineSpec{Serial: "4444", IPv4: []string{"4.4.4.4"}, BMC: BMC{Type: "type4"}}, Status: MachineStatus{State: StateUnreachable}}, @@ -23,54 +25,100 @@ func TestRepairer(t *testing.T) { entries := []*cke.RepairQueueEntry{ nil, - cke.NewRepairQueueEntry("unhealthy", "type1", "1.1.1.1"), + cke.NewRepairQueueEntry("unreachable", "type1", "1.1.1.1"), cke.NewRepairQueueEntry("unhealthy", "type2", "2.2.2.2"), cke.NewRepairQueueEntry("unreachable", "type3", "3.3.3.3"), cke.NewRepairQueueEntry("unreachable", "type4", "4.4.4.4"), } + now := time.Now().UTC() + recent := now.Add(-30 * time.Second) + stale := now.Add(-3000 * time.Second) + rebootEntries := []*cke.RebootQueueEntry{ + nil, + {Node: "1.1.1.1", Status: cke.RebootStatusRebooting, LastTransitionTime: recent}, + {Node: "2.2.2.2", Status: cke.RebootStatusRebooting, LastTransitionTime: recent}, + {Node: "3.3.3.3", Status: cke.RebootStatusRebooting, LastTransitionTime: stale}, + {Node: "4.4.4.4", Status: cke.RebootStatusDraining, LastTransitionTime: recent}, + } + tests := []struct { name string failedMachines []Machine queuedEntries []*cke.RepairQueueEntry + rebootEntries []*cke.RebootQueueEntry expectedEntries []*cke.RepairQueueEntry }{ { name: "NoFailedMachine", failedMachines: []Machine{}, queuedEntries: []*cke.RepairQueueEntry{entries[2]}, + rebootEntries: nil, expectedEntries: []*cke.RepairQueueEntry{}, }, { name: "OneFailedMachine", failedMachines: []Machine{machines[1]}, queuedEntries: []*cke.RepairQueueEntry{entries[2]}, + rebootEntries: nil, expectedEntries: []*cke.RepairQueueEntry{entries[1]}, }, { name: "IgnoreNoIPAddress", failedMachines: []Machine{machines[0], machines[1]}, queuedEntries: []*cke.RepairQueueEntry{entries[2]}, + rebootEntries: nil, expectedEntries: []*cke.RepairQueueEntry{entries[1]}, }, { name: "IgnoreRecentlyRepaired", failedMachines: []Machine{machines[1], machines[2], machines[3]}, queuedEntries: []*cke.RepairQueueEntry{entries[2]}, + rebootEntries: nil, expectedEntries: []*cke.RepairQueueEntry{entries[1], entries[3]}, }, { name: "IgnoreRecentlyRepairedWithDifferentOperation", failedMachines: []Machine{machines[1], machines[2], machines[3]}, queuedEntries: []*cke.RepairQueueEntry{cke.NewRepairQueueEntry("unreachable", "type2", "2.2.2.2")}, + rebootEntries: nil, expectedEntries: []*cke.RepairQueueEntry{entries[1], entries[3]}, }, { name: "IgnoreTooManyFailedMachines", failedMachines: []Machine{machines[1], machines[2], machines[3]}, queuedEntries: []*cke.RepairQueueEntry{entries[2], entries[4]}, + rebootEntries: nil, + expectedEntries: []*cke.RepairQueueEntry{}, + }, + { + name: "IgnoreRebootingUnreachableMachine", + failedMachines: []Machine{machines[1]}, + queuedEntries: []*cke.RepairQueueEntry{}, + rebootEntries: []*cke.RebootQueueEntry{rebootEntries[1]}, expectedEntries: []*cke.RepairQueueEntry{}, }, + { + name: "RebootingButUnhealthy", + failedMachines: []Machine{machines[2]}, + queuedEntries: []*cke.RepairQueueEntry{}, + rebootEntries: []*cke.RebootQueueEntry{rebootEntries[2]}, + expectedEntries: []*cke.RepairQueueEntry{entries[2]}, + }, + { + name: "RebootingButStale", + failedMachines: []Machine{machines[3]}, + queuedEntries: []*cke.RepairQueueEntry{}, + rebootEntries: []*cke.RebootQueueEntry{rebootEntries[3]}, + expectedEntries: []*cke.RepairQueueEntry{entries[3]}, + }, + { + name: "QueuedButNotRebooting", + failedMachines: []Machine{machines[4]}, + queuedEntries: []*cke.RepairQueueEntry{}, + rebootEntries: []*cke.RebootQueueEntry{rebootEntries[4]}, + expectedEntries: []*cke.RepairQueueEntry{entries[4]}, + }, } for _, tt := range tests { @@ -78,7 +126,7 @@ func TestRepairer(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Parallel() - entries := Repairer(tt.failedMachines, tt.queuedEntries, constraints) + entries := Repairer(tt.failedMachines, tt.queuedEntries, tt.rebootEntries, constraints, now) if !cmp.Equal(entries, tt.expectedEntries, cmpopts.EquateEmpty()) { t.Errorf("!cmp.Equal(entries, tt.newEntries), actual: %v, expected: %v", entries, tt.expectedEntries) }