From e64f850b51093304cd8fe729470bc520af560a45 Mon Sep 17 00:00:00 2001 From: UMEZAWA Takeshi Date: Thu, 23 May 2024 16:43:35 +0900 Subject: [PATCH] perform eviction dry-run during node reboot feature Signed-off-by: UMEZAWA Takeshi --- mtest/assets_test.go | 3 ++ mtest/reboot-eviction-dry-run.yaml | 81 ++++++++++++++++++++++++++++++ mtest/reboot_test.go | 44 ++++++++++++++++ op/reboot.go | 23 ++++++++- op/reboot_decide.go | 53 +++++++++++++++---- op/repair_drain_start.go | 23 ++++++++- 6 files changed, 213 insertions(+), 14 deletions(-) create mode 100644 mtest/reboot-eviction-dry-run.yaml diff --git a/mtest/assets_test.go b/mtest/assets_test.go index 0c6a1bcc..df7370fd 100644 --- a/mtest/assets_test.go +++ b/mtest/assets_test.go @@ -14,6 +14,9 @@ var rebootJobCompletedYAML []byte //go:embed reboot-job-running.yaml var rebootJobRunningYAML []byte +//go:embed reboot-eviction-dry-run.yaml +var rebootEvictionDryRunYAML []byte + //go:embed reboot-slow-eviction-deployment.yaml var rebootSlowEvictionDeploymentYAML []byte diff --git a/mtest/reboot-eviction-dry-run.yaml b/mtest/reboot-eviction-dry-run.yaml new file mode 100644 index 00000000..7875a2f8 --- /dev/null +++ b/mtest/reboot-eviction-dry-run.yaml @@ -0,0 +1,81 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: reboot-test + name: 1-not-evictable +spec: + replicas: 1 + selector: + matchLabels: + reboot-app: 1-not-evictable + template: + metadata: + labels: + reboot-app: 1-not-evictable + spec: + containers: + - name: httpd + image: ghcr.io/cybozu/testhttpd:0 +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + namespace: reboot-test + name: 1-not-evictable +spec: + maxUnavailable: 0 + selector: + matchLabels: + reboot-app: 1-not-evictable +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: reboot-test + name: 0-evictable +spec: + replicas: 1 + selector: + matchLabels: + reboot-app: 0-evictable + template: + metadata: + labels: + reboot-app: 0-evictable + spec: + containers: + - name: httpd + image: ghcr.io/cybozu/testhttpd:0 + affinity: + podAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + reboot-app: 1-not-evictable + topologyKey: kubernetes.io/hostname +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: reboot-test + name: 2-evictable +spec: + replicas: 1 + selector: + matchLabels: + reboot-app: 2-evictable + template: + metadata: + labels: + reboot-app: 2-evictable + spec: + containers: + - name: httpd + image: ghcr.io/cybozu/testhttpd:0 + affinity: + podAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + reboot-app: 1-not-evictable + topologyKey: kubernetes.io/hostname diff --git a/mtest/reboot_test.go b/mtest/reboot_test.go index 04ae0975..63dc1530 100644 --- a/mtest/reboot_test.go +++ b/mtest/reboot_test.go @@ -329,6 +329,50 @@ func testRebootOperations() { Expect(err).ShouldNot(HaveOccurred(), "stderr: %s", stderr) }) + It("checks eviction dry run behavior", func() { + deploymentNames := []string{"0-evictable", "1-not-evictable", "2-evictable"} + + By("Deploying Pods") + _, stderr, err := kubectlWithInput(rebootEvictionDryRunYAML, "apply", "-f", "-") + Expect(err).ShouldNot(HaveOccurred(), "stderr: %s", stderr) + + creationTime := map[string]time.Time{} + var nodeName string + for _, name := range deploymentNames { + checkDeploymentEventually("reboot-test", name) + pods := getPodListGomega(Default, "reboot-test", "-l=reboot-app="+name) + Expect(pods.Items).Should(HaveLen(1), "pod is not created") + pod := &pods.Items[0] + Expect(pod.Status.Phase).Should(Equal(corev1.PodRunning), "pod is not running") + creationTime[name] = pod.ObjectMeta.CreationTimestamp.Time + nodeName = pod.Spec.NodeName // all pods run on the same node + } + + By("Reboot operation will not delete evictable pods if non-evictable pods exist on the same node") + rebootQueueAdd([]string{nodeName}) + rebootShouldNotProceed() + + for _, name := range deploymentNames { + pods := getPodListGomega(Default, "reboot-test", "-l=reboot-app="+name) + Expect(pods.Items).Should(HaveLen(1), "pod does not exist (probably evicted)") + pod := pods.Items[0] + Expect(pod.ObjectMeta.CreationTimestamp.Time).Should(Equal(creationTime[name]), "pod by deployment %s is recreated (probably evicted)", name) + } + + By("Cleaning up") + rebootQueueCancelAllAndWait(cluster) + + _, stderr, err = kubectlWithInput(rebootEvictionDryRunYAML, "delete", "-f", "-") + Expect(err).ShouldNot(HaveOccurred(), "stderr: %s", stderr) + + for _, name := range deploymentNames { + Eventually(func(g Gomega) { + deploymentPods := getPodListGomega(g, "reboot-test", "-l=reboot-app="+name) + g.Expect(deploymentPods.Items).Should(HaveLen(0), "Pod does not terminate") + }).Should(Succeed()) + } + }) + It("checks drain timeout behavior", func() { By("Deploying a slow eviction pod") _, stderr, err := kubectlWithInput(rebootSlowEvictionDeploymentYAML, "apply", "-f", "-") diff --git a/op/reboot.go b/op/reboot.go index 36011d62..5cc207e4 100644 --- a/op/reboot.go +++ b/op/reboot.go @@ -138,11 +138,20 @@ func (c rebootDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure // This "check and cordon" order may overlook Job pods just started. // On the other hand, "cordon and check" may cause excessive cordon. // The overlook is acceptable because it is rare case and detected by the following evictOrDeleteNodePod(). - - err = checkJobPodNotExist(ctx, cs, entry.Node) + log.Info("start eviction dry-run", map[string]interface{}{ + "name": entry.Node, + }) + err = dryRunEvictOrDeleteNodePod(ctx, cs, entry.Node, protected) if err != nil { + log.Warn("eviction dry-run failed", map[string]interface{}{ + "name": entry.Node, + log.FnError: err, + }) return err } + log.Info("eviction dry-run succeeded", map[string]interface{}{ + "name": entry.Node, + }) _, err = nodesAPI.Patch(ctx, entry.Node, types.StrategicMergePatchType, []byte(` { @@ -169,13 +178,23 @@ func (c rebootDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure // next, evict pods on each node for _, entry := range evictNodes { + log.Info("start eviction", map[string]interface{}{ + "name": entry.Node, + }) err := evictOrDeleteNodePod(ctx, cs, entry.Node, protected, c.evictAttempts, c.evictInterval) if err != nil { + log.Warn("eviction failed", map[string]interface{}{ + "name": entry.Node, + log.FnError: err, + }) c.notifyFailedNode(entry.Node) err = drainBackOff(ctx, inf, entry, err) if err != nil { return err } + log.Info("eviction succeeded", map[string]interface{}{ + "name": entry.Node, + }) } } diff --git a/op/reboot_decide.go b/op/reboot_decide.go index 6f8f388c..c630470d 100644 --- a/op/reboot_decide.go +++ b/op/reboot_decide.go @@ -61,29 +61,54 @@ func enumeratePods(ctx context.Context, cs *kubernetes.Clientset, node string, return nil } -// checkJobPodNotExist checks running Pods on the specified Node. -// It returns an error if a running Pod exists. -func checkJobPodNotExist(ctx context.Context, cs *kubernetes.Clientset, node string) error { - return enumeratePods(ctx, cs, node, func(_ *corev1.Pod) error { - return nil - }, func(pod *corev1.Pod) error { - return fmt.Errorf("job-managed pod exists: %s/%s, phase=%s", pod.Namespace, pod.Name, pod.Status.Phase) - }) +// dryRunEvictOrDeleteNodePod checks eviction or deletion of Pods on the specified Node can proceed. +// It returns an error if a running Pod exists or an eviction of the Pod in protected namespace failed. +func dryRunEvictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool) error { + return doEvictOrDeleteNodePod(ctx, cs, node, protected, 0, 0, true) } // evictOrDeleteNodePod evicts or delete Pods on the specified Node. -// It first tries eviction. If the eviction failed and the Pod's namespace is not protected, it deletes the Pod. // If a running Job Pod exists, this function returns an error. func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, attempts int, interval time.Duration) error { + return doEvictOrDeleteNodePod(ctx, cs, node, protected, attempts, interval, false) +} + +// doEvictOrDeleteNodePod evicts or delete Pods on the specified Node. +// It first tries eviction. +// If the eviction failed and the Pod's namespace is not protected, it deletes the Pod. +// If the eviction failed and the Pod's namespace is protected, it retries after `interval` interval at most `attempts` times. +// If a running Job Pod exists, this function returns an error. +// If `dry` is true, it performs dry run and `attempts` and `interval` are ignored. +func doEvictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, attempts int, interval time.Duration, dry bool) error { + var deleteOptions *metav1.DeleteOptions + if dry { + deleteOptions = &metav1.DeleteOptions{ + DryRun: []string{"All"}, + } + } + return enumeratePods(ctx, cs, node, func(pod *corev1.Pod) error { + if dry && !protected[pod.Namespace] { + // in case of dry-run for Pods in non-protected namespace, + // return immediately because its "eviction or deletion" never fails + log.Info("skip evicting pod because its namespace is not protected", map[string]interface{}{ + "namespace": pod.Namespace, + "name": pod.Name, + "dry": dry, // for consistency + }) + return nil + } + evictCount := 0 EVICT: log.Info("start evicting pod", map[string]interface{}{ "namespace": pod.Namespace, "name": pod.Name, + "dry": dry, }) err := cs.CoreV1().Pods(pod.Namespace).EvictV1(ctx, &policyv1.Eviction{ - ObjectMeta: metav1.ObjectMeta{Name: pod.Name, Namespace: pod.Namespace}, + ObjectMeta: metav1.ObjectMeta{Name: pod.Name, Namespace: pod.Namespace}, + DeleteOptions: deleteOptions, }) evictCount++ switch { @@ -91,6 +116,7 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st log.Info("evicted pod", map[string]interface{}{ "namespace": pod.Namespace, "name": pod.Name, + "dry": dry, }) case apierrors.IsNotFound(err): // already evicted or deleted. @@ -98,9 +124,11 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st // not a PDB related error return fmt.Errorf("failed to evict pod %s/%s: %w", pod.Namespace, pod.Name, err) case !protected[pod.Namespace]: + // not dry here log.Warn("failed to evict non-protected pod due to PDB", map[string]interface{}{ "namespace": pod.Namespace, "name": pod.Name, + "dry": dry, // for consistency (same for below) log.FnError: err, }) err := cs.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}) @@ -110,9 +138,11 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st log.Warn("deleted non-protected pod", map[string]interface{}{ "namespace": pod.Namespace, "name": pod.Name, + "dry": dry, }) default: - if evictCount < attempts { + // in case of dry-run, do not retry. + if !dry && evictCount < attempts { select { case <-ctx.Done(): return ctx.Err() @@ -121,6 +151,7 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st log.Info("retry eviction of pod", map[string]interface{}{ "namespace": pod.Namespace, "name": pod.Name, + "dry": dry, }) goto EVICT } diff --git a/op/repair_drain_start.go b/op/repair_drain_start.go index b537146b..02a848ff 100644 --- a/op/repair_drain_start.go +++ b/op/repair_drain_start.go @@ -6,6 +6,7 @@ import ( "time" "github.com/cybozu-go/cke" + "github.com/cybozu-go/log" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" ) @@ -87,10 +88,20 @@ func (c repairDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure return err } - err = checkJobPodNotExist(ctx, cs, c.entry.Nodename) + log.Info("start eviction dry-run", map[string]interface{}{ + "address": c.entry.Address, + }) + err = dryRunEvictOrDeleteNodePod(ctx, cs, c.entry.Nodename, protected) if err != nil { + log.Warn("eviction dry-run failed", map[string]interface{}{ + "address": c.entry.Address, + log.FnError: err, + }) return err } + log.Info("eviction dry-run succeeded", map[string]interface{}{ + "address": c.entry.Address, + }) // Note: The annotation name is shared with reboot operations. _, err = nodesAPI.Patch(ctx, c.entry.Nodename, types.StrategicMergePatchType, []byte(` @@ -109,10 +120,20 @@ func (c repairDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure return repairDrainBackOff(ctx, inf, c.entry, err) } + log.Info("start eviction", map[string]interface{}{ + "address": c.entry.Address, + }) err = evictOrDeleteNodePod(ctx, cs, c.entry.Nodename, protected, c.evictAttempts, c.evictInterval) if err != nil { + log.Warn("eviction failed", map[string]interface{}{ + "address": c.entry.Address, + log.FnError: err, + }) return repairDrainBackOff(ctx, inf, c.entry, err) } + log.Info("eviction succeeded", map[string]interface{}{ + "address": c.entry.Address, + }) return nil }