Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perform eviction dry-run during node reboot feature #736

Merged
merged 1 commit into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions mtest/assets_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ var rebootJobCompletedYAML []byte
//go:embed reboot-job-running.yaml
var rebootJobRunningYAML []byte

//go:embed reboot-eviction-dry-run.yaml
var rebootEvictionDryRunYAML []byte

//go:embed reboot-slow-eviction-deployment.yaml
var rebootSlowEvictionDeploymentYAML []byte

Expand Down
81 changes: 81 additions & 0 deletions mtest/reboot-eviction-dry-run.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
apiVersion: apps/v1
kind: Deployment
metadata:
namespace: reboot-test
name: 1-not-evictable
spec:
replicas: 1
selector:
matchLabels:
reboot-app: 1-not-evictable
template:
metadata:
labels:
reboot-app: 1-not-evictable
spec:
containers:
- name: httpd
image: ghcr.io/cybozu/testhttpd:0
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
namespace: reboot-test
name: 1-not-evictable
spec:
maxUnavailable: 0
selector:
matchLabels:
reboot-app: 1-not-evictable
---
apiVersion: apps/v1
kind: Deployment
metadata:
namespace: reboot-test
name: 0-evictable
spec:
replicas: 1
selector:
matchLabels:
reboot-app: 0-evictable
template:
metadata:
labels:
reboot-app: 0-evictable
spec:
containers:
- name: httpd
image: ghcr.io/cybozu/testhttpd:0
affinity:
podAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchLabels:
reboot-app: 1-not-evictable
topologyKey: kubernetes.io/hostname
---
apiVersion: apps/v1
kind: Deployment
metadata:
namespace: reboot-test
name: 2-evictable
spec:
replicas: 1
selector:
matchLabels:
reboot-app: 2-evictable
template:
metadata:
labels:
reboot-app: 2-evictable
spec:
containers:
- name: httpd
image: ghcr.io/cybozu/testhttpd:0
affinity:
podAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchLabels:
reboot-app: 1-not-evictable
topologyKey: kubernetes.io/hostname
44 changes: 44 additions & 0 deletions mtest/reboot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,50 @@ func testRebootOperations() {
Expect(err).ShouldNot(HaveOccurred(), "stderr: %s", stderr)
})

It("checks eviction dry run behavior", func() {
deploymentNames := []string{"0-evictable", "1-not-evictable", "2-evictable"}

By("Deploying Pods")
_, stderr, err := kubectlWithInput(rebootEvictionDryRunYAML, "apply", "-f", "-")
Expect(err).ShouldNot(HaveOccurred(), "stderr: %s", stderr)

creationTime := map[string]time.Time{}
var nodeName string
for _, name := range deploymentNames {
checkDeploymentEventually("reboot-test", name)
pods := getPodListGomega(Default, "reboot-test", "-l=reboot-app="+name)
Expect(pods.Items).Should(HaveLen(1), "pod is not created")
pod := &pods.Items[0]
Expect(pod.Status.Phase).Should(Equal(corev1.PodRunning), "pod is not running")
creationTime[name] = pod.ObjectMeta.CreationTimestamp.Time
nodeName = pod.Spec.NodeName // all pods run on the same node
}

By("Reboot operation will not delete evictable pods if non-evictable pods exist on the same node")
rebootQueueAdd([]string{nodeName})
rebootShouldNotProceed()

for _, name := range deploymentNames {
pods := getPodListGomega(Default, "reboot-test", "-l=reboot-app="+name)
Expect(pods.Items).Should(HaveLen(1), "pod does not exist (probably evicted)")
pod := pods.Items[0]
Expect(pod.ObjectMeta.CreationTimestamp.Time).Should(Equal(creationTime[name]), "pod by deployment %s is recreated (probably evicted)", name)
}

By("Cleaning up")
rebootQueueCancelAllAndWait(cluster)

_, stderr, err = kubectlWithInput(rebootEvictionDryRunYAML, "delete", "-f", "-")
Expect(err).ShouldNot(HaveOccurred(), "stderr: %s", stderr)

for _, name := range deploymentNames {
Eventually(func(g Gomega) {
deploymentPods := getPodListGomega(g, "reboot-test", "-l=reboot-app="+name)
g.Expect(deploymentPods.Items).Should(HaveLen(0), "Pod does not terminate")
}).Should(Succeed())
}
})

It("checks drain timeout behavior", func() {
By("Deploying a slow eviction pod")
_, stderr, err := kubectlWithInput(rebootSlowEvictionDeploymentYAML, "apply", "-f", "-")
Expand Down
23 changes: 21 additions & 2 deletions op/reboot.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,20 @@ func (c rebootDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure
// This "check and cordon" order may overlook Job pods just started.
// On the other hand, "cordon and check" may cause excessive cordon.
// The overlook is acceptable because it is rare case and detected by the following evictOrDeleteNodePod().

err = checkJobPodNotExist(ctx, cs, entry.Node)
log.Info("start eviction dry-run", map[string]interface{}{
"name": entry.Node,
})
err = dryRunEvictOrDeleteNodePod(ctx, cs, entry.Node, protected)
if err != nil {
log.Warn("eviction dry-run failed", map[string]interface{}{
"name": entry.Node,
log.FnError: err,
})
return err
}
log.Info("eviction dry-run succeeded", map[string]interface{}{
"name": entry.Node,
})

_, err = nodesAPI.Patch(ctx, entry.Node, types.StrategicMergePatchType, []byte(`
{
Expand All @@ -169,13 +178,23 @@ func (c rebootDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure

// next, evict pods on each node
for _, entry := range evictNodes {
log.Info("start eviction", map[string]interface{}{
"name": entry.Node,
})
err := evictOrDeleteNodePod(ctx, cs, entry.Node, protected, c.evictAttempts, c.evictInterval)
if err != nil {
log.Warn("eviction failed", map[string]interface{}{
"name": entry.Node,
log.FnError: err,
})
c.notifyFailedNode(entry.Node)
err = drainBackOff(ctx, inf, entry, err)
if err != nil {
return err
}
log.Info("eviction succeeded", map[string]interface{}{
"name": entry.Node,
})
}
}

Expand Down
53 changes: 42 additions & 11 deletions op/reboot_decide.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,46 +61,74 @@ func enumeratePods(ctx context.Context, cs *kubernetes.Clientset, node string,
return nil
}

// checkJobPodNotExist checks running Pods on the specified Node.
// It returns an error if a running Pod exists.
func checkJobPodNotExist(ctx context.Context, cs *kubernetes.Clientset, node string) error {
return enumeratePods(ctx, cs, node, func(_ *corev1.Pod) error {
return nil
}, func(pod *corev1.Pod) error {
return fmt.Errorf("job-managed pod exists: %s/%s, phase=%s", pod.Namespace, pod.Name, pod.Status.Phase)
})
// dryRunEvictOrDeleteNodePod checks eviction or deletion of Pods on the specified Node can proceed.
// It returns an error if a running Pod exists or an eviction of the Pod in protected namespace failed.
func dryRunEvictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool) error {
return doEvictOrDeleteNodePod(ctx, cs, node, protected, 0, 0, true)
}

// evictOrDeleteNodePod evicts or delete Pods on the specified Node.
// It first tries eviction. If the eviction failed and the Pod's namespace is not protected, it deletes the Pod.
// If a running Job Pod exists, this function returns an error.
func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, attempts int, interval time.Duration) error {
return doEvictOrDeleteNodePod(ctx, cs, node, protected, attempts, interval, false)
}

// doEvictOrDeleteNodePod evicts or delete Pods on the specified Node.
// It first tries eviction.
// If the eviction failed and the Pod's namespace is not protected, it deletes the Pod.
// If the eviction failed and the Pod's namespace is protected, it retries after `interval` interval at most `attempts` times.
// If a running Job Pod exists, this function returns an error.
// If `dry` is true, it performs dry run and `attempts` and `interval` are ignored.
func doEvictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, attempts int, interval time.Duration, dry bool) error {
var deleteOptions *metav1.DeleteOptions
if dry {
deleteOptions = &metav1.DeleteOptions{
DryRun: []string{"All"},
}
}

return enumeratePods(ctx, cs, node, func(pod *corev1.Pod) error {
if dry && !protected[pod.Namespace] {
// in case of dry-run for Pods in non-protected namespace,
// return immediately because its "eviction or deletion" never fails
log.Info("skip evicting pod because its namespace is not protected", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
"dry": dry, // for consistency
})
return nil
}

evictCount := 0
EVICT:
log.Info("start evicting pod", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
"dry": dry,
})
err := cs.CoreV1().Pods(pod.Namespace).EvictV1(ctx, &policyv1.Eviction{
ObjectMeta: metav1.ObjectMeta{Name: pod.Name, Namespace: pod.Namespace},
ObjectMeta: metav1.ObjectMeta{Name: pod.Name, Namespace: pod.Namespace},
DeleteOptions: deleteOptions,
})
evictCount++
switch {
case err == nil:
log.Info("evicted pod", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
"dry": dry,
})
case apierrors.IsNotFound(err):
// already evicted or deleted.
case !apierrors.IsTooManyRequests(err):
// not a PDB related error
return fmt.Errorf("failed to evict pod %s/%s: %w", pod.Namespace, pod.Name, err)
case !protected[pod.Namespace]:
// not dry here
log.Warn("failed to evict non-protected pod due to PDB", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
"dry": dry, // for consistency (same for below)
log.FnError: err,
})
err := cs.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
Expand All @@ -110,9 +138,11 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st
log.Warn("deleted non-protected pod", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
"dry": dry,
})
default:
if evictCount < attempts {
// in case of dry-run, do not retry.
if !dry && evictCount < attempts {
select {
case <-ctx.Done():
return ctx.Err()
Expand All @@ -121,6 +151,7 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st
log.Info("retry eviction of pod", map[string]interface{}{
"namespace": pod.Namespace,
"name": pod.Name,
"dry": dry,
})
goto EVICT
}
Expand Down
23 changes: 22 additions & 1 deletion op/repair_drain_start.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"time"

"github.com/cybozu-go/cke"
"github.com/cybozu-go/log"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
)
Expand Down Expand Up @@ -87,10 +88,20 @@ func (c repairDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure
return err
}

err = checkJobPodNotExist(ctx, cs, c.entry.Nodename)
log.Info("start eviction dry-run", map[string]interface{}{
"address": c.entry.Address,
})
err = dryRunEvictOrDeleteNodePod(ctx, cs, c.entry.Nodename, protected)
if err != nil {
log.Warn("eviction dry-run failed", map[string]interface{}{
"address": c.entry.Address,
log.FnError: err,
})
return err
}
log.Info("eviction dry-run succeeded", map[string]interface{}{
"address": c.entry.Address,
})

// Note: The annotation name is shared with reboot operations.
_, err = nodesAPI.Patch(ctx, c.entry.Nodename, types.StrategicMergePatchType, []byte(`
Expand All @@ -109,10 +120,20 @@ func (c repairDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure
return repairDrainBackOff(ctx, inf, c.entry, err)
}

log.Info("start eviction", map[string]interface{}{
"address": c.entry.Address,
})
err = evictOrDeleteNodePod(ctx, cs, c.entry.Nodename, protected, c.evictAttempts, c.evictInterval)
if err != nil {
log.Warn("eviction failed", map[string]interface{}{
"address": c.entry.Address,
log.FnError: err,
})
return repairDrainBackOff(ctx, inf, c.entry, err)
}
log.Info("eviction succeeded", map[string]interface{}{
"address": c.entry.Address,
})

return nil
}
Expand Down