From a94ff4de50648e6825099fe4477f73eed2ec479b Mon Sep 17 00:00:00 2001
From: Em Sharnoff
Date: Sun, 17 Nov 2024 20:45:19 -0800
Subject: [PATCH] scheduler: Shorten tolerations for node failure

Similar to what was done in #1055, we need to explicitly add tolerations
to the scheduler to get it to be recreated more quickly on node failure.

This is particularly necessary because we don't have #955. We could wait
for that, but it's a lot of work, and this is a small thing we can do in
the meantime.

Fixes neondatabase/cloud#17298.
---
Review notes (free-form text in this section is not part of the commit):
  * Adds two NoExecute tolerations to autoscale-scheduler/deployment.yaml,
    one for node.kubernetes.io/not-ready and one for
    node.kubernetes.io/unreachable, each with tolerationSeconds: 30.
  * Neither toleration sets `operator`; presumably the API default is
    intended. TODO: confirm the default matches the node-failure taints.
  * The hunk only appends after the existing plugin-config-volume entry;
    no existing lines are modified.

 autoscale-scheduler/deployment.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/autoscale-scheduler/deployment.yaml b/autoscale-scheduler/deployment.yaml
index 409002c59..d4f7dc9e2 100644
--- a/autoscale-scheduler/deployment.yaml
+++ b/autoscale-scheduler/deployment.yaml
@@ -63,3 +63,13 @@ spec:
         - name: plugin-config-volume
           configMap:
             name: scheduler-plugin-config
+
+      tolerations:
+        # Add explicit (short) tolerations for node failure, because otherwise the default of 5m
+        # will be used, which is unacceptably long for us.
+        - key: node.kubernetes.io/not-ready
+          tolerationSeconds: 30
+          effect: NoExecute
+        - key: node.kubernetes.io/unreachable
+          tolerationSeconds: 30
+          effect: NoExecute