From a94ff4de50648e6825099fe4477f73eed2ec479b Mon Sep 17 00:00:00 2001
From: Em Sharnoff <sharnoff@neon.tech>
Date: Sun, 17 Nov 2024 20:45:19 -0800
Subject: [PATCH] scheduler: Shorten tolerations for node failure

Similar to what was done in #1055, we need to explicitly add tolerations
to the scheduler to get it to be recreated more quickly on node failure.

This is particularly necessary because we don't have #955. We could wait
for that, but it's a lot of work, and this is a small thing we can do in
the meantime.

Fixes neondatabase/cloud#17298.
---
 autoscale-scheduler/deployment.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/autoscale-scheduler/deployment.yaml b/autoscale-scheduler/deployment.yaml
index 409002c59..d4f7dc9e2 100644
--- a/autoscale-scheduler/deployment.yaml
+++ b/autoscale-scheduler/deployment.yaml
@@ -63,3 +63,13 @@ spec:
         - name: plugin-config-volume
           configMap:
             name: scheduler-plugin-config
+
+      tolerations:
+        # Without explicit tolerations, the default of 5m applies on node failure, which is
+        # unacceptably long for us, so tolerate the node-failure taints for only 30s instead.
+        - key: node.kubernetes.io/not-ready
+          effect: NoExecute
+          tolerationSeconds: 30
+        - key: node.kubernetes.io/unreachable
+          effect: NoExecute
+          tolerationSeconds: 30