From a6fcbba54173434e5ee964dd0d491fec69148049 Mon Sep 17 00:00:00 2001
From: Misha Sakhnov
Date: Tue, 27 Aug 2024 15:12:57 +0300
Subject: [PATCH] plugin: add warning log message for pods with an assigned scheduler (#1037)

plugin: add warning log message for pods with an assigned scheduler,
nodeName, or nodeSelector

closes #93

Signed-off-by: Misha Sakhnov
---
 pkg/plugin/plugin.go |  3 ++-
 pkg/plugin/state.go  | 18 ++++++++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/pkg/plugin/plugin.go b/pkg/plugin/plugin.go
index cf8aa3250..95c425a01 100644
--- a/pkg/plugin/plugin.go
+++ b/pkg/plugin/plugin.go
@@ -153,7 +153,7 @@ func makeAutoscaleEnforcerPlugin(
 				incEventCount()
 			}
 			pushToQueue(logger, pod.Name, func() {
-				p.handleStarted(hlogger, pod)
+				p.handleStarted(hlogger, pod, preexisting)
 				if preexisting {
 					initEvents.dec()
 				}
@@ -822,6 +822,7 @@ func (e *AutoscaleEnforcer) Reserve(
 		// don't include buffer because we know that future changes by the autoscaler-agent must go
 		// through us.
 		includeBuffer: false,
+		preexisting:   false,
 	})
 	if err != nil {
 		return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
diff --git a/pkg/plugin/state.go b/pkg/plugin/state.go
index ce134e94e..68eea4db1 100644
--- a/pkg/plugin/state.go
+++ b/pkg/plugin/state.go
@@ -572,7 +572,7 @@ func (e *AutoscaleEnforcer) handleNodeDeletion(logger *zap.Logger, nodeName stri
 // otherwise, we might (a) ignore resources from pods that weren't scheduled here, or (b) fail to
 // include pods that *were* scheduled here, but had spurious Unreserves.
 // (for more, see: https://github.com/neondatabase/autoscaling/pull/435)
-func (e *AutoscaleEnforcer) handleStarted(logger *zap.Logger, pod *corev1.Pod) {
+func (e *AutoscaleEnforcer) handleStarted(logger *zap.Logger, pod *corev1.Pod, preexisting bool) {
 	nodeName := pod.Spec.NodeName

 	logger = logger.With(
@@ -592,12 +592,14 @@ func (e *AutoscaleEnforcer) handleStarted(logger *zap.Logger, pod *corev1.Pod)
 		// this may be a preexisting VM. If so, we should include it in "buffer" as long it's
 		// supposed to be handled by us (otherwise, the "buffer" will never be resolved)
 		includeBuffer: pod.Spec.SchedulerName == e.state.conf.SchedulerName,
+		preexisting:   preexisting,
 	})
 }

 type reserveOptions struct {
 	allowDeny     bool
 	includeBuffer bool
+	preexisting   bool
 }

 // reserveResources attempts to set aside resources on the node for the pod.
@@ -630,13 +632,25 @@ func (e *AutoscaleEnforcer) reserveResources(
 	e.state.lock.Lock()
 	defer e.state.lock.Unlock()

+	podName := util.GetNamespacedName(pod)
 	// If the pod already exists, nothing to do
-	if _, ok := e.state.pods[util.GetNamespacedName(pod)]; ok {
+	_, isPodInState := e.state.pods[podName]
+	if isPodInState {
 		logger.Info("Pod already exists in global state")
 		return true, &verdictSet{cpu: "", mem: ""}, nil
 	}

+	// If all of the following conditions are met, the pod has bypassed the neon scheduler, which
+	// might be a sign of a bug or misbehavior:
+	// - the pod is assigned to the autoscaler scheduler
+	// - the pod is not in the state
+	// - the pod is not a preexisting pod
+	// - the pod has a node name
+	if !isPodInState && !opts.preexisting && pod.Spec.SchedulerName == e.state.conf.SchedulerName && pod.Spec.NodeName != "" {
+		logger.Warn("Pod has bypassed neon scheduler")
+	}
+
 	// Get information about the node
 	node, err := e.state.getOrFetchNodeState(ctx, logger, e.metrics, e.nodeStore, nodeName)
 	if err != nil {
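
Note: below is a minimal, self-contained sketch of the bypass check this patch adds, for illustration only. bypassedScheduler, schedulerName, and the inState/preexisting flags are hypothetical stand-ins for the plugin's config (e.state.conf.SchedulerName) and global state (e.state.pods), and "autoscale-scheduler" is a placeholder scheduler name.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// bypassedScheduler reports whether a pod assigned to our scheduler arrived
// with a node already chosen, i.e. it never went through our Reserve step.
// inState and preexisting stand in for the plugin's global state and startup
// tracking; schedulerName stands in for e.state.conf.SchedulerName.
func bypassedScheduler(pod *corev1.Pod, schedulerName string, inState, preexisting bool) bool {
	return !inState &&
		!preexisting &&
		pod.Spec.SchedulerName == schedulerName &&
		pod.Spec.NodeName != ""
}

func main() {
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "vm-runner-abc", Namespace: "default"},
		Spec: corev1.PodSpec{
			SchedulerName: "autoscale-scheduler",
			NodeName:      "node-1", // already bound to a node, so Reserve never ran for it
		},
	}
	if bypassedScheduler(pod, "autoscale-scheduler", false, false) {
		fmt.Println("WARN: Pod has bypassed neon scheduler")
	}
}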