From 5e5465be280498bd22d7671ee4838ed234f198ab Mon Sep 17 00:00:00 2001
From: Oleg Vasilev
Date: Tue, 24 Sep 2024 16:00:50 +0200
Subject: [PATCH] neonvm-controller: recreate the pod for VM even if the older pod still exists (#1055)

We previously assumed that running multiple pods for one VM was
dangerous. We now believe it should be fine, even if the old pod still
tries to continue operating: the new pod will take over the safekeepers
quorum, leaving the old one to disconnect and shut down.

Signed-off-by: Oleg Vasilev
---
 neonvm-controller/cmd/main.go               |  5 ++++
 pkg/neonvm/controllers/config.go            |  3 ++
 .../functests/vm_controller_test.go         |  1 +
 pkg/neonvm/controllers/vm_controller.go     | 28 ++++---------------
 .../controllers/vm_controller_unit_test.go  |  1 +
 5 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/neonvm-controller/cmd/main.go b/neonvm-controller/cmd/main.go
index fb8046f85..c56eb6db8 100644
--- a/neonvm-controller/cmd/main.go
+++ b/neonvm-controller/cmd/main.go
@@ -100,6 +100,7 @@ func main() {
 	var memhpAutoMovableRatio string
 	var failurePendingPeriod time.Duration
 	var failingRefreshInterval time.Duration
+	var atMostOnePod bool
 	flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
 	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
 	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
@@ -140,6 +141,9 @@ func main() {
 		"the period for the propagation of reconciliation failures to the observability instruments")
 	flag.DurationVar(&failingRefreshInterval, "failing-refresh-interval", 1*time.Minute,
 		"the interval between consecutive updates of metrics and logs, related to failing reconciliations")
+	flag.BoolVar(&atMostOnePod, "at-most-one-pod", false,
+		"If true, the controller will ensure that at most one pod is running at a time. "+
+			"Otherwise, the outdated pod might be left to terminate while the new one is already running.")
 	flag.Parse()

 	if defaultMemoryProvider == "" {
@@ -195,6 +199,7 @@ func main() {
 		MemhpAutoMovableRatio:   memhpAutoMovableRatio,
 		FailurePendingPeriod:    failurePendingPeriod,
 		FailingRefreshInterval:  failingRefreshInterval,
+		AtMostOnePod:            atMostOnePod,
 	}

 	vmReconciler := &controllers.VMReconciler{
diff --git a/pkg/neonvm/controllers/config.go b/pkg/neonvm/controllers/config.go
index eb81d98b2..69616e73d 100644
--- a/pkg/neonvm/controllers/config.go
+++ b/pkg/neonvm/controllers/config.go
@@ -50,4 +50,7 @@ type ReconcilerConfig struct {
 	// FailingRefreshInterval is the interval between consecutive
 	// updates of metrics and logs, related to failing reconciliations
 	FailingRefreshInterval time.Duration
+
+	// AtMostOnePod indicates whether we should have at most one pod per VM.
+	AtMostOnePod bool
 }
diff --git a/pkg/neonvm/controllers/functests/vm_controller_test.go b/pkg/neonvm/controllers/functests/vm_controller_test.go
index 542dcb88b..8b3987b6c 100644
--- a/pkg/neonvm/controllers/functests/vm_controller_test.go
+++ b/pkg/neonvm/controllers/functests/vm_controller_test.go
@@ -115,6 +115,7 @@ var _ = Describe("VirtualMachine controller", func() {
 				MemhpAutoMovableRatio:  "301",
 				FailurePendingPeriod:   1 * time.Minute,
 				FailingRefreshInterval: 1 * time.Minute,
+				AtMostOnePod:           false,
 			},
 		}

diff --git a/pkg/neonvm/controllers/vm_controller.go b/pkg/neonvm/controllers/vm_controller.go
index a578bee0f..7a77668b6 100644
--- a/pkg/neonvm/controllers/vm_controller.go
+++ b/pkg/neonvm/controllers/vm_controller.go
@@ -776,13 +776,12 @@ func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine
 			return err
 		}

-		// We must keep the VM status the same until we know the neonvm-runner container has been
-		// terminated, otherwise we could end up starting a new runner pod while the VM in the old
-		// one is still running.
+		// By default, we clean up the VM even if the previous pod still exists. This behavior is for
+		// the case where the pod is stuck deleting, and we want to progress without waiting for it.
 		//
-		// Note that this is required because 'VmSucceeded' and 'VmFailed' are true if *at least
-		// one* container inside the runner pod has finished; the VM itself may still be running.
-		if apierrors.IsNotFound(err) || runnerContainerStopped(vmRunner) {
+		// However, this opens up the possibility of cascading failures, where pods are constantly
+		// recreated and then get stuck deleting. That's why we have AtMostOnePod.
+		if !r.Config.AtMostOnePod || apierrors.IsNotFound(err) {
 			// NB: Cleanup() leaves status .Phase and .RestartCount (+ some others) but unsets other fields.
 			vm.Cleanup()

@@ -963,23 +962,6 @@ func runnerStatus(pod *corev1.Pod) runnerStatusKind {
 	}
 }

-// runnerContainerStopped returns true iff the neonvm-runner container has exited.
-//
-// The guarantee is simple: It is only safe to start a new runner pod for a VM if
-// runnerContainerStopped returns true (otherwise, we may end up with >1 instance of the same VM).
-func runnerContainerStopped(pod *corev1.Pod) bool {
-	if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
-		return true
-	}
-
-	for _, stat := range pod.Status.ContainerStatuses {
-		if stat.Name == "neonvm-runner" {
-			return stat.State.Terminated != nil
-		}
-	}
-	return false
-}
-
 // deleteRunnerPodIfEnabled deletes the runner pod if buildtag.NeverDeleteRunnerPods is false, and
 // then emits an event and log line about what it did, whether it actually deleted the runner pod.
 func (r *VMReconciler) deleteRunnerPodIfEnabled(
diff --git a/pkg/neonvm/controllers/vm_controller_unit_test.go b/pkg/neonvm/controllers/vm_controller_unit_test.go
index 139252924..f77d8d1a2 100644
--- a/pkg/neonvm/controllers/vm_controller_unit_test.go
+++ b/pkg/neonvm/controllers/vm_controller_unit_test.go
@@ -123,6 +123,7 @@ func newTestParams(t *testing.T) *testParams {
 			MemhpAutoMovableRatio:  "301",
 			FailurePendingPeriod:   time.Minute,
 			FailingRefreshInterval: time.Minute,
+			AtMostOnePod:           false,
 		},
 		Metrics: reconcilerMetrics,
 	}
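
The heart of the patch is the new gating condition in doReconcile. Below is a
minimal standalone sketch of that condition; the helper name shouldCleanupVM
and the oldPodGone flag are illustrative (not from the patch), and the real
code checks apierrors.IsNotFound on the error from fetching the runner pod.

    package main

    import "fmt"

    // shouldCleanupVM mirrors the new gating in doReconcile: by default the
    // controller proceeds to clean up the VM (allowing a new runner pod to be
    // created) even if the old pod still exists; with atMostOnePod set, it
    // waits until the old pod is fully gone, preserving the guarantee of at
    // most one runner pod per VM at any time.
    func shouldCleanupVM(atMostOnePod, oldPodGone bool) bool {
    	return !atMostOnePod || oldPodGone
    }

    func main() {
    	fmt.Println(shouldCleanupVM(false, false)) // true:  default, recreate even though the old pod lingers
    	fmt.Println(shouldCleanupVM(true, false))  // false: at-most-one-pod, wait for full deletion
    	fmt.Println(shouldCleanupVM(true, true))   // true:  old pod gone, safe either way
    }

The tradeoff is the one spelled out in the patch comments: proceeding by
default avoids getting blocked behind a pod stuck in deletion, while
-at-most-one-pod trades that liveness for the old guarantee that two runner
pods for the same VM never run concurrently.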