neonvm-controller: recreate the pod for VM even if the older pod still exists (#1055)

We have previously assumed that running multiple pods for one VM is
dangerous.

We now believe it should be fine, even if the old pod still tries to
continue operating: the new pod would take over the safekeeper quorum,
leaving the old one to disconnect and shut down.

Signed-off-by: Oleg Vasilev <[email protected]>
Omrigan committed Oct 15, 2024
1 parent bfa69c2 commit 5e5465b
Showing 5 changed files with 15 additions and 23 deletions.
5 changes: 5 additions & 0 deletions neonvm-controller/cmd/main.go
@@ -100,6 +100,7 @@ func main() {
var memhpAutoMovableRatio string
var failurePendingPeriod time.Duration
var failingRefreshInterval time.Duration
+ var atMostOnePod bool
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
@@ -140,6 +141,9 @@ func main() {
"the period for the propagation of reconciliation failures to the observability instruments")
flag.DurationVar(&failingRefreshInterval, "failing-refresh-interval", 1*time.Minute,
"the interval between consecutive updates of metrics and logs, related to failing reconciliations")
+ flag.BoolVar(&atMostOnePod, "at-most-one-pod", false,
+ 	"If true, the controller will ensure that at most one pod is running at a time. "+
+ 	"Otherwise, the outdated pod might be left to terminate, while the new one is already running.")
flag.Parse()

if defaultMemoryProvider == "" {
@@ -195,6 +199,7 @@ func main() {
MemhpAutoMovableRatio: memhpAutoMovableRatio,
FailurePendingPeriod: failurePendingPeriod,
FailingRefreshInterval: failingRefreshInterval,
+ AtMostOnePod: atMostOnePod,
}

vmReconciler := &controllers.VMReconciler{
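For illustration only, here is a condensed, hypothetical sketch of how the new flag flows from the command line into the reconciler configuration. It is not the actual main.go: the real ReconcilerConfig has many more fields, and reconcilerConfig below is a stand-in defined just for this example.

package main

import (
	"flag"
	"fmt"
)

// reconcilerConfig is a stand-in for controllers.ReconcilerConfig, reduced to
// the single field added in this commit.
type reconcilerConfig struct {
	AtMostOnePod bool
}

func main() {
	var atMostOnePod bool
	// Same flag name and default as registered in neonvm-controller/cmd/main.go.
	flag.BoolVar(&atMostOnePod, "at-most-one-pod", false,
		"If true, the controller will ensure that at most one pod is running at a time.")
	flag.Parse()

	cfg := reconcilerConfig{AtMostOnePod: atMostOnePod}
	fmt.Printf("at-most-one-pod=%v\n", cfg.AtMostOnePod)
}

With the default of false, the controller keeps the new, more permissive behavior; passing -at-most-one-pod=true opts into the conservative behavior of waiting for the old pod to be gone before a new one is created.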
3 changes: 3 additions & 0 deletions pkg/neonvm/controllers/config.go
@@ -50,4 +50,7 @@ type ReconcilerConfig struct {
// FailingRefreshInterval is the interval between consecutive
// updates of metrics and logs, related to failing reconciliations
FailingRefreshInterval time.Duration

+ // AtMostOnePod is the flag that indicates whether we should only have one pod per VM.
+ AtMostOnePod bool
}
1 change: 1 addition & 0 deletions pkg/neonvm/controllers/functests/vm_controller_test.go
@@ -115,6 +115,7 @@ var _ = Describe("VirtualMachine controller", func() {
MemhpAutoMovableRatio: "301",
FailurePendingPeriod: 1 * time.Minute,
FailingRefreshInterval: 1 * time.Minute,
+ AtMostOnePod: false,
},
}

28 changes: 5 additions & 23 deletions pkg/neonvm/controllers/vm_controller.go
@@ -776,13 +776,12 @@ func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine)
return err
}

- // We must keep the VM status the same until we know the neonvm-runner container has been
- // terminated, otherwise we could end up starting a new runner pod while the VM in the old
- // one is still running.
+ // By default, we cleanup the VM, even if previous pod still exists. This behavior is for the case
+ // where the pod is stuck deleting, and we want to progress without waiting for it.
//
- // Note that this is required because 'VmSucceeded' and 'VmFailed' are true if *at least
- // one* container inside the runner pod has finished; the VM itself may still be running.
- if apierrors.IsNotFound(err) || runnerContainerStopped(vmRunner) {
+ // However, this opens up a possibility for cascading failures where the pods would be constantly
+ // recreated, and then stuck deleting. That's why we have AtMostOnePod.
+ if !r.Config.AtMostOnePod || apierrors.IsNotFound(err) {
// NB: Cleanup() leaves status .Phase and .RestartCount (+ some others) but unsets other fields.
vm.Cleanup()

@@ -963,23 +962,6 @@ func runnerStatus(pod *corev1.Pod) runnerStatusKind {
}
}

- // runnerContainerStopped returns true iff the neonvm-runner container has exited.
- //
- // The guarantee is simple: It is only safe to start a new runner pod for a VM if
- // runnerContainerStopped returns true (otherwise, we may end up with >1 instance of the same VM).
- func runnerContainerStopped(pod *corev1.Pod) bool {
- 	if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
- 		return true
- 	}
-
- 	for _, stat := range pod.Status.ContainerStatuses {
- 		if stat.Name == "neonvm-runner" {
- 			return stat.State.Terminated != nil
- 		}
- 	}
- 	return false
- }

// deleteRunnerPodIfEnabled deletes the runner pod if buildtag.NeverDeleteRunnerPods is false, and
// then emits an event and log line about what it did, whether it actually deleted the runner pod.
func (r *VMReconciler) deleteRunnerPodIfEnabled(
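To make the behavioral change concrete, here is a minimal sketch of the new decision in isolation. shouldCleanup is a hypothetical helper invented for this example (the repository keeps the condition inline in doReconcile), and the error values mimic what fetching the runner pod would return.

package main

import (
	"fmt"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime/schema"
)

// shouldCleanup reports whether the controller may clean up the VM status and
// go on to create a new runner pod, given the AtMostOnePod setting and the
// error from fetching the current runner pod.
func shouldCleanup(atMostOnePod bool, getPodErr error) bool {
	// Default (atMostOnePod == false): proceed even if the old pod still
	// exists, e.g. when it is stuck deleting.
	// Strict (atMostOnePod == true): only proceed once the old pod is gone.
	return !atMostOnePod || apierrors.IsNotFound(getPodErr)
}

func main() {
	podGone := apierrors.NewNotFound(schema.GroupResource{Resource: "pods"}, "vm-runner")

	fmt.Println(shouldCleanup(false, nil))    // true: old pod still exists, proceed anyway
	fmt.Println(shouldCleanup(true, nil))     // false: wait for the old pod to disappear
	fmt.Println(shouldCleanup(true, podGone)) // true: pod is gone, safe to proceed
}

With the flag disabled, a duplicate runner pod is tolerated on the assumption that the new pod takes over the safekeeper quorum; with it enabled, cleanup waits until the old pod is fully deleted, avoiding the cascading pod-recreation scenario mentioned in the code comment.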
1 change: 1 addition & 0 deletions pkg/neonvm/controllers/vm_controller_unit_test.go
@@ -123,6 +123,7 @@ func newTestParams(t *testing.T) *testParams {
MemhpAutoMovableRatio: "301",
FailurePendingPeriod: time.Minute,
FailingRefreshInterval: time.Minute,
+ AtMostOnePod: false,
},
Metrics: reconcilerMetrics,
}
