diff --git a/neonvm-controller/cmd/main.go b/neonvm-controller/cmd/main.go
index fb8046f85..c56eb6db8 100644
--- a/neonvm-controller/cmd/main.go
+++ b/neonvm-controller/cmd/main.go
@@ -100,6 +100,7 @@ func main() {
 	var memhpAutoMovableRatio string
 	var failurePendingPeriod time.Duration
 	var failingRefreshInterval time.Duration
+	var atMostOnePod bool
 	flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
 	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
 	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
@@ -140,6 +141,9 @@ func main() {
 		"the period for the propagation of reconciliation failures to the observability instruments")
 	flag.DurationVar(&failingRefreshInterval, "failing-refresh-interval", 1*time.Minute,
 		"the interval between consecutive updates of metrics and logs, related to failing reconciliations")
+	flag.BoolVar(&atMostOnePod, "at-most-one-pod", false,
+		"If true, the controller will ensure that at most one pod is running at a time. "+
+			"Otherwise, the outdated pod might be left to terminate, while the new one is already running.")
 	flag.Parse()
 
 	if defaultMemoryProvider == "" {
@@ -195,6 +199,7 @@ func main() {
 		MemhpAutoMovableRatio:  memhpAutoMovableRatio,
 		FailurePendingPeriod:   failurePendingPeriod,
 		FailingRefreshInterval: failingRefreshInterval,
+		AtMostOnePod:           atMostOnePod,
 	}
 
 	vmReconciler := &controllers.VMReconciler{
diff --git a/pkg/neonvm/controllers/config.go b/pkg/neonvm/controllers/config.go
index eb81d98b2..69616e73d 100644
--- a/pkg/neonvm/controllers/config.go
+++ b/pkg/neonvm/controllers/config.go
@@ -50,4 +50,7 @@ type ReconcilerConfig struct {
 	// FailingRefreshInterval is the interval between consecutive
 	// updates of metrics and logs, related to failing reconciliations
 	FailingRefreshInterval time.Duration
+
+	// AtMostOnePod is the flag that indicates whether we should only have one pod per VM.
+	AtMostOnePod bool
 }
diff --git a/pkg/neonvm/controllers/functests/vm_controller_test.go b/pkg/neonvm/controllers/functests/vm_controller_test.go
index 542dcb88b..8b3987b6c 100644
--- a/pkg/neonvm/controllers/functests/vm_controller_test.go
+++ b/pkg/neonvm/controllers/functests/vm_controller_test.go
@@ -115,6 +115,7 @@ var _ = Describe("VirtualMachine controller", func() {
 				MemhpAutoMovableRatio:  "301",
 				FailurePendingPeriod:   1 * time.Minute,
 				FailingRefreshInterval: 1 * time.Minute,
+				AtMostOnePod:           false,
 			},
 		}
 
diff --git a/pkg/neonvm/controllers/vm_controller.go b/pkg/neonvm/controllers/vm_controller.go
index 338991bb2..d06bf5f1f 100644
--- a/pkg/neonvm/controllers/vm_controller.go
+++ b/pkg/neonvm/controllers/vm_controller.go
@@ -772,13 +772,12 @@ func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine)
 			return err
 		}
 
-		// We must keep the VM status the same until we know the neonvm-runner container has been
-		// terminated, otherwise we could end up starting a new runner pod while the VM in the old
-		// one is still running.
+		// By default, we cleanup the VM, even if previous pod still exists. This behavior is for the case
+		// where the pod is stuck deleting, and we want to progress without waiting for it.
 		//
-		// Note that this is required because 'VmSucceeded' and 'VmFailed' are true if *at least
-		// one* container inside the runner pod has finished; the VM itself may still be running.
-		if apierrors.IsNotFound(err) || runnerContainerStopped(vmRunner) {
+		// However, this opens up a possibility for cascading failures where the pods would be constantly
+		// recreated, and then stuck deleting. That's why we have AtMostOnePod.
+		if !r.Config.AtMostOnePod || apierrors.IsNotFound(err) {
 			// NB: Cleanup() leaves status .Phase and .RestartCount (+ some others) but unsets other fields.
 			vm.Cleanup()
 
@@ -957,23 +956,6 @@ func runnerStatus(pod *corev1.Pod) runnerStatusKind {
 	}
 }
 
-// runnerContainerStopped returns true iff the neonvm-runner container has exited.
-//
-// The guarantee is simple: It is only safe to start a new runner pod for a VM if
-// runnerContainerStopped returns true (otherwise, we may end up with >1 instance of the same VM).
-func runnerContainerStopped(pod *corev1.Pod) bool {
-	if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
-		return true
-	}
-
-	for _, stat := range pod.Status.ContainerStatuses {
-		if stat.Name == "neonvm-runner" {
-			return stat.State.Terminated != nil
-		}
-	}
-	return false
-}
-
 // deleteRunnerPodIfEnabled deletes the runner pod if buildtag.NeverDeleteRunnerPods is false, and
 // then emits an event and log line about what it did, whether it actually deleted the runner pod.
 func (r *VMReconciler) deleteRunnerPodIfEnabled(
diff --git a/pkg/neonvm/controllers/vm_controller_unit_test.go b/pkg/neonvm/controllers/vm_controller_unit_test.go
index 139252924..f77d8d1a2 100644
--- a/pkg/neonvm/controllers/vm_controller_unit_test.go
+++ b/pkg/neonvm/controllers/vm_controller_unit_test.go
@@ -123,6 +123,7 @@ func newTestParams(t *testing.T) *testParams {
 			MemhpAutoMovableRatio:  "301",
 			FailurePendingPeriod:   time.Minute,
 			FailingRefreshInterval: time.Minute,
+			AtMostOnePod:           false,
 		},
 		Metrics: reconcilerMetrics,
 	}
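
For reference, a minimal standalone sketch of the condition this patch introduces in doReconcile, `!r.Config.AtMostOnePod || apierrors.IsNotFound(err)`. It is not part of the patch; the helper name mayCleanupRunner and the boolean runnerPodGone are hypothetical stand-ins for the reconciler config field and the IsNotFound check on the old runner pod.

package main

import "fmt"

// mayCleanupRunner is a hypothetical stand-in for the new check in doReconcile:
// it reports whether the reconciler may call vm.Cleanup() and go on to create a
// new runner pod, given the AtMostOnePod setting and whether the old pod is gone.
func mayCleanupRunner(atMostOnePod, runnerPodGone bool) bool {
	// atMostOnePod == false (the default): clean up even if the previous pod is
	// stuck terminating, so reconciliation keeps making progress.
	// atMostOnePod == true: only clean up once the old pod no longer exists,
	// guaranteeing at most one runner pod per VM at any time.
	return !atMostOnePod || runnerPodGone
}

func main() {
	fmt.Println(mayCleanupRunner(false, false)) // true: progress despite a stuck pod
	fmt.Println(mayCleanupRunner(true, false))  // false: wait for the old pod to disappear
	fmt.Println(mayCleanupRunner(true, true))   // true: old pod gone, safe to recreate
}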