neonvm: recreate the pod for a VM even if the older pod still exists
We previously assumed that running multiple pods for one VM is
dangerous.

We now believe it is fine even if the old pod keeps trying to operate:
the new pod takes over the safekeeper quorum, leaving the old one to
disconnect and shut down.

Signed-off-by: Oleg Vasilev <[email protected]>
Omrigan committed Sep 25, 2024
1 parent 76cfe4f commit 5b1c98c
Showing 1 changed file with 20 additions and 44 deletions.
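
The safety argument in the commit message is essentially quorum fencing. As a rough illustration only (a toy model with invented names, not the actual safekeeper protocol), each safekeeper tracks the highest writer term it has seen and rejects stale writers, so a writer makes progress only while it can win a majority:

package main

import "fmt"

// safekeeper tracks the highest writer term it has acknowledged.
type safekeeper struct{ highestTerm int }

// accept reports whether this safekeeper accepts a writer at the given
// term, bumping its highest seen term when it does.
func (s *safekeeper) accept(term int) bool {
	if term < s.highestTerm {
		return false // stale writer is fenced off
	}
	s.highestTerm = term
	return true
}

// write succeeds only if a majority of safekeepers accept the term.
func write(quorum []*safekeeper, term int) bool {
	acks := 0
	for _, s := range quorum {
		if s.accept(term) {
			acks++
		}
	}
	return acks > len(quorum)/2
}

func main() {
	quorum := []*safekeeper{{}, {}, {}}
	fmt.Println(write(quorum, 1)) // true: the old pod holds the quorum
	fmt.Println(write(quorum, 2)) // true: the new pod takes over at a higher term
	fmt.Println(write(quorum, 1)) // false: the old pod is fenced off
}

Once the new pod has bumped the term on a majority, the old pod can never again assemble a quorum, so the worst case for a duplicate runner pod is a few failed writes before it disconnects and shuts down.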
64 changes: 20 additions & 44 deletions neonvm/controllers/vm_controller.go
@@ -772,35 +772,28 @@ func (r *VMReconciler) doReconcile(ctx context.Context, vm *vmv1.VirtualMachine)
			return err
		}

-		// We must keep the VM status the same until we know the neonvm-runner container has been
-		// terminated, otherwise we could end up starting a new runner pod while the VM in the old
-		// one is still running.
-		//
-		// Note that this is required because 'VmSucceeded' and 'VmFailed' are true if *at least
-		// one* container inside the runner pod has finished; the VM itself may still be running.
-		if apierrors.IsNotFound(err) || runnerContainerStopped(vmRunner) {
-			// NB: Cleanup() leaves status .Phase and .RestartCount (+ some others) but unsets other fields.
-			vm.Cleanup()
-
-			var shouldRestart bool
-			switch vm.Spec.RestartPolicy {
-			case vmv1.RestartPolicyAlways:
-				shouldRestart = true
-			case vmv1.RestartPolicyOnFailure:
-				shouldRestart = vm.Status.Phase == vmv1.VmFailed
-			case vmv1.RestartPolicyNever:
-				shouldRestart = false
-			}
-
-			if shouldRestart {
-				log.Info("Restarting VM runner pod", "VM.Phase", vm.Status.Phase, "RestartPolicy", vm.Spec.RestartPolicy)
-				vm.Status.Phase = vmv1.VmPending // reset to trigger restart
-				vm.Status.RestartCount += 1      // increment restart count
-				r.Metrics.vmRestartCounts.Inc()
-			}
-		}
+		// NB: Cleanup() leaves status .Phase and .RestartCount (+ some others) but unsets other fields.
+		vm.Cleanup()
+
+		var shouldRestart bool
+		switch vm.Spec.RestartPolicy {
+		case vmv1.RestartPolicyAlways:
+			shouldRestart = true
+		case vmv1.RestartPolicyOnFailure:
+			shouldRestart = vm.Status.Phase == vmv1.VmFailed
+		case vmv1.RestartPolicyNever:
+			shouldRestart = false
+		}
+
+		// TODO for RestartPolicyNever: implement TTL or do nothing
+		if shouldRestart {
+			log.Info("Restarting VM runner pod", "VM.Phase", vm.Status.Phase, "RestartPolicy", vm.Spec.RestartPolicy)
+			vm.Status.Phase = vmv1.VmPending // reset to trigger restart
+			vm.Status.RestartCount += 1      // increment restart count
+			r.Metrics.vmRestartCounts.Inc()
+		}

-		// TODO for RestartPolicyNever: implement TTL or do nothing

	default:
		// do nothing
	}
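
The restart decision that now runs unconditionally is a pure function of the restart policy and the VM phase. A compilable sketch of just that switch (the policy and phase types below are local stand-ins for the vmv1 ones, trimmed to what the decision needs):

package main

import "fmt"

type RestartPolicy string

const (
	RestartPolicyAlways    RestartPolicy = "Always"
	RestartPolicyOnFailure RestartPolicy = "OnFailure"
	RestartPolicyNever     RestartPolicy = "Never"
)

type Phase string

const (
	VmSucceeded Phase = "Succeeded"
	VmFailed    Phase = "Failed"
)

// shouldRestart mirrors the switch in doReconcile: always restart,
// restart only failed VMs, or never restart.
func shouldRestart(policy RestartPolicy, phase Phase) bool {
	switch policy {
	case RestartPolicyAlways:
		return true
	case RestartPolicyOnFailure:
		return phase == VmFailed
	default: // RestartPolicyNever
		return false
	}
}

func main() {
	fmt.Println(shouldRestart(RestartPolicyOnFailure, VmSucceeded)) // false
	fmt.Println(shouldRestart(RestartPolicyOnFailure, VmFailed))    // true
	fmt.Println(shouldRestart(RestartPolicyAlways, VmSucceeded))    // true
}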
@@ -953,23 +946,6 @@ func runnerStatus(pod *corev1.Pod) runnerStatusKind {
	}
}

-// runnerContainerStopped returns true iff the neonvm-runner container has exited.
-//
-// The guarantee is simple: It is only safe to start a new runner pod for a VM if
-// runnerContainerStopped returns true (otherwise, we may end up with >1 instance of the same VM).
-func runnerContainerStopped(pod *corev1.Pod) bool {
-	if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
-		return true
-	}
-
-	for _, stat := range pod.Status.ContainerStatuses {
-		if stat.Name == "neonvm-runner" {
-			return stat.State.Terminated != nil
-		}
-	}
-	return false
-}

// deleteRunnerPodIfEnabled deletes the runner pod if buildtag.NeverDeleteRunnerPods is false, and
// then emits an event and log line about what it did, whether it actually deleted the runner pod.
func (r *VMReconciler) deleteRunnerPodIfEnabled(
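
To see concretely what the removed guard checked: a pod can still be in phase Running while its neonvm-runner container has already terminated, which is the case runnerContainerStopped waited for. A self-contained example against a hand-built pod status (this uses the real k8s.io/api/core/v1 types, but the pod itself is fabricated for illustration):

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// runnerContainerStopped is the function deleted in the diff above.
func runnerContainerStopped(pod *corev1.Pod) bool {
	if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
		return true
	}
	for _, stat := range pod.Status.ContainerStatuses {
		if stat.Name == "neonvm-runner" {
			return stat.State.Terminated != nil
		}
	}
	return false
}

func main() {
	// Pod phase is still Running, but the neonvm-runner container exited.
	pod := &corev1.Pod{
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning,
			ContainerStatuses: []corev1.ContainerStatus{{
				Name: "neonvm-runner",
				State: corev1.ContainerState{
					Terminated: &corev1.ContainerStateTerminated{ExitCode: 0},
				},
			}},
		},
	}
	fmt.Println(runnerContainerStopped(pod)) // true: old code would allow a restart here
}

With the quorum-fencing argument above, waiting for this condition is no longer load-bearing, which is why the function could be dropped.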
