diff --git a/neonvm-controller/role.yaml b/neonvm-controller/role.yaml index b6d6d3f3f..b30fbe477 100644 --- a/neonvm-controller/role.yaml +++ b/neonvm-controller/role.yaml @@ -11,6 +11,14 @@ rules: verbs: - create - patch +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch - apiGroups: - "" resources: diff --git a/pkg/neonvm/controllers/vmmigration_controller.go b/pkg/neonvm/controllers/vmmigration_controller.go index 2a7160170..7a9efd4c1 100644 --- a/pkg/neonvm/controllers/vmmigration_controller.go +++ b/pkg/neonvm/controllers/vmmigration_controller.go @@ -172,6 +172,26 @@ func (r *VirtualMachineMigrationReconciler) Reconcile(ctx context.Context, req c if err := ctrl.SetControllerReference(vm, migration, r.Scheme); err != nil { return ctrl.Result{}, err } + // First check if VM has the affinity with CPU architecture + // if it doesn't we need to set it explicitly + // to prevent migration going to the node with different architecture + sourceNode := &corev1.Node{} + //nolint:exhaustruct // corev1.Node objects is not namespaced + err := r.Get(ctx, types.NamespacedName{Name: vm.Status.Node}, sourceNode) + if err != nil { + log.Error(err, "Failed to get source node") + return ctrl.Result{}, err + } + if !hasArchitectureAffinity(vm, sourceNode) { + + if err := addArchitectureAffinity(vm, sourceNode); err != nil { + log.Error(err, "Failed to add architecture affinity to VM", "sourceNodeStatus", sourceNode.Status) + } + if err = r.Update(ctx, vm); err != nil { + log.Error(err, "Failed to update node affinity of the source vm") + return ctrl.Result{RequeueAfter: time.Second}, err + } + } if err := r.Update(ctx, migration); err != nil { log.Info("Failed to add owner to Migration", "error", err) return ctrl.Result{}, err @@ -224,6 +244,7 @@ func (r *VirtualMachineMigrationReconciler) Reconcile(ctx context.Context, req c // NB: .Spec.EnableSSH guaranteed non-nil because the k8s API server sets the default for us. enableSSH := *vm.Spec.EnableSSH var sshSecret *corev1.Secret + if enableSSH { // We require the SSH secret to exist because we cannot unmount and // mount the new secret into the VM after the live migration. If a @@ -753,3 +774,55 @@ func (r *VirtualMachineMigrationReconciler) targetPodForVirtualMachine( return pod, nil } + +// hasArchitectureAffinity checks if the VM has the affinity with the CPU architecture of the source node +func hasArchitectureAffinity(vm *vmv1.VirtualMachine, sourceNode *corev1.Node) bool { + if vm.Spec.Affinity != nil && vm.Spec.Affinity.NodeAffinity != nil && vm.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil { + for _, term := range vm.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms { + for _, expr := range term.MatchExpressions { + if expr.Key == "kubernetes.io/arch" && expr.Operator == corev1.NodeSelectorOpIn && len(expr.Values) > 0 { + for _, value := range expr.Values { + if value == sourceNode.Status.NodeInfo.Architecture { + return true + } + } + } + } + } + } + return false +} + +func addArchitectureAffinity(vm *vmv1.VirtualMachine, sourceNode *corev1.Node) error { + if sourceNode.Status.NodeInfo.Architecture == "" { + return errors.New("source node architecture is empty") + } + if vm.Spec.Affinity == nil { + vm.Spec.Affinity = &corev1.Affinity{} + } + if vm.Spec.Affinity.NodeAffinity == nil { + vm.Spec.Affinity.NodeAffinity = &corev1.NodeAffinity{} + } + if vm.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil { + vm.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &corev1.NodeSelector{} + } + + vm.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms = append( + vm.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms, + corev1.NodeSelectorTerm{ + MatchExpressions: []corev1.NodeSelectorRequirement{ + { + Key: "kubernetes.io/arch", + Operator: corev1.NodeSelectorOpIn, + Values: []string{sourceNode.Status.NodeInfo.Architecture}, + }, + { + Key: "kubernetes.io/os", + Operator: "In", + Values: []string{"linux"}, + }, + }, + }, + ) + return nil +} diff --git a/pkg/neonvm/controllers/vmmigration_controller_test.go b/pkg/neonvm/controllers/vmmigration_controller_test.go new file mode 100644 index 000000000..598803f56 --- /dev/null +++ b/pkg/neonvm/controllers/vmmigration_controller_test.go @@ -0,0 +1,144 @@ +package controllers + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + + vmv1 "github.com/neondatabase/autoscaling/neonvm/apis/neonvm/v1" +) + +func TestHasArchitectureAffinity(t *testing.T) { + + cases := []struct { + name string + result bool + vmSpec *vmv1.VirtualMachine + node *corev1.Node + }{ + { + name: "No architecture affinity", + result: false, + vmSpec: &vmv1.VirtualMachine{}, + node: &corev1.Node{}, + }, + { + name: "VM spec has affinity", + result: true, + vmSpec: &vmv1.VirtualMachine{ + Spec: vmv1.VirtualMachineSpec{ + Affinity: &corev1.Affinity{ + NodeAffinity: &corev1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ + NodeSelectorTerms: []corev1.NodeSelectorTerm{ + { + MatchExpressions: []corev1.NodeSelectorRequirement{ + { + Key: "kubernetes.io/arch", + Operator: corev1.NodeSelectorOpIn, + Values: []string{"amd64"}, + }, + }, + }, + }, + }, + }, + }, + }, + }, + node: &corev1.Node{ + Status: corev1.NodeStatus{ + NodeInfo: corev1.NodeSystemInfo{ + Architecture: "amd64", + }, + }, + }, + }, + { + name: "VM affinity and node has different architectures", + result: false, + vmSpec: &vmv1.VirtualMachine{ + Spec: vmv1.VirtualMachineSpec{ + Affinity: &corev1.Affinity{ + NodeAffinity: &corev1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ + NodeSelectorTerms: []corev1.NodeSelectorTerm{ + { + MatchExpressions: []corev1.NodeSelectorRequirement{ + { + Key: "kubernetes.io/arch", + Operator: corev1.NodeSelectorOpIn, + Values: []string{"arm64"}, + }, + }, + }, + }, + }, + }, + }, + }, + }, + node: &corev1.Node{ + Status: corev1.NodeStatus{ + NodeInfo: corev1.NodeSystemInfo{ + Architecture: "amd64", + }, + }, + }, + }, + } + + for _, testCase := range cases { + t.Run(testCase.name, func(t *testing.T) { + result := hasArchitectureAffinity(testCase.vmSpec, testCase.node) + if result != testCase.result { + t.Errorf("Expected %v, got %v", testCase.result, result) + } + }) + } +} + +func TestAddArchitectureAffinity(t *testing.T) { + vmSpec := &vmv1.VirtualMachine{} + sourceNode := &corev1.Node{ + Status: corev1.NodeStatus{ + NodeInfo: corev1.NodeSystemInfo{ + Architecture: "amd64", + }, + }, + } + if hasArchitectureAffinity(vmSpec, sourceNode) { + t.Errorf("Expected no architecture affinity") + } + if err := addArchitectureAffinity(vmSpec, &corev1.Node{ + Status: corev1.NodeStatus{ + NodeInfo: corev1.NodeSystemInfo{ + Architecture: "amd64", + }, + }, + }); err != nil { + t.Errorf("Unexpected error: %v", err) + } + + if !hasArchitectureAffinity(vmSpec, sourceNode) { + t.Errorf("Expected architecture affinity to be added") + } +} + +func TestSourceNodeHasNoArchitecture(t *testing.T) { + sourceNode := &corev1.Node{ + Status: corev1.NodeStatus{ + NodeInfo: corev1.NodeSystemInfo{ + Architecture: "", + }, + }, + } + + vmSpec := &vmv1.VirtualMachine{} + + err := addArchitectureAffinity(vmSpec, sourceNode) + if err == nil { + t.Errorf("Expected error when source node has no architecture") + } + +} diff --git a/tests/e2e/vm-migration/00-assert.yaml b/tests/e2e/vm-migration/00-assert.yaml index f6cc18db2..daf7a2a32 100644 --- a/tests/e2e/vm-migration/00-assert.yaml +++ b/tests/e2e/vm-migration/00-assert.yaml @@ -1,6 +1,15 @@ apiVersion: kuttl.dev/v1beta1 kind: TestAssert timeout: 90 +commands: +# it is not expected to have affinity set before the migration started +- script: | + set -eux + affinity=$(kubectl -n "$NAMESPACE" get virtualmachines example -ojsonpath="{.spec.affinity}") + if [ -n "$affinity" ]; then + echo "Affinity is set" + exit 1 + fi --- apiVersion: vm.neon.tech/v1 kind: VirtualMachine diff --git a/tests/e2e/vm-migration/01-assert.yaml b/tests/e2e/vm-migration/01-assert.yaml index 9416c0d72..6ba9b2144 100644 --- a/tests/e2e/vm-migration/01-assert.yaml +++ b/tests/e2e/vm-migration/01-assert.yaml @@ -19,6 +19,21 @@ status: conditions: - type: Available status: "True" +# original spec had no affinity but it should be added during the migration to prevent the VM from being scheduled on a different architecture +spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: kubernetes.io/os + operator: In + values: + - linux --- apiVersion: v1 kind: pod