🐛 Fix agent node affinity settings to always prefer control plane nodes #433

Closed
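
This PR makes the CAPI import controllers patch the cattle-cluster-agent Deployment's node affinity so the agent always prefers control plane nodes, covering all three spellings of the role label (node-role.kubernetes.io/control-plane, controlplane, and master). Because Rancher rewrites the agent Deployment during provisioning, the controllers keep re-applying the affinity via server-side apply until the cluster reports Ready. The e2e Makefile additionally gains a CNI variable, passed through to the suite as -e2e.cni.
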
9 changes: 7 additions & 2 deletions Makefile
@@ -75,6 +75,7 @@ GINKGO_ARGS ?=
SKIP_RESOURCE_CLEANUP ?= false
USE_EXISTING_CLUSTER ?= false
ISOLATED_MODE ?= false
CNI ?= calico # alternatively: kindnet
GINKGO_NOCOLOR ?= false
GINKGO_LABEL_FILTER ?= short || full
GINKGO_TESTS ?= $(ROOT_DIR)/$(TEST_DIR)/e2e/suites/...
@@ -484,6 +485,9 @@ $(RELEASE_DIR):
$(CHART_RELEASE_DIR):
mkdir -p $(CHART_RELEASE_DIR)/templates

clean-chart-release-dir:
rm -rf $(CHART_RELEASE_DIR)

$(CHART_PACKAGE_DIR):
mkdir -p $(CHART_PACKAGE_DIR)

@@ -492,7 +496,7 @@ release: clean-release $(RELEASE_DIR) ## Builds and push container images using
$(MAKE) release-chart

.PHONY: build-chart
build-chart: $(HELM) $(KUSTOMIZE) $(RELEASE_DIR) $(CHART_RELEASE_DIR) $(CHART_PACKAGE_DIR) ## Builds the chart to publish with a release
build-chart: $(HELM) $(KUSTOMIZE) clean-chart-release-dir $(RELEASE_DIR) $(CHART_RELEASE_DIR) $(CHART_PACKAGE_DIR) ## Builds the chart to publish with a release
$(KUSTOMIZE) build ./config/chart > $(CHART_DIR)/templates/rancher-turtles-components.yaml
cp -rf $(CHART_DIR)/* $(CHART_RELEASE_DIR)
sed -i'' -e 's@image: .*@image: '"$(CONTROLLER_IMG)"'@' $(CHART_RELEASE_DIR)/values.yaml
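
Note: the new clean-chart-release-dir prerequisite wipes $(CHART_RELEASE_DIR) before the chart is reassembled, so files removed from config/chart can no longer linger in a previously populated release directory.
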
@@ -519,7 +523,8 @@ test-e2e: $(GINKGO) $(HELM) $(CLUSTERCTL) kubectl e2e-image ## Run the end-to-en
-e2e.chart-path=$(ROOT_DIR)/$(CHART_RELEASE_DIR) \
-e2e.skip-resource-cleanup=$(SKIP_RESOURCE_CLEANUP) \
-e2e.use-existing-cluster=$(USE_EXISTING_CLUSTER) \
-e2e.isolated-mode=$(ISOLATED_MODE)
-e2e.isolated-mode=$(ISOLATED_MODE) \
-e2e.cni=$(CNI)

.PHONY: e2e-image
e2e-image: ## Build the image for e2e tests
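
Note: the new CNI variable reaches the suite as the -e2e.cni flag. A minimal sketch of how such a flag might be registered on the Go side — the FlagValues struct and InitFlags function here are illustrative assumptions, not the actual turtles e2e wiring:

```go
package e2e

import "flag"

// FlagValues is a hypothetical holder for suite-level options.
type FlagValues struct {
	// CNI selects the CNI provider installed into workload clusters,
	// matching the Makefile's CNI variable (calico or kindnet).
	CNI string
}

// InitFlags registers the suite's flags; call it before flag.Parse.
func InitFlags(values *FlagValues) {
	flag.StringVar(&values.CNI, "e2e.cni", "calico",
		"CNI provider to install in test clusters")
}
```
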
70 changes: 63 additions & 7 deletions internal/controllers/helpers.go
@@ -18,6 +18,8 @@ package controllers

import (
"bufio"
"bytes"
"cmp"
"context"
"crypto/tls"
"errors"
@@ -26,9 +28,12 @@ import (
"net/http"
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
yamlDecoder "k8s.io/apimachinery/pkg/util/yaml"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -50,6 +55,7 @@ const (
capiClusterOwner = "cluster-api.cattle.io/capi-cluster-owner"
capiClusterOwnerNamespace = "cluster-api.cattle.io/capi-cluster-owner-ns"

deploymentKind = "Deployment"
defaultRequeueDuration = 1 * time.Minute
)

@@ -178,21 +184,71 @@ func createImportManifest(ctx context.Context, remoteClient client.Client, in io
return nil
}

func createRawManifest(ctx context.Context, remoteClient client.Client, bytes []byte) error {
items, err := utilyaml.ToUnstructured(bytes)
if err != nil {
return fmt.Errorf("error unmarshalling bytes or empty object passed: %w", err)
}
func createRawManifest(ctx context.Context, remoteClient client.Client, data []byte) error {
log := log.FromContext(ctx)
decoder := utilyaml.NewYAMLDecoder(io.NopCloser(bytes.NewReader(data)))

for _, obj := range items {
if err := createObject(ctx, remoteClient, obj.DeepCopy()); err != nil {
for {
u := &unstructured.Unstructured{}

_, gvk, err := decoder.Decode(nil, u)
if errors.Is(err, io.EOF) {
break
} else if err != nil {
return err
}

if gvk.Kind == deploymentKind {
deploy := &appsv1.Deployment{}
if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, deploy); err != nil {
log.Error(err, "failed to decode agent deployment")
return err
}

setDeploymentAffinity(deploy)

if err := createObject(ctx, remoteClient, deploy); err != nil {
return err
}

continue
}

if err := createObject(ctx, remoteClient, u.DeepCopy()); err != nil {
return err
}
}

return nil
}
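
Note: createRawManifest now streams the manifest document by document instead of unmarshalling everything up front with utilyaml.ToUnstructured, which is what lets it intercept the agent Deployment before creation. A self-contained sketch of the same streaming pattern, using apimachinery's NewYAMLOrJSONDecoder as a stand-in for the cluster-api utilyaml helper the PR uses:

```go
package main

import (
	"bytes"
	"errors"
	"fmt"
	"io"

	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	yamlutil "k8s.io/apimachinery/pkg/util/yaml"
)

func main() {
	manifest := []byte("apiVersion: v1\nkind: Namespace\nmetadata:\n  name: a\n---\napiVersion: apps/v1\nkind: Deployment\nmetadata:\n  name: b\n")
	decoder := yamlutil.NewYAMLOrJSONDecoder(bytes.NewReader(manifest), 4096)

	for {
		u := &unstructured.Unstructured{}
		if err := decoder.Decode(u); err != nil {
			if errors.Is(err, io.EOF) {
				break // end of the multi-document stream
			}
			panic(err)
		}
		// Each document surfaces with its kind intact, so special-casing
		// Deployments (as createRawManifest does) is a simple kind check.
		fmt.Println(u.GetKind(), u.GetName())
	}
}
```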

func setDeploymentAffinity(deploy *appsv1.Deployment) {
affinity := cmp.Or(deploy.Spec.Template.Spec.Affinity, &corev1.Affinity{})
nodeAffinity := cmp.Or(affinity.NodeAffinity, &corev1.NodeAffinity{})
preference := corev1.PreferredSchedulingTerm{
Weight: 100,
Preference: corev1.NodeSelectorTerm{
MatchExpressions: []corev1.NodeSelectorRequirement{{
Key: "node-role.kubernetes.io/control-plane",
Operator: corev1.NodeSelectorOpExists,
}},
},
}
nodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution = append(nodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution, preference)
preference.Preference.MatchExpressions = []corev1.NodeSelectorRequirement{{
Key: "node-role.kubernetes.io/controlplane",
Operator: corev1.NodeSelectorOpExists,
}}
nodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution = append(nodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution, preference)
preference.Preference.MatchExpressions = []corev1.NodeSelectorRequirement{{
Key: "node-role.kubernetes.io/master",
Operator: corev1.NodeSelectorOpExists,
}}
nodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution = append(nodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution, preference)
affinity.NodeAffinity = nodeAffinity
deploy.Spec.Template.Spec.Affinity = affinity
}
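
Note: setDeploymentAffinity relies on two details worth calling out. cmp.Or (Go 1.22) returns the first of its arguments that is non-zero, replacing explicit nil checks; and append stores a copy of the PreferredSchedulingTerm value, so mutating preference between appends produces three independent terms — one per historical spelling of the control plane role label. A standalone sketch of the append pattern:

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

func main() {
	var terms []corev1.PreferredSchedulingTerm
	pref := corev1.PreferredSchedulingTerm{Weight: 100}
	for _, key := range []string{
		"node-role.kubernetes.io/control-plane", // upstream Kubernetes
		"node-role.kubernetes.io/controlplane",  // RKE
		"node-role.kubernetes.io/master",        // legacy clusters
	} {
		pref.Preference.MatchExpressions = []corev1.NodeSelectorRequirement{{
			Key:      key,
			Operator: corev1.NodeSelectorOpExists,
		}}
		// append stores a copy of pref, so the next loop iteration cannot
		// alias the term appended here.
		terms = append(terms, pref)
	}
	fmt.Println(len(terms)) // 3 independent preferred terms
}
```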

func createObject(ctx context.Context, c client.Client, obj client.Object) error {
log := log.FromContext(ctx)
gvk := obj.GetObjectKind().GroupVersionKind()
52 changes: 45 additions & 7 deletions internal/controllers/import_controller.go
@@ -20,7 +20,9 @@ import (
"context"
"fmt"
"strings"
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -49,6 +51,8 @@ import (
turtlespredicates "github.com/rancher/turtles/util/predicates"
)

const fieldOwner = "rancher-turtles"

// CAPIImportReconciler represents a reconciler for importing CAPI clusters in Rancher.
type CAPIImportReconciler struct {
Client client.Client
@@ -241,11 +245,50 @@ func (r *CAPIImportReconciler) reconcileNormal(ctx context.Context, capiCluster

log.Info("found cluster name", "name", rancherCluster.Status.ClusterName)

if rancherCluster.Status.AgentDeployed {
log.Info("agent already deployed, no action needed")
if rancherCluster.Status.Ready {
log.Info("cluster is ready, no action needed")
return ctrl.Result{}, nil
}

// We have to ensure the agent deployment has correct nodeAffinity settings at all times
remoteClient, err := r.remoteClientGetter(ctx, capiCluster.Name, r.Client, client.ObjectKeyFromObject(capiCluster))
if err != nil {
return ctrl.Result{}, fmt.Errorf("getting remote cluster client: %w", err)
}

if rancherCluster.Status.AgentDeployed {
log.Info("updating agent node affinity settings")

agent := &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{
Name: "cattle-cluster-agent",
Namespace: "cattle-system",
}}

if err := remoteClient.Get(ctx, client.ObjectKeyFromObject(agent), agent); err != nil {
log.Error(err, "unable to get existing agent deployment")
return ctrl.Result{}, err
}

setDeploymentAffinity(agent)
agent.SetManagedFields(nil)
agent.TypeMeta = metav1.TypeMeta{
APIVersion: "apps/v1",
Kind: deploymentKind,
}

if err := remoteClient.Patch(ctx, agent, client.Apply, []client.PatchOption{
client.ForceOwnership,
client.FieldOwner(fieldOwner),
}...); err != nil {
log.Error(err, "unable to update existing agent deployment")
return ctrl.Result{}, err
}

// During provisioning after registration, the initial deployment gets
// updated by Rancher. We must not miss that update.
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
}

// get the registration manifest
manifest, err := getClusterRegistrationManifest(ctx, rancherCluster.Status.ClusterName, capiCluster.Namespace, r.RancherClient, r.InsecureSkipVerify)
if err != nil {
@@ -259,11 +302,6 @@ func (r *CAPIImportReconciler) reconcileNormal(ctx context.Context, capiCluster

log.Info("Creating import manifest")

remoteClient, err := r.remoteClientGetter(ctx, capiCluster.Name, r.Client, client.ObjectKeyFromObject(capiCluster))
if err != nil {
return ctrl.Result{}, fmt.Errorf("getting remote cluster client: %w", err)
}

if err := createImportManifest(ctx, remoteClient, strings.NewReader(manifest)); err != nil {
return ctrl.Result{}, fmt.Errorf("creating import manifest: %w", err)
}
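
Note on the agent patch in reconcileNormal above: it uses server-side apply, so the Deployment fetched with Get must have its managedFields cleared and its TypeMeta repopulated before patching, and ForceOwnership lets the rancher-turtles field owner take over fields last written by Rancher's controller. A generic helper capturing the pattern — applyOwned is illustrative, not part of the PR:

```go
package controllers

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/client"
)

// applyOwned is an illustrative helper: it server-side applies a typed object
// previously fetched with Get. The caller must have repopulated obj's
// TypeMeta (apiVersion/kind), as reconcileNormal does, because apply patches
// are rejected without them.
func applyOwned(ctx context.Context, c client.Client, obj client.Object, owner string) error {
	// Objects read back from the API server carry managedFields; an apply
	// patch must not include them.
	obj.SetManagedFields(nil)
	// ForceOwnership resolves conflicts by moving contested fields from
	// their current manager (here, Rancher's controller) to the given owner.
	return c.Patch(ctx, obj, client.Apply, client.ForceOwnership, client.FieldOwner(owner))
}
```
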
55 changes: 50 additions & 5 deletions internal/controllers/import_controller_test.go
@@ -25,11 +25,12 @@ import (

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/rancher/turtles/internal/controllers/testdata"
managementv3 "github.com/rancher/turtles/internal/rancher/management/v3"
provisioningv1 "github.com/rancher/turtles/internal/rancher/provisioning/v1"
"github.com/rancher/turtles/internal/test"
turtlesnaming "github.com/rancher/turtles/util/naming"
"github.com/rancher-sandbox/rancher-turtles/internal/controllers/testdata"
managementv3 "github.com/rancher-sandbox/rancher-turtles/internal/rancher/management/v3"
provisioningv1 "github.com/rancher-sandbox/rancher-turtles/internal/rancher/provisioning/v1"
"github.com/rancher-sandbox/rancher-turtles/internal/test"
turtlesnaming "github.com/rancher-sandbox/rancher-turtles/util/naming"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
@@ -225,6 +226,50 @@ var _ = Describe("reconcile CAPI Cluster", func() {
unstructuredObj.SetUnstructuredContent(u)
unstructuredObj.SetGroupVersionKind(obj.GetObjectKind().GroupVersionKind())

if unstructuredObj.GroupVersionKind().Kind == "Deployment" {
dep := &appsv1.Deployment{}
g.Eventually(testEnv.GetAs(unstructuredObj, dep)).ShouldNot(BeNil())
affinity := dep.Spec.Template.Spec.Affinity.NodeAffinity
g.Expect(affinity).ToNot(BeNil())
g.Expect(affinity.PreferredDuringSchedulingIgnoredDuringExecution).To(
ContainElement(
HaveField(
"Preference.MatchExpressions",
HaveExactElements(corev1.NodeSelectorRequirement{
Key: "node-role.kubernetes.io/control-plane",
Operator: corev1.NodeSelectorOpExists,
})),
))
g.Expect(affinity.PreferredDuringSchedulingIgnoredDuringExecution).To(
ContainElement(
HaveField(
"Preference.MatchExpressions",
HaveExactElements(corev1.NodeSelectorRequirement{
Key: "node-role.kubernetes.io/controlplane",
Operator: corev1.NodeSelectorOpExists,
})),
))
g.Expect(affinity.PreferredDuringSchedulingIgnoredDuringExecution).To(
ContainElement(
HaveField(
"Preference.MatchExpressions",
HaveExactElements(corev1.NodeSelectorRequirement{
Key: "node-role.kubernetes.io/master",
Operator: corev1.NodeSelectorOpExists,
})),
))
g.Expect(affinity.PreferredDuringSchedulingIgnoredDuringExecution).To(
ContainElement(
HaveField(
"Preference.MatchExpressions",
HaveExactElements(corev1.NodeSelectorRequirement{
Key: "node-role.kubernetes.io/master",
Operator: corev1.NodeSelectorOpIn,
Values: []string{"true"},
})),
))
}

g.Expect(cl.Get(ctx, client.ObjectKey{
Namespace: unstructuredObj.GetNamespace(),
Name: unstructuredObj.GetName(),
50 changes: 43 additions & 7 deletions internal/controllers/import_controller_v3.go
@@ -20,7 +20,9 @@ import (
"context"
"fmt"
"strings"
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -274,11 +276,50 @@ func (r *CAPIImportManagementV3Reconciler) reconcileNormal(ctx context.Context,
return ctrl.Result{}, err
}

if conditions.IsTrue(rancherCluster, managementv3.ClusterConditionAgentDeployed) {
log.Info("agent already deployed, no action needed")
if conditions.IsTrue(rancherCluster, managementv3.ClusterConditionReady) {
log.Info("cluster is ready, no action needed")
return ctrl.Result{}, nil
}

// We have to ensure the agent deployment has correct nodeAffinity settings at all times
remoteClient, err := r.remoteClientGetter(ctx, capiCluster.Name, r.Client, client.ObjectKeyFromObject(capiCluster))
if err != nil {
return ctrl.Result{}, fmt.Errorf("getting remote cluster client: %w", err)
}

if conditions.IsTrue(rancherCluster, managementv3.ClusterConditionAgentDeployed) {
log.Info("updating agent node affinity settings")

agent := &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{
Name: "cattle-cluster-agent",
Namespace: "cattle-system",
}}

if err := remoteClient.Get(ctx, client.ObjectKeyFromObject(agent), agent); err != nil {
log.Error(err, "unable to get existing agent deployment")
return ctrl.Result{}, err
}

setDeploymentAffinity(agent)
agent.SetManagedFields(nil)
agent.TypeMeta = metav1.TypeMeta{
APIVersion: "apps/v1",
Kind: deploymentKind,
}

if err := remoteClient.Patch(ctx, agent, client.Apply, []client.PatchOption{
client.ForceOwnership,
client.FieldOwner(fieldOwner),
}...); err != nil {
log.Error(err, "unable to update existing agent deployment")
return ctrl.Result{}, err
}

// During provisioning after registration, the initial deployment gets
// updated by Rancher. We must not miss that update.
return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
}

// get the registration manifest
manifest, err := getClusterRegistrationManifest(ctx, rancherCluster.Name, rancherCluster.Name, r.RancherClient, r.InsecureSkipVerify)
if err != nil {
@@ -292,11 +333,6 @@ func (r *CAPIImportManagementV3Reconciler) reconcileNormal(ctx context.Context,

log.Info("Creating import manifest")

remoteClient, err := r.remoteClientGetter(ctx, capiCluster.Name, r.Client, client.ObjectKeyFromObject(capiCluster))
if err != nil {
return ctrl.Result{}, fmt.Errorf("getting remote cluster client: %w", err)
}

if err := createImportManifest(ctx, remoteClient, strings.NewReader(manifest)); err != nil {
return ctrl.Result{}, fmt.Errorf("creating import manifest: %w", err)
}
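
Note: CAPIImportManagementV3Reconciler mirrors the v1 import flow above; the only difference is that readiness and agent deployment are read from managementv3 cluster conditions (ClusterConditionReady, ClusterConditionAgentDeployed) rather than from the provisioning cluster's status fields.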