Activate Early backoff functionality #253

Merged
24 commits
58207e8
created cloudprovider.instance with proper state and status
himanshu-kun Sep 6, 2023
7947c71
disabled retry of RunOnce on failure to remove createErrorNodes
himanshu-kun Sep 6, 2023
fb59c0d
added basic test case for Nodes() method
himanshu-kun Sep 14, 2023
20de950
lint changes
himanshu-kun Sep 15, 2023
c9774a5
updated log level for scale down status log
himanshu-kun Sep 20, 2023
1939715
copied function as not exported in MCM vendored version
himanshu-kun Sep 27, 2023
71afea3
added vendored files
himanshu-kun Sep 27, 2023
1552856
docs for early backoff added
himanshu-kun Sep 27, 2023
296ffee
Apply direct suggestions from code review
himanshu-kun Sep 28, 2023
660b92a
other comments addressed
himanshu-kun Sep 28, 2023
81ce151
fixing spelling checks, excluding balancer component from lint tests
himanshu-kun Sep 28, 2023
5220abd
fix excluding balancer component change
himanshu-kun Sep 28, 2023
6e84e30
ignored files correctly from gofmt and golint checks
himanshu-kun Sep 28, 2023
6152217
fixed gofmt failure
himanshu-kun Sep 28, 2023
e0e4c38
reset gofmt changes
himanshu-kun Sep 28, 2023
664a53b
reset golint changes
himanshu-kun Sep 28, 2023
01b3f85
Revert "disabled retry of RunOnce on failure to remove createErrorNodes"
himanshu-kun Sep 28, 2023
bdd7816
added corner case in docs
himanshu-kun Sep 29, 2023
d48543b
Update cluster-autoscaler/FAQ.md
himanshu-kun Oct 3, 2023
0309ec8
update recreate advice in docs
himanshu-kun Oct 3, 2023
c295f7c
addressed review comments
himanshu-kun Oct 3, 2023
3260bbf
changes made to unit tests
himanshu-kun Oct 3, 2023
73f035d
removed findCodeAndMessage
himanshu-kun Oct 3, 2023
6b1d4fe
updated doc
himanshu-kun Oct 3, 2023
52 changes: 50 additions & 2 deletions cluster-autoscaler/FAQ.md
@@ -67,8 +67,11 @@ this document:
* [How can I update CA dependencies (particularly k8s.io/kubernetes)?](#how-can-i-update-ca-dependencies-particularly-k8siokubernetes)

* [In the context of Gardener](#in-the-context-of-gardener)
* [How do I sync gardener autoscaler with an upstream autoscaler minor release?](#how-do-i-sync-gardener-autoscaler-with-an-upstream-autoscaler-minor-release)
* [How do I revendor a different version of MCM in autoscaler?](#how-do-i-revendor-a-different-version-of-mcm-in-autoscaler)
* [For User](#for-user)
* [When does autoscaler back off early from a node group?](#when-does-autoscaler-back-off-early-from-a-node-group)
* [For Developer](#for-developer)
* [How do I sync gardener autoscaler with an upstream autoscaler minor release?](#how-do-i-sync-gardener-autoscaler-with-an-upstream-autoscaler-minor-release)
* [How do I revendor a different version of MCM in autoscaler?](#how-do-i-revendor-a-different-version-of-mcm-in-autoscaler)
<!--- TOC END -->

# Basics
@@ -1087,6 +1090,51 @@ Caveats:

# In the context of Gardener:

## For User
### When does autoscaler back off early from a node group?

Autoscaler backs off from a node group if a requested scale-up doesn't succeed. Autoscaler decides to back off based on:
- Timeout
  - if the node doesn't join within `max-node-provision-time`
- `ResourceExhausted` error
  - if the node doesn't join due to an error on the cloud-provider side, and the error is classified as `ResourceExhausted`
- Scale-up operation fails for a node group

As the name suggests, early backoff doesn't wait until the timeout but backs off as soon as a qualifying condition is satisfied. This helps in trying other node groups quickly.

Currently, early backoff is enabled only for `ResourceExhausted` errors. Errors classified as `ResourceExhausted` include (but are not limited to):
- `out of quota` errors, where the customer quota is exhausted and the quota is configurable per zone (not per region). Generally quotas for VMs, CPUs, GPUs and disks are configurable per zone, but please confirm this for your cloud provider.
- `out of stock` errors, where the cloud provider doesn't have enough capacity in the particular zone, but the resource is available in other zones.
- `not-supported` errors, where the instance type or disk type is not supported in the particular zone.

Errors not classified as `ResourceExhausted` include (but are not limited to):
- `invalid credentials`
- `rate limiting`
- `policy constraints defined by customer`
- `service-unavailable` on the cloud-provider side

For errors other than `ResourceExhausted`, backoff happens only after the timeout.

*NOTE:* The exact identifier for an error might differ per cloud provider. The errors listed above are generic names.
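
Below is a minimal, illustrative Go sketch of how such a classification can be surfaced to the core autoscaler: a machine whose creation failed with a `ResourceExhausted` error code is reported as a `cloudprovider.Instance` carrying an `OutOfResourcesErrorClass` error, which is what triggers the early backoff. The helper name `instanceForMachine` and its signature are assumptions for illustration, not the exact code of the MCM cloud provider.

```go
package mcm

import (
	"fmt"

	"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
	machinecodes "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
)

// instanceForMachine maps one MCM Machine object to the cloudprovider.Instance
// reported by the node group's Nodes() method. nodeProviderID is non-empty only
// once the backing node has registered.
func instanceForMachine(machine *v1alpha1.Machine, nodeProviderID string) cloudprovider.Instance {
	if nodeProviderID != "" {
		// Node already registered: report its real provider ID, no extra status needed.
		return cloudprovider.Instance{Id: nodeProviderID}
	}

	// VM/node not there yet: report a placeholder ID in the Creating state.
	instance := cloudprovider.Instance{
		Id:     fmt.Sprintf("requested://%s", machine.Name),
		Status: &cloudprovider.InstanceStatus{State: cloudprovider.InstanceCreating},
	}

	lastOp := machine.Status.LastOperation
	if lastOp.Type == v1alpha1.MachineOperationCreate &&
		lastOp.State == v1alpha1.MachineStateFailed &&
		lastOp.ErrorCode == machinecodes.ResourceExhausted.String() {
		// Only ResourceExhausted creation failures are classified as out-of-resources,
		// letting the core autoscaler back off from this node group before
		// max-node-provision-time expires.
		instance.Status.ErrorInfo = &cloudprovider.InstanceErrorInfo{
			ErrorClass:   cloudprovider.OutOfResourcesErrorClass,
			ErrorCode:    machinecodes.ResourceExhausted.String(),
			ErrorMessage: lastOp.Description,
		}
	}
	return instance
}
```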

**--Caveat during rolling update--**

Case:

- if node group `ng-A` is under rolling update, AND
- if a scale-up happens for `ng-A` due to an unschedulable pod `podA` (or a set of pods), AND
- if the node (say `node1`) couldn't join due to `ResourceExhausted`,

then autoscaler will back off early and try to remove the node, but the removal won't succeed because CA is currently not allowed to perform any scale-down/delete-node operation on a node group that is under rolling update.

In the above scenario, CA won't try to scale up any other node group for `podA`, as it still considers `node1` a possible candidate to join (`ResourceExhausted` errors are recoverable). Scale-up would still work for any other new pods which can't fit on the upcoming `node1`.

If you are sure that the capacity won't recover soon, then kindly re-create `podA`. CA will then see it as a new pod and allow a scale-up.

Refer to issue https://github.com/gardener/autoscaler/issues/154 to track the changes made for early-backoff enablement.

## For Developer
### How do I sync gardener autoscaler with an upstream autoscaler minor release?

This is helpful in order to offer Gardener CA with the latest or a recent K8s version. Note that this may also require upgrading the K8s version used by Machine Controller Manager.
17 changes: 6 additions & 11 deletions cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go
@@ -24,13 +24,14 @@ package mcm
import (
"context"
"fmt"
"strings"

apiv1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"strings"

"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/config"
@@ -237,7 +238,7 @@ func ReferenceFromProviderID(m *McmManager, id string) (*Ref, error) {
for _, machine := range machines {
machineID := strings.Split(machine.Spec.ProviderID, "/")
nodeID := strings.Split(id, "/")
// If registered, the ID will match the AWS instance ID.
// If registered, the ID will match the cloudprovider instance ID.
// If unregistered, the ID will match the machine name.
if machineID[len(machineID)-1] == nodeID[len(nodeID)-1] ||
nodeID[len(nodeID)-1] == machine.Name {
@@ -371,7 +372,7 @@ func (machinedeployment *MachineDeployment) Belongs(node *apiv1.Node) (bool, err
}

// DeleteNodes deletes the nodes from the group. It is expected that this method will not be called
// for nodes not part of ANY machine deployment.
// for nodes which are not part of ANY machine deployment.
func (machinedeployment *MachineDeployment) DeleteNodes(nodes []*apiv1.Node) error {
size, err := machinedeployment.mcmManager.GetMachineDeploymentSize(machinedeployment)
if err != nil {
@@ -409,17 +410,11 @@ func (machinedeployment *MachineDeployment) Debug() string {

// Nodes returns a list of all nodes that belong to this node group.
func (machinedeployment *MachineDeployment) Nodes() ([]cloudprovider.Instance, error) {
nodeProviderIDs, err := machinedeployment.mcmManager.GetMachineDeploymentNodes(machinedeployment)
instances, err := machinedeployment.mcmManager.GetInstancesForMachineDeployment(machinedeployment)
if err != nil {
return nil, fmt.Errorf("failed to get the nodes backed by the machinedeployment %q, error: %v", machinedeployment.Name, err)
return nil, fmt.Errorf("failed to get the cloudprovider.Instance for machines backed by the machinedeployment %q, error: %v", machinedeployment.Name, err)
}

instances := make([]cloudprovider.Instance, len(nodeProviderIDs))
for i := range nodeProviderIDs {
instances[i] = cloudprovider.Instance{
Id: nodeProviderIDs[i],
}
}
return instances, nil
}
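
// NOTE (illustrative, not shown in this excerpt): GetInstancesForMachineDeployment is a
// method on the McmManager. Conceptually it maps every Machine owned by the
// MachineDeployment to a cloudprovider.Instance, roughly:
//
//	machine with a registered node -> Instance{Id: <node provider ID>}
//	machine without a node yet     -> Instance{Id: "requested://<machine-name>",
//	                                           Status: Creating, with ErrorInfo set only
//	                                           for ResourceExhausted create failures}
//
// The structure above is an assumption for illustration; see the new TestNodes cases in
// mcm_cloud_provider_test.go below for the authoritative expected mapping.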

136 changes: 132 additions & 4 deletions cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go
@@ -20,14 +20,19 @@ import (
"context"
"errors"
"fmt"
"math"
"strings"
"testing"

machinecodes "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
customfake "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/mcm/fakeclient"

"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
customfake "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/mcm/fakeclient"
"math"
"testing"
)

const (
@@ -375,7 +380,7 @@ func TestRefresh(t *testing.T) {
nodeGroups: []string{nodeGroup2},
},
expect{
machines: []*v1alpha1.Machine{newMachine("machine-1", "fakeID", nil, "machinedeployment-1", "machineset-1", "1", false)},
machines: []*v1alpha1.Machine{newMachine("machine-1", "fakeID-1", nil, "machinedeployment-1", "machineset-1", "1", false, true)},
err: errors.Join(fmt.Errorf("could not reset priority annotation on machine machine-1, Error: %v", mcUpdateErrorMsg)),
},
},
@@ -414,3 +419,126 @@
})
}
}

// The different possible cases and the expected cloudprovider.Instance returned for each:
// (mobj, mobjPid, nodeobj) -> instance(nodeobj.pid,_)
// (mobj, mobjPid, _) -> instance("requested://<machine-name>",status{'creating'})
// (mobj, _,_) -> instance("requested://<machine-name>",status{'creating'})
// (mobj, _,_) with quota error -> instance("requested://<machine-name>",status{'creating',{'outofResourcesClass','ResourceExhausted','<message>'}})
// (mobj, _,_) with invalid credentials error -> instance("requested://<machine-name>",status{'creating'})

// Example machine.status.lastOperation for a `ResourceExhausted` error
//
// lastOperation: {
// type: Creating
// state: Failed
// errorCode: ResourceExhausted
// description: "Cloud provider message - machine codes error: code = [ResourceExhausted] message = [Create machine "shoot--ddci--cbc-sys-tests03-pool-c32m256-3b-z1-575b9-hlvj6" failed: The following errors occurred: [{QUOTA_EXCEEDED Quota 'N2_CPUS' exceeded. Limit: 6000.0 in region europe-west3. [] []}]]."
// }
func TestNodes(t *testing.T) {
const (
outOfQuotaMachineStatusErrorDescription = "Cloud provider message - machine codes error: code = [ResourceExhausted] message = [Create machine \"machine-with-vm-create-error-out-of-quota\" failed: The following errors occurred: [{QUOTA_EXCEEDED Quota 'N2_CPUS' exceeded. Limit: 6000.0 in region europe-west3. [] []}]]"
outOfQuotaInstanceErrorMessage = "Create machine \"machine-with-vm-create-error-out-of-quota\" failed: The following errors occurred: [{QUOTA_EXCEEDED Quota 'N2_CPUS' exceeded. Limit: 6000.0 in region europe-west3. [] []}]"
invalidCredentialsMachineStatusErrorDescription = "Cloud provider message - machine codes error: code = [Internal] message = [user is not authorized to perform this action]"
)
type expectationPerInstance struct {
providerID string
instanceState cloudprovider.InstanceState
instanceErrorClass cloudprovider.InstanceErrorClass
instanceErrorCode string
instanceErrorMessage string
}
type expect struct {
expectationPerInstanceList []expectationPerInstance
}
type data struct {
name string
setup setup
expect expect
}
table := []data{
{
"Correct instances should be returned for machine objects under the machinedeployment",
setup{
nodes: []*corev1.Node{newNode("node-1", "fakeID-1", false)},
machines: func() []*v1alpha1.Machine {
allMachines := make([]*v1alpha1.Machine, 0, 5)
allMachines = append(allMachines, newMachine("machine-with-registered-node", "fakeID-1", nil, "machinedeployment-1", "", "", false, true))
allMachines = append(allMachines, newMachine("machine-with-vm-but-no-node", "fakeID-2", nil, "machinedeployment-1", "", "", false, false))
allMachines = append(allMachines, newMachine("machine-with-vm-creating", "", nil, "machinedeployment-1", "", "", false, false))
allMachines = append(allMachines, newMachine("machine-with-vm-create-error-out-of-quota", "", &v1alpha1.MachineStatus{LastOperation: v1alpha1.LastOperation{Type: v1alpha1.MachineOperationCreate, State: v1alpha1.MachineStateFailed, ErrorCode: machinecodes.ResourceExhausted.String(), Description: outOfQuotaMachineStatusErrorDescription}}, "machinedeployment-1", "", "", false, false))
allMachines = append(allMachines, newMachine("machine-with-vm-create-error-invalid-credentials", "", &v1alpha1.MachineStatus{LastOperation: v1alpha1.LastOperation{Type: v1alpha1.MachineOperationCreate, State: v1alpha1.MachineStateFailed, ErrorCode: machinecodes.Internal.String(), Description: invalidCredentialsMachineStatusErrorDescription}}, "machinedeployment-1", "", "", false, false))
return allMachines
}(),
machineDeployments: newMachineDeployments(1, 2, nil, nil, nil),
nodeGroups: []string{nodeGroup1},
},
expect{
expectationPerInstanceList: []expectationPerInstance{
{"fakeID-1", cloudprovider.InstanceState(-1), cloudprovider.InstanceErrorClass(-1), "", ""},
{placeholderInstanceIDForMachineObj("machine-with-vm-but-no-node"), cloudprovider.InstanceCreating, cloudprovider.InstanceErrorClass(-1), "", ""},
{placeholderInstanceIDForMachineObj("machine-with-vm-creating"), cloudprovider.InstanceCreating, cloudprovider.InstanceErrorClass(-1), "", ""},
{placeholderInstanceIDForMachineObj("machine-with-vm-create-error-out-of-quota"), cloudprovider.InstanceCreating, cloudprovider.OutOfResourcesErrorClass, machinecodes.ResourceExhausted.String(), outOfQuotaInstanceErrorMessage},
// invalid credentials error is mapped to Internal code as it can't be fixed by trying another zone
{placeholderInstanceIDForMachineObj("machine-with-vm-create-error-invalid-credentials"), cloudprovider.InstanceCreating, cloudprovider.InstanceErrorClass(-1), "", ""},
},
},
},
}

for _, entry := range table {
entry := entry // have a shallow copy of the entry for parallelization of tests
t.Run(entry.name, func(t *testing.T) {
t.Parallel()
g := NewWithT(t)
stop := make(chan struct{})
defer close(stop)
controlMachineObjects, targetCoreObjects := setupEnv(&entry.setup)
m, trackers, hasSyncedCacheFns := createMcmManager(t, stop, testNamespace, nil, controlMachineObjects, targetCoreObjects)
defer trackers.Stop()
waitForCacheSync(t, stop, hasSyncedCacheFns)

if entry.setup.targetCoreFakeResourceActions != nil {
trackers.TargetCore.SetFailAtFakeResourceActions(entry.setup.targetCoreFakeResourceActions)
}
if entry.setup.controlMachineFakeResourceActions != nil {
trackers.ControlMachine.SetFailAtFakeResourceActions(entry.setup.controlMachineFakeResourceActions)
}

md, err := buildMachineDeploymentFromSpec(entry.setup.nodeGroups[0], m)
g.Expect(err).To(BeNil())

returnedInstances, err := md.Nodes()
g.Expect(err).To(BeNil())
g.Expect(len(returnedInstances)).To(BeNumerically("==", len(entry.expect.expectationPerInstanceList)))

for _, expectedInstance := range entry.expect.expectationPerInstanceList {
found := false
for _, gotInstance := range returnedInstances {
g.Expect(gotInstance.Id).ToNot(BeEmpty())
if expectedInstance.providerID == gotInstance.Id {
if !strings.Contains(gotInstance.Id, "requested://") {
// must be a machine obj whose node is registered (ready or notReady)
g.Expect(gotInstance.Status).To(BeNil())
} else {
if int(expectedInstance.instanceState) != -1 {
g.Expect(gotInstance.Status).ToNot(BeNil())
g.Expect(gotInstance.Status.State).To(Equal(expectedInstance.instanceState))
}
if int(expectedInstance.instanceErrorClass) != -1 || expectedInstance.instanceErrorCode != "" || expectedInstance.instanceErrorMessage != "" {
g.Expect(gotInstance.Status.ErrorInfo).ToNot(BeNil())
g.Expect(gotInstance.Status.ErrorInfo.ErrorClass).To(Equal(expectedInstance.instanceErrorClass))
g.Expect(gotInstance.Status.ErrorInfo.ErrorCode).To(Equal(expectedInstance.instanceErrorCode))
g.Expect(gotInstance.Status.ErrorInfo.ErrorMessage).To(Equal(expectedInstance.instanceErrorMessage))
}
}
found = true
break
}
}
g.Expect(found).To(BeTrue())
}
})
}
}