From f085dc25549cf6d9bea4d7e050ce9e422faefe67 Mon Sep 17 00:00:00 2001 From: Rishabh Patel Date: Thu, 4 Jul 2024 12:28:14 +0530 Subject: [PATCH 1/3] populate instance status state when machine lastOperation is Create --- .../cloudprovider/mcm/mcm_manager.go | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 31435342e821..78701699a9bb 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -657,7 +657,7 @@ func findMatchingInstance(nodes []*v1.Node, machine *v1alpha1.Machine) cloudprov // Report InstanceStatus only for `ResourceExhausted` errors return cloudprovider.Instance{ Id: placeholderInstanceIDForMachineObj(machine.Name), - Status: checkAndGetResourceExhaustedInstanceStatus(machine), + Status: generateInstanceStatus(machine), } } @@ -665,17 +665,20 @@ func placeholderInstanceIDForMachineObj(name string) string { return fmt.Sprintf("requested://%s", name) } -// checkAndGetResourceExhaustedInstanceStatus returns cloudprovider.InstanceStatus for the machine obj -func checkAndGetResourceExhaustedInstanceStatus(machine *v1alpha1.Machine) *cloudprovider.InstanceStatus { - if machine.Status.LastOperation.Type == v1alpha1.MachineOperationCreate && machine.Status.LastOperation.State == v1alpha1.MachineStateFailed && machine.Status.LastOperation.ErrorCode == machinecodes.ResourceExhausted.String() { - return &cloudprovider.InstanceStatus{ - State: cloudprovider.InstanceCreating, - ErrorInfo: &cloudprovider.InstanceErrorInfo{ - ErrorClass: cloudprovider.OutOfResourcesErrorClass, - ErrorCode: machinecodes.ResourceExhausted.String(), - ErrorMessage: machine.Status.LastOperation.Description, - }, +// generateInstanceStatus returns cloudprovider.InstanceStatus for the machine obj +func generateInstanceStatus(machine *v1alpha1.Machine) *cloudprovider.InstanceStatus { + if machine.Status.LastOperation.Type == v1alpha1.MachineOperationCreate { + if machine.Status.LastOperation.State == v1alpha1.MachineStateFailed && machine.Status.LastOperation.ErrorCode == machinecodes.ResourceExhausted.String() { + return &cloudprovider.InstanceStatus{ + State: cloudprovider.InstanceCreating, + ErrorInfo: &cloudprovider.InstanceErrorInfo{ + ErrorClass: cloudprovider.OutOfResourcesErrorClass, + ErrorCode: machinecodes.ResourceExhausted.String(), + ErrorMessage: machine.Status.LastOperation.Description, + }, + } } + return &cloudprovider.InstanceStatus{State: cloudprovider.InstanceCreating} } return nil } From 962e5dd3944c237ee3ee8221a51e77aa42baad4f Mon Sep 17 00:00:00 2001 From: Aaron Francis Fernandes <79958509+aaronfern@users.noreply.github.com> Date: Thu, 4 Jul 2024 13:54:44 +0530 Subject: [PATCH 2/3] Updated NodeGroupAutoscalingOptions to now include all options (#310) --- .../cloudprovider/mcm/mcm_cloud_provider.go | 24 ++++---------- .../mcm/mcm_cloud_provider_test.go | 32 +++++++++---------- 2 files changed, 23 insertions(+), 33 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 68ae50797380..5d8fc293b919 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -478,48 +478,38 @@ func (machinedeployment *MachineDeployment) Nodes() ([]cloudprovider.Instance, e // NodeGroup. Returning a nil will result in using default options. // Implementation optional. func (machinedeployment *MachineDeployment) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) { + options := defaults mcdAnnotations, err := machinedeployment.mcmManager.GetMachineDeploymentAnnotations(machinedeployment.Name) if err != nil { return nil, err } - scaleDownUtilThresholdValue := defaults.ScaleDownUtilizationThreshold if _, ok := mcdAnnotations[ScaleDownUtilizationThresholdAnnotation]; ok { if floatVal, err := strconv.ParseFloat(mcdAnnotations[ScaleDownUtilizationThresholdAnnotation], 64); err == nil { - scaleDownUtilThresholdValue = floatVal + options.ScaleDownUtilizationThreshold = floatVal } } - scaleDownGPUUtilThresholdValue := defaults.ScaleDownGpuUtilizationThreshold if _, ok := mcdAnnotations[ScaleDownGpuUtilizationThresholdAnnotation]; ok { if floatVal, err := strconv.ParseFloat(mcdAnnotations[ScaleDownGpuUtilizationThresholdAnnotation], 64); err == nil { - scaleDownGPUUtilThresholdValue = floatVal + options.ScaleDownGpuUtilizationThreshold = floatVal } } - scaleDownUnneededDuration := defaults.ScaleDownUnneededTime if _, ok := mcdAnnotations[ScaleDownUnneededTimeAnnotation]; ok { if durationVal, err := time.ParseDuration(mcdAnnotations[ScaleDownUnneededTimeAnnotation]); err == nil { - scaleDownUnneededDuration = durationVal + options.ScaleDownUnneededTime = durationVal } } - scaleDownUnreadyDuration := defaults.ScaleDownUnreadyTime if _, ok := mcdAnnotations[ScaleDownUnreadyTimeAnnotation]; ok { if durationVal, err := time.ParseDuration(mcdAnnotations[ScaleDownUnreadyTimeAnnotation]); err == nil { - scaleDownUnreadyDuration = durationVal + options.ScaleDownUnreadyTime = durationVal } } - maxNodeProvisionDuration := defaults.MaxNodeProvisionTime if _, ok := mcdAnnotations[MaxNodeProvisionTimeAnnotation]; ok { if durationVal, err := time.ParseDuration(mcdAnnotations[MaxNodeProvisionTimeAnnotation]); err == nil { - maxNodeProvisionDuration = durationVal + options.MaxNodeProvisionTime = durationVal } } - return &config.NodeGroupAutoscalingOptions{ - ScaleDownUtilizationThreshold: scaleDownUtilThresholdValue, - ScaleDownGpuUtilizationThreshold: scaleDownGPUUtilThresholdValue, - ScaleDownUnneededTime: scaleDownUnneededDuration, - ScaleDownUnreadyTime: scaleDownUnreadyDuration, - MaxNodeProvisionTime: maxNodeProvisionDuration, - }, nil + return &options, nil } // TemplateNodeInfo returns a node template for this node group. diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go index f8242a0e27cb..1ef72a061afa 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go @@ -590,6 +590,16 @@ func TestNodes(t *testing.T) { } func TestGetOptions(t *testing.T) { + ngAutoScalingOpDefaults := config.NodeGroupAutoscalingOptions{ + ScaleDownUtilizationThreshold: 0.5, + ScaleDownGpuUtilizationThreshold: 0.5, + ScaleDownUnneededTime: 1 * time.Minute, + ScaleDownUnreadyTime: 1 * time.Minute, + MaxNodeProvisionTime: 1 * time.Minute, + IgnoreDaemonSetsUtilization: true, + ZeroOrMaxNodeScaling: true, + } + type expect struct { ngOptions *config.NodeGroupAutoscalingOptions err error @@ -616,14 +626,8 @@ func TestGetOptions(t *testing.T) { nodeGroups: []string{nodeGroup1}, }, expect{ - ngOptions: &config.NodeGroupAutoscalingOptions{ - ScaleDownUtilizationThreshold: 0.5, - ScaleDownGpuUtilizationThreshold: 0.5, - ScaleDownUnneededTime: 1 * time.Minute, - ScaleDownUnreadyTime: 1 * time.Minute, - MaxNodeProvisionTime: 1 * time.Minute, - }, - err: nil, + ngOptions: &ngAutoScalingOpDefaults, + err: nil, }, }, { @@ -651,6 +655,8 @@ func TestGetOptions(t *testing.T) { ScaleDownUnneededTime: 5 * time.Minute, ScaleDownUnreadyTime: 5 * time.Minute, MaxNodeProvisionTime: 5 * time.Minute, + IgnoreDaemonSetsUtilization: ngAutoScalingOpDefaults.IgnoreDaemonSetsUtilization, + ZeroOrMaxNodeScaling: ngAutoScalingOpDefaults.ZeroOrMaxNodeScaling, }, err: nil, }, @@ -678,6 +684,8 @@ func TestGetOptions(t *testing.T) { ScaleDownUnneededTime: 5 * time.Minute, ScaleDownUnreadyTime: 1 * time.Minute, MaxNodeProvisionTime: 2 * time.Minute, + IgnoreDaemonSetsUtilization: ngAutoScalingOpDefaults.IgnoreDaemonSetsUtilization, + ZeroOrMaxNodeScaling: ngAutoScalingOpDefaults.ZeroOrMaxNodeScaling, }, err: nil, }, @@ -699,14 +707,6 @@ func TestGetOptions(t *testing.T) { md, err := buildMachineDeploymentFromSpec(entry.setup.nodeGroups[0], m) g.Expect(err).To(BeNil()) - ngAutoScalingOpDefaults := config.NodeGroupAutoscalingOptions{ - ScaleDownUtilizationThreshold: 0.5, - ScaleDownGpuUtilizationThreshold: 0.5, - ScaleDownUnneededTime: 1 * time.Minute, - ScaleDownUnreadyTime: 1 * time.Minute, - MaxNodeProvisionTime: 1 * time.Minute, - } - options, err := md.GetOptions(ngAutoScalingOpDefaults) if entry.expect.err != nil { From 2f7cf881a55092e0448c66feca2ab706e821ffab Mon Sep 17 00:00:00 2001 From: Suyash Choudhary <57896905+sssash18@users.noreply.github.com> Date: Tue, 16 Jul 2024 14:19:51 +0530 Subject: [PATCH 3/3] Added arch (#287) * Added arch * Added arch to nodetemplate * Added arch to nodetemplate --- .../cloudprovider/mcm/mcm_manager.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 78701699a9bb..ec3e348b5152 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -28,6 +28,7 @@ import ( "flag" "fmt" v1appslister "k8s.io/client-go/listers/apps/v1" + "k8s.io/utils/pointer" "math/rand" "net/http" "os" @@ -141,6 +142,7 @@ type nodeTemplate struct { InstanceType *instanceType Region string Zone string + Architecture *string Labels map[string]string Taints []apiv1.Taint } @@ -737,6 +739,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine req, _ = labels.NewRequirement(nodegroupset.LabelWorkerPool, selection.Equals, list) region string zone string + architecture *string instance instanceType machineClass = md.Spec.Template.Spec.Class nodeTemplateSpec = md.Spec.Template.Spec.NodeTemplateSpec @@ -788,6 +791,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine instance.InstanceType = nodeTemplateAttributes.InstanceType region = nodeTemplateAttributes.Region zone = nodeTemplateAttributes.Zone + architecture = nodeTemplateAttributes.Architecture break } @@ -813,6 +817,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine } region = providerSpec.Region zone = getZoneValueFromMCLabels(mc.Labels) + architecture = pointer.String(providerSpec.Tags[apiv1.LabelArchStable]) case providerAzure: var providerSpec *azureapis.AzureProviderSpec err = json.Unmarshal(mc.ProviderSpec.Raw, &providerSpec) @@ -835,6 +840,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine if providerSpec.Properties.Zone != nil { zone = providerSpec.Location + "-" + strconv.Itoa(*providerSpec.Properties.Zone) } + architecture = pointer.String(providerSpec.Tags["kubernetes.io_arch"]) default: return nil, cloudprovider.ErrNotImplemented } @@ -858,6 +864,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine Zone: zone, // will be implemented in MCM Labels: labels, Taints: taints, + Architecture: architecture, } return nodeTmpl, nil @@ -973,9 +980,13 @@ func (m *McmManager) buildNodeFromTemplate(name string, template *nodeTemplate) func buildGenericLabels(template *nodeTemplate, nodeName string) map[string]string { result := make(map[string]string) // TODO: extract from MCM - result[kubeletapis.LabelArch] = cloudprovider.DefaultArch - result[apiv1.LabelArchStable] = cloudprovider.DefaultArch - + if template.Architecture != nil { + result[kubeletapis.LabelArch] = *template.Architecture + result[apiv1.LabelArchStable] = *template.Architecture + } else { + result[kubeletapis.LabelArch] = cloudprovider.DefaultArch + result[apiv1.LabelArchStable] = cloudprovider.DefaultArch + } result[kubeletapis.LabelOS] = cloudprovider.DefaultOS result[apiv1.LabelOSStable] = cloudprovider.DefaultOS