diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go
index 68ae50797380..5d8fc293b919 100644
--- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go
+++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go
@@ -478,48 +478,40 @@ func (machinedeployment *MachineDeployment) Nodes() ([]cloudprovider.Instance, e
 // NodeGroup. Returning a nil will result in using default options.
 // Implementation optional.
 func (machinedeployment *MachineDeployment) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
+	// Start from a copy of the defaults so that fields without an annotation override keep their default value.
+	options := defaults
 	mcdAnnotations, err := machinedeployment.mcmManager.GetMachineDeploymentAnnotations(machinedeployment.Name)
 	if err != nil {
 		return nil, err
 	}
 
-	scaleDownUtilThresholdValue := defaults.ScaleDownUtilizationThreshold
-	if _, ok := mcdAnnotations[ScaleDownUtilizationThresholdAnnotation]; ok {
-		if floatVal, err := strconv.ParseFloat(mcdAnnotations[ScaleDownUtilizationThresholdAnnotation], 64); err == nil {
-			scaleDownUtilThresholdValue = floatVal
+	// An annotation overrides its default only when it parses; unparsable values are ignored and the default is kept.
+	if val, ok := mcdAnnotations[ScaleDownUtilizationThresholdAnnotation]; ok {
+		if floatVal, err := strconv.ParseFloat(val, 64); err == nil {
+			options.ScaleDownUtilizationThreshold = floatVal
 		}
 	}
-	scaleDownGPUUtilThresholdValue := defaults.ScaleDownGpuUtilizationThreshold
-	if _, ok := mcdAnnotations[ScaleDownGpuUtilizationThresholdAnnotation]; ok {
-		if floatVal, err := strconv.ParseFloat(mcdAnnotations[ScaleDownGpuUtilizationThresholdAnnotation], 64); err == nil {
-			scaleDownGPUUtilThresholdValue = floatVal
+	if val, ok := mcdAnnotations[ScaleDownGpuUtilizationThresholdAnnotation]; ok {
+		if floatVal, err := strconv.ParseFloat(val, 64); err == nil {
+			options.ScaleDownGpuUtilizationThreshold = floatVal
 		}
 	}
-	scaleDownUnneededDuration := defaults.ScaleDownUnneededTime
-	if _, ok := mcdAnnotations[ScaleDownUnneededTimeAnnotation]; ok {
-		if durationVal, err := time.ParseDuration(mcdAnnotations[ScaleDownUnneededTimeAnnotation]); err == nil {
-			scaleDownUnneededDuration = durationVal
+	if val, ok := mcdAnnotations[ScaleDownUnneededTimeAnnotation]; ok {
+		if durationVal, err := time.ParseDuration(val); err == nil {
+			options.ScaleDownUnneededTime = durationVal
 		}
 	}
-	scaleDownUnreadyDuration := defaults.ScaleDownUnreadyTime
-	if _, ok := mcdAnnotations[ScaleDownUnreadyTimeAnnotation]; ok {
-		if durationVal, err := time.ParseDuration(mcdAnnotations[ScaleDownUnreadyTimeAnnotation]); err == nil {
-			scaleDownUnreadyDuration = durationVal
+	if val, ok := mcdAnnotations[ScaleDownUnreadyTimeAnnotation]; ok {
+		if durationVal, err := time.ParseDuration(val); err == nil {
+			options.ScaleDownUnreadyTime = durationVal
 		}
 	}
-	maxNodeProvisionDuration := defaults.MaxNodeProvisionTime
-	if _, ok := mcdAnnotations[MaxNodeProvisionTimeAnnotation]; ok {
-		if durationVal, err := time.ParseDuration(mcdAnnotations[MaxNodeProvisionTimeAnnotation]); err == nil {
-			maxNodeProvisionDuration = durationVal
+	if val, ok := mcdAnnotations[MaxNodeProvisionTimeAnnotation]; ok {
+		if durationVal, err := time.ParseDuration(val); err == nil {
+			options.MaxNodeProvisionTime = durationVal
 		}
 	}
-	return &config.NodeGroupAutoscalingOptions{
-		ScaleDownUtilizationThreshold:    scaleDownUtilThresholdValue,
-		ScaleDownGpuUtilizationThreshold: scaleDownGPUUtilThresholdValue,
-		ScaleDownUnneededTime:            scaleDownUnneededDuration,
-		ScaleDownUnreadyTime:             scaleDownUnreadyDuration,
-		MaxNodeProvisionTime:             maxNodeProvisionDuration,
-	}, nil
+	return &options, nil
 }
 
 // TemplateNodeInfo returns a node template for this node group.
diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go
index f8242a0e27cb..1ef72a061afa 100644
--- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go
+++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go
@@ -590,6 +590,16 @@ func TestNodes(t *testing.T) {
 }
 
 func TestGetOptions(t *testing.T) {
+	ngAutoScalingOpDefaults := config.NodeGroupAutoscalingOptions{
+		ScaleDownUtilizationThreshold:    0.5,
+		ScaleDownGpuUtilizationThreshold: 0.5,
+		ScaleDownUnneededTime:            1 * time.Minute,
+		ScaleDownUnreadyTime:             1 * time.Minute,
+		MaxNodeProvisionTime:             1 * time.Minute,
+		IgnoreDaemonSetsUtilization:      true,
+		ZeroOrMaxNodeScaling:             true,
+	}
+
 	type expect struct {
 		ngOptions *config.NodeGroupAutoscalingOptions
 		err       error
@@ -616,14 +626,8 @@ func TestGetOptions(t *testing.T) {
 				nodeGroups: []string{nodeGroup1},
 			},
 			expect{
-				ngOptions: &config.NodeGroupAutoscalingOptions{
-					ScaleDownUtilizationThreshold:    0.5,
-					ScaleDownGpuUtilizationThreshold: 0.5,
-					ScaleDownUnneededTime:            1 * time.Minute,
-					ScaleDownUnreadyTime:             1 * time.Minute,
-					MaxNodeProvisionTime:             1 * time.Minute,
-				},
-				err: nil,
+				ngOptions: &ngAutoScalingOpDefaults,
+				err:       nil,
 			},
 		},
 		{
@@ -651,6 +655,8 @@ func TestGetOptions(t *testing.T) {
 					ScaleDownUnneededTime:            5 * time.Minute,
 					ScaleDownUnreadyTime:             5 * time.Minute,
 					MaxNodeProvisionTime:             5 * time.Minute,
+					IgnoreDaemonSetsUtilization:      ngAutoScalingOpDefaults.IgnoreDaemonSetsUtilization,
+					ZeroOrMaxNodeScaling:             ngAutoScalingOpDefaults.ZeroOrMaxNodeScaling,
 				},
 				err: nil,
 			},
@@ -678,6 +684,8 @@ func TestGetOptions(t *testing.T) {
 					ScaleDownUnneededTime:            5 * time.Minute,
 					ScaleDownUnreadyTime:             1 * time.Minute,
 					MaxNodeProvisionTime:             2 * time.Minute,
+					IgnoreDaemonSetsUtilization:      ngAutoScalingOpDefaults.IgnoreDaemonSetsUtilization,
+					ZeroOrMaxNodeScaling:             ngAutoScalingOpDefaults.ZeroOrMaxNodeScaling,
 				},
 				err: nil,
 			},
@@ -699,14 +707,6 @@ func TestGetOptions(t *testing.T) {
 			md, err := buildMachineDeploymentFromSpec(entry.setup.nodeGroups[0], m)
 			g.Expect(err).To(BeNil())
 
-			ngAutoScalingOpDefaults := config.NodeGroupAutoscalingOptions{
-				ScaleDownUtilizationThreshold:    0.5,
-				ScaleDownGpuUtilizationThreshold: 0.5,
-				ScaleDownUnneededTime:            1 * time.Minute,
-				ScaleDownUnreadyTime:             1 * time.Minute,
-				MaxNodeProvisionTime:             1 * time.Minute,
-			}
-
 			options, err := md.GetOptions(ngAutoScalingOpDefaults)
 
 			if entry.expect.err != nil {
diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go
index d6363ea9df9f..bee23e1026d7 100644
--- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go
+++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go
@@ -28,6 +28,7 @@ import (
 	"flag"
 	"fmt"
 	v1appslister "k8s.io/client-go/listers/apps/v1"
+	"k8s.io/utils/pointer"
 	"math/rand"
 	"net/http"
 	"os"
@@ -143,6 +144,7 @@ type nodeTemplate struct {
 	InstanceType *instanceType
 	Region       string
 	Zone         string
+	Architecture *string
 	Labels       map[string]string
 	Taints       []apiv1.Taint
 }
@@ -659,7 +661,7 @@ func findMatchingInstance(nodes []*v1.Node, machine *v1alpha1.Machine) cloudprov
-	// Report InstanceStatus only for `ResourceExhausted` errors
+	// Report the creation status of the machine; ErrorInfo is attached only for `ResourceExhausted` errors
 	return cloudprovider.Instance{
 		Id:     placeholderInstanceIDForMachineObj(machine.Name),
-		Status: checkAndGetResourceExhaustedInstanceStatus(machine),
+		Status: generateInstanceStatus(machine),
 	}
 }
 
@@ -667,17 +669,22 @@ func placeholderInstanceIDForMachineObj(name string) string {
 	return fmt.Sprintf("requested://%s", name)
 }
 
-// checkAndGetResourceExhaustedInstanceStatus returns cloudprovider.InstanceStatus for the machine obj
-func checkAndGetResourceExhaustedInstanceStatus(machine *v1alpha1.Machine) *cloudprovider.InstanceStatus {
-	if machine.Status.LastOperation.Type == v1alpha1.MachineOperationCreate && machine.Status.LastOperation.State == v1alpha1.MachineStateFailed && machine.Status.LastOperation.ErrorCode == machinecodes.ResourceExhausted.String() {
-		return &cloudprovider.InstanceStatus{
-			State: cloudprovider.InstanceCreating,
-			ErrorInfo: &cloudprovider.InstanceErrorInfo{
-				ErrorClass:   cloudprovider.OutOfResourcesErrorClass,
-				ErrorCode:    machinecodes.ResourceExhausted.String(),
-				ErrorMessage: machine.Status.LastOperation.Description,
-			},
+// generateInstanceStatus returns cloudprovider.InstanceStatus for the machine obj.
+// A machine whose last operation is a create is reported as InstanceCreating, with ErrorInfo
+// set only when that create failed with ResourceExhausted. For any other operation nil is returned.
+func generateInstanceStatus(machine *v1alpha1.Machine) *cloudprovider.InstanceStatus {
+	if machine.Status.LastOperation.Type == v1alpha1.MachineOperationCreate {
+		if machine.Status.LastOperation.State == v1alpha1.MachineStateFailed && machine.Status.LastOperation.ErrorCode == machinecodes.ResourceExhausted.String() {
+			return &cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceCreating,
+				ErrorInfo: &cloudprovider.InstanceErrorInfo{
+					ErrorClass:   cloudprovider.OutOfResourcesErrorClass,
+					ErrorCode:    machinecodes.ResourceExhausted.String(),
+					ErrorMessage: machine.Status.LastOperation.Description,
+				},
+			}
 		}
+		return &cloudprovider.InstanceStatus{State: cloudprovider.InstanceCreating}
 	}
 	return nil
 }
@@ -741,6 +748,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
 		req, _           = labels.NewRequirement(nodegroupset.LabelWorkerPool, selection.Equals, list)
 		region           string
 		zone             string
+		architecture     *string
 		instance         instanceType
 		machineClass     = md.Spec.Template.Spec.Class
 		nodeTemplateSpec = md.Spec.Template.Spec.NodeTemplateSpec
@@ -793,6 +801,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
 			instance.InstanceType = nodeTemplateAttributes.InstanceType
 			region = nodeTemplateAttributes.Region
 			zone = nodeTemplateAttributes.Zone
+			architecture = nodeTemplateAttributes.Architecture
 			break
 		}
 
@@ -818,6 +827,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
 			}
 			region = providerSpec.Region
 			zone = getZoneValueFromMCLabels(mc.Labels)
+			architecture = pointer.String(providerSpec.Tags[apiv1.LabelArchStable])
 		case providerAzure:
 			var providerSpec *azureapis.AzureProviderSpec
 			err = json.Unmarshal(mc.ProviderSpec.Raw, &providerSpec)
@@ -840,6 +850,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
 			if providerSpec.Properties.Zone != nil {
 				zone = providerSpec.Location + "-" + strconv.Itoa(*providerSpec.Properties.Zone)
 			}
+			architecture = pointer.String(providerSpec.Tags["kubernetes.io_arch"])
 		default:
 			return nil, cloudprovider.ErrNotImplemented
 		}
@@ -863,6 +874,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
 		Zone:         zone, // will be implemented in MCM
 		Labels:       labels,
 		Taints:       taints,
+		Architecture: architecture,
 	}
 
 	return nodeTmpl, nil
@@ -978,9 +990,13 @@ func (m *McmManager) buildNodeFromTemplate(name string, template *nodeTemplate)
 func buildGenericLabels(template *nodeTemplate, nodeName string) map[string]string {
 	result := make(map[string]string)
 	// TODO: extract from MCM
-	result[kubeletapis.LabelArch] = cloudprovider.DefaultArch
-	result[apiv1.LabelArchStable] = cloudprovider.DefaultArch
-
+	// A missing provider tag reaches here as a pointer to "", so treat nil and empty alike and use the default.
+	arch := cloudprovider.DefaultArch
+	if template.Architecture != nil && *template.Architecture != "" {
+		arch = *template.Architecture
+	}
+	result[kubeletapis.LabelArch] = arch
+	result[apiv1.LabelArchStable] = arch
 	result[kubeletapis.LabelOS] = cloudprovider.DefaultOS
 	result[apiv1.LabelOSStable] = cloudprovider.DefaultOS
 