Skip to content

Commit

Permalink
Merge branch 'machine-controller-manager-provider' into extended-resources
Browse files Browse the repository at this point in the history
  • Loading branch information
elankath committed Jul 23, 2024
2 parents b29f3b6 + 2f7cf88 commit 463510d
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 47 deletions.
24 changes: 7 additions & 17 deletions cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -478,48 +478,38 @@ func (machinedeployment *MachineDeployment) Nodes() ([]cloudprovider.Instance, e
// NodeGroup. Returning a nil will result in using default options.
// Implementation optional.
//
// The returned options start as a copy of defaults. Each supported
// MachineDeployment annotation that is present and parses successfully
// overrides the corresponding field; every other field keeps its default.
// An error is returned only when the annotations cannot be fetched.
func (machinedeployment *MachineDeployment) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
	// Copy the defaults so that fields without a matching annotation are
	// carried over unchanged instead of being reset to their zero value.
	options := defaults
	mcdAnnotations, err := machinedeployment.mcmManager.GetMachineDeploymentAnnotations(machinedeployment.Name)
	if err != nil {
		return nil, err
	}

	// Annotation values that fail to parse are ignored on purpose: the
	// default for that option is retained rather than failing the call.
	if raw, ok := mcdAnnotations[ScaleDownUtilizationThresholdAnnotation]; ok {
		if floatVal, err := strconv.ParseFloat(raw, 64); err == nil {
			options.ScaleDownUtilizationThreshold = floatVal
		}
	}
	if raw, ok := mcdAnnotations[ScaleDownGpuUtilizationThresholdAnnotation]; ok {
		if floatVal, err := strconv.ParseFloat(raw, 64); err == nil {
			options.ScaleDownGpuUtilizationThreshold = floatVal
		}
	}
	if raw, ok := mcdAnnotations[ScaleDownUnneededTimeAnnotation]; ok {
		if durationVal, err := time.ParseDuration(raw); err == nil {
			options.ScaleDownUnneededTime = durationVal
		}
	}
	if raw, ok := mcdAnnotations[ScaleDownUnreadyTimeAnnotation]; ok {
		if durationVal, err := time.ParseDuration(raw); err == nil {
			options.ScaleDownUnreadyTime = durationVal
		}
	}
	if raw, ok := mcdAnnotations[MaxNodeProvisionTimeAnnotation]; ok {
		if durationVal, err := time.ParseDuration(raw); err == nil {
			options.MaxNodeProvisionTime = durationVal
		}
	}
	return &options, nil
}

// TemplateNodeInfo returns a node template for this node group.
Expand Down
32 changes: 16 additions & 16 deletions cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,16 @@ func TestNodes(t *testing.T) {
}

func TestGetOptions(t *testing.T) {
ngAutoScalingOpDefaults := config.NodeGroupAutoscalingOptions{
ScaleDownUtilizationThreshold: 0.5,
ScaleDownGpuUtilizationThreshold: 0.5,
ScaleDownUnneededTime: 1 * time.Minute,
ScaleDownUnreadyTime: 1 * time.Minute,
MaxNodeProvisionTime: 1 * time.Minute,
IgnoreDaemonSetsUtilization: true,
ZeroOrMaxNodeScaling: true,
}

type expect struct {
ngOptions *config.NodeGroupAutoscalingOptions
err error
Expand All @@ -616,14 +626,8 @@ func TestGetOptions(t *testing.T) {
nodeGroups: []string{nodeGroup1},
},
expect{
ngOptions: &config.NodeGroupAutoscalingOptions{
ScaleDownUtilizationThreshold: 0.5,
ScaleDownGpuUtilizationThreshold: 0.5,
ScaleDownUnneededTime: 1 * time.Minute,
ScaleDownUnreadyTime: 1 * time.Minute,
MaxNodeProvisionTime: 1 * time.Minute,
},
err: nil,
ngOptions: &ngAutoScalingOpDefaults,
err: nil,
},
},
{
Expand Down Expand Up @@ -651,6 +655,8 @@ func TestGetOptions(t *testing.T) {
ScaleDownUnneededTime: 5 * time.Minute,
ScaleDownUnreadyTime: 5 * time.Minute,
MaxNodeProvisionTime: 5 * time.Minute,
IgnoreDaemonSetsUtilization: ngAutoScalingOpDefaults.IgnoreDaemonSetsUtilization,
ZeroOrMaxNodeScaling: ngAutoScalingOpDefaults.ZeroOrMaxNodeScaling,
},
err: nil,
},
Expand Down Expand Up @@ -678,6 +684,8 @@ func TestGetOptions(t *testing.T) {
ScaleDownUnneededTime: 5 * time.Minute,
ScaleDownUnreadyTime: 1 * time.Minute,
MaxNodeProvisionTime: 2 * time.Minute,
IgnoreDaemonSetsUtilization: ngAutoScalingOpDefaults.IgnoreDaemonSetsUtilization,
ZeroOrMaxNodeScaling: ngAutoScalingOpDefaults.ZeroOrMaxNodeScaling,
},
err: nil,
},
Expand All @@ -699,14 +707,6 @@ func TestGetOptions(t *testing.T) {
md, err := buildMachineDeploymentFromSpec(entry.setup.nodeGroups[0], m)
g.Expect(err).To(BeNil())

ngAutoScalingOpDefaults := config.NodeGroupAutoscalingOptions{
ScaleDownUtilizationThreshold: 0.5,
ScaleDownGpuUtilizationThreshold: 0.5,
ScaleDownUnneededTime: 1 * time.Minute,
ScaleDownUnreadyTime: 1 * time.Minute,
MaxNodeProvisionTime: 1 * time.Minute,
}

options, err := md.GetOptions(ngAutoScalingOpDefaults)

if entry.expect.err != nil {
Expand Down
42 changes: 28 additions & 14 deletions cluster-autoscaler/cloudprovider/mcm/mcm_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"flag"
"fmt"
v1appslister "k8s.io/client-go/listers/apps/v1"
"k8s.io/utils/pointer"
"math/rand"
"net/http"
"os"
Expand Down Expand Up @@ -143,6 +144,7 @@ type nodeTemplate struct {
InstanceType *instanceType
Region string
Zone string
Architecture *string
Labels map[string]string
Taints []apiv1.Taint
}
Expand Down Expand Up @@ -659,25 +661,28 @@ func findMatchingInstance(nodes []*v1.Node, machine *v1alpha1.Machine) cloudprov
// Report InstanceStatus only for `ResourceExhausted` errors
return cloudprovider.Instance{
Id: placeholderInstanceIDForMachineObj(machine.Name),
Status: checkAndGetResourceExhaustedInstanceStatus(machine),
Status: generateInstanceStatus(machine),
}
}

// placeholderInstanceIDForMachineObj builds the placeholder instance ID used
// for a machine object, formed by prefixing the machine name with
// "requested://".
func placeholderInstanceIDForMachineObj(name string) string {
	const placeholderFormat = "requested://%s"
	return fmt.Sprintf(placeholderFormat, name)
}

// checkAndGetResourceExhaustedInstanceStatus returns cloudprovider.InstanceStatus for the machine obj
func checkAndGetResourceExhaustedInstanceStatus(machine *v1alpha1.Machine) *cloudprovider.InstanceStatus {
if machine.Status.LastOperation.Type == v1alpha1.MachineOperationCreate && machine.Status.LastOperation.State == v1alpha1.MachineStateFailed && machine.Status.LastOperation.ErrorCode == machinecodes.ResourceExhausted.String() {
return &cloudprovider.InstanceStatus{
State: cloudprovider.InstanceCreating,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: machinecodes.ResourceExhausted.String(),
ErrorMessage: machine.Status.LastOperation.Description,
},
// generateInstanceStatus returns cloudprovider.InstanceStatus for the machine obj
func generateInstanceStatus(machine *v1alpha1.Machine) *cloudprovider.InstanceStatus {
if machine.Status.LastOperation.Type == v1alpha1.MachineOperationCreate {
if machine.Status.LastOperation.State == v1alpha1.MachineStateFailed && machine.Status.LastOperation.ErrorCode == machinecodes.ResourceExhausted.String() {
return &cloudprovider.InstanceStatus{
State: cloudprovider.InstanceCreating,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: machinecodes.ResourceExhausted.String(),
ErrorMessage: machine.Status.LastOperation.Description,
},
}
}
return &cloudprovider.InstanceStatus{State: cloudprovider.InstanceCreating}
}
return nil
}
Expand Down Expand Up @@ -741,6 +746,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
req, _ = labels.NewRequirement(nodegroupset.LabelWorkerPool, selection.Equals, list)
region string
zone string
architecture *string
instance instanceType
machineClass = md.Spec.Template.Spec.Class
nodeTemplateSpec = md.Spec.Template.Spec.NodeTemplateSpec
Expand Down Expand Up @@ -793,6 +799,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
instance.InstanceType = nodeTemplateAttributes.InstanceType
region = nodeTemplateAttributes.Region
zone = nodeTemplateAttributes.Zone
architecture = nodeTemplateAttributes.Architecture
break
}

Expand All @@ -818,6 +825,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
}
region = providerSpec.Region
zone = getZoneValueFromMCLabels(mc.Labels)
architecture = pointer.String(providerSpec.Tags[apiv1.LabelArchStable])
case providerAzure:
var providerSpec *azureapis.AzureProviderSpec
err = json.Unmarshal(mc.ProviderSpec.Raw, &providerSpec)
Expand All @@ -840,6 +848,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
if providerSpec.Properties.Zone != nil {
zone = providerSpec.Location + "-" + strconv.Itoa(*providerSpec.Properties.Zone)
}
architecture = pointer.String(providerSpec.Tags["kubernetes.io_arch"])
default:
return nil, cloudprovider.ErrNotImplemented
}
Expand All @@ -863,6 +872,7 @@ func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *Machine
Zone: zone, // will be implemented in MCM
Labels: labels,
Taints: taints,
Architecture: architecture,
}

return nodeTmpl, nil
Expand Down Expand Up @@ -978,9 +988,13 @@ func (m *McmManager) buildNodeFromTemplate(name string, template *nodeTemplate)
func buildGenericLabels(template *nodeTemplate, nodeName string) map[string]string {
result := make(map[string]string)
// TODO: extract from MCM
result[kubeletapis.LabelArch] = cloudprovider.DefaultArch
result[apiv1.LabelArchStable] = cloudprovider.DefaultArch

if template.Architecture != nil {
result[kubeletapis.LabelArch] = *template.Architecture
result[apiv1.LabelArchStable] = *template.Architecture
} else {
result[kubeletapis.LabelArch] = cloudprovider.DefaultArch
result[apiv1.LabelArchStable] = cloudprovider.DefaultArch
}
result[kubeletapis.LabelOS] = cloudprovider.DefaultOS
result[apiv1.LabelOSStable] = cloudprovider.DefaultOS

Expand Down

0 comments on commit 463510d

Please sign in to comment.