diff --git a/autoscaler-agent/config_map.yaml b/autoscaler-agent/config_map.yaml index b22c30d6b..21a7d0c75 100644 --- a/autoscaler-agent/config_map.yaml +++ b/autoscaler-agent/config_map.yaml @@ -16,7 +16,9 @@ data: "enableLFCMetrics": false, "lfcToMemoryRatio": 0.75, "lfcWindowSizeMinutes": 5, - "lfcMinWaitBeforeDownscaleMinutes": 5 + "lfcMinWaitBeforeDownscaleMinutes": 5, + "cpuStableZoneRatio": 0, + "cpuMixedZoneRatio": 0 } }, "billing": { diff --git a/pkg/agent/core/goalcu.go b/pkg/agent/core/goalcu.go index fd5785d79..e88ef0b05 100644 --- a/pkg/agent/core/goalcu.go +++ b/pkg/agent/core/goalcu.go @@ -8,6 +8,7 @@ import ( "github.com/samber/lo" "go.uber.org/zap" "go.uber.org/zap/zapcore" + "golang.org/x/exp/constraints" "github.com/neondatabase/autoscaling/pkg/api" ) @@ -65,11 +66,32 @@ func calculateCPUGoalCU( computeUnit api.Resources, systemMetrics SystemMetrics, ) uint32 { - goalCPUs := systemMetrics.LoadAverage1Min / *cfg.LoadAverageFractionTarget + stableThreshold := *cfg.CPUStableZoneRatio * systemMetrics.LoadAverage5Min + mixedThreshold := stableThreshold + *cfg.CPUMixedZoneRatio*systemMetrics.LoadAverage5Min + + diff := math.Abs(systemMetrics.LoadAverage1Min - systemMetrics.LoadAverage5Min) + // load1Weight is 0 when diff < stableThreshold, and 1 when diff > mixedThreshold. + // If diff is between the thresholds, it'll be between 0 and 1. + load1Weight := blendingFactor(diff, stableThreshold, mixedThreshold) + + blendedLoadAverage := load1Weight*systemMetrics.LoadAverage1Min + (1-load1Weight)*systemMetrics.LoadAverage5Min + + goalCPUs := blendedLoadAverage / *cfg.LoadAverageFractionTarget cpuGoalCU := uint32(math.Round(goalCPUs / computeUnit.VCPU.AsFloat64())) return cpuGoalCU } +func blendingFactor[T constraints.Float](value, t1, t2 T) T { + if value <= t1 { + return 0 + } + if value >= t2 { + return 1 + } + // 1e-6 is just a precaution, if t1==t2, we'd return earlier. 
+ return (value - t1) / (t2 - t1 + 1e-6) +} + // For Mem: // Goal compute unit is at the point where (Mem) * (MemoryUsageFractionTarget) == (Mem Usage) // We can get the desired memory allocation in bytes by dividing MU by MUFT, and then convert diff --git a/pkg/agent/core/goalcu_test.go b/pkg/agent/core/goalcu_test.go new file mode 100644 index 000000000..b620b10e8 --- /dev/null +++ b/pkg/agent/core/goalcu_test.go @@ -0,0 +1,137 @@ +package core + +import ( + "testing" + + "github.com/samber/lo" + "github.com/stretchr/testify/assert" + + "github.com/neondatabase/autoscaling/pkg/api" +) + +func Test_calculateGoalCU(t *testing.T) { + gb := api.Bytes(1 << 30 /* 1 Gi */) + cu := api.Resources{VCPU: 250, Mem: 1 * gb} + + defaultScalingConfig := api.ScalingConfig{ + LoadAverageFractionTarget: lo.ToPtr(1.0), + MemoryUsageFractionTarget: lo.ToPtr(0.5), + MemoryTotalFractionTarget: lo.ToPtr(0.9), + EnableLFCMetrics: lo.ToPtr(true), + LFCToMemoryRatio: lo.ToPtr(0.75), + LFCWindowSizeMinutes: lo.ToPtr(5), + LFCMinWaitBeforeDownscaleMinutes: lo.ToPtr(5), + CPUStableZoneRatio: lo.ToPtr(0.0), + CPUMixedZoneRatio: lo.ToPtr(0.0), + } + + warn := func(msg string) {} + + cases := []struct { + name string + cfgUpdater func(*api.ScalingConfig) + sys *SystemMetrics + lfc *LFCMetrics + want scalingGoal + }{ + { + name: "basic", + cfgUpdater: nil, + sys: nil, + lfc: nil, + want: scalingGoal{ + goalCU: 0, + hasAllMetrics: false, + }, + }, + { + name: "cpu-load1-1cu", + cfgUpdater: nil, + //nolint:exhaustruct // this is a test + sys: &SystemMetrics{ + LoadAverage1Min: 0.2, + }, + lfc: nil, + want: scalingGoal{ + goalCU: 1, + hasAllMetrics: false, + }, + }, + { + name: "cpu-load1-4cu", + cfgUpdater: nil, + //nolint:exhaustruct // this is a test + sys: &SystemMetrics{ + LoadAverage1Min: 1, + }, + lfc: nil, + want: scalingGoal{ + goalCU: 4, + hasAllMetrics: false, + }, + }, + { + name: "cpu-zone-load1", + cfgUpdater: func(cfg *api.ScalingConfig) { + cfg.CPUStableZoneRatio = lo.ToPtr(0.5) + 
}, + //nolint:exhaustruct // this is a test + sys: &SystemMetrics{ + LoadAverage1Min: 0.7, // equal to 3 CUs + LoadAverage5Min: 0.0, + }, + lfc: nil, + want: scalingGoal{ + goalCU: 3, + hasAllMetrics: false, + }, + }, + { + name: "cpu-zone-load5", + cfgUpdater: func(cfg *api.ScalingConfig) { + cfg.CPUStableZoneRatio = lo.ToPtr(0.5) + }, + sys: &SystemMetrics{ + LoadAverage1Min: 1, // value is ignored, because it is in the stable zone + LoadAverage5Min: 0.7, // equal to 3 CUs + MemoryUsageBytes: 0, + MemoryCachedBytes: 0, + }, + lfc: nil, + want: scalingGoal{ + goalCU: 3, + hasAllMetrics: false, + }, + }, + { + name: "cpu-zone-mixed", + cfgUpdater: func(cfg *api.ScalingConfig) { + cfg.CPUStableZoneRatio = lo.ToPtr(0.5) + cfg.CPUMixedZoneRatio = lo.ToPtr(0.5) + }, + sys: &SystemMetrics{ + LoadAverage1Min: 1.75, // 1.75*4 = 7 CUs + LoadAverage5Min: 1, // 1*4 = 4 CUs + MemoryUsageBytes: 0, + MemoryCachedBytes: 0, + }, + lfc: nil, + want: scalingGoal{ + goalCU: 5, // Weighted average between 7 and 4 CUs + hasAllMetrics: false, + }, + }, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + scalingConfig := defaultScalingConfig + if c.cfgUpdater != nil { + c.cfgUpdater(&scalingConfig) + } + + got, _ := calculateGoalCU(warn, scalingConfig, cu, c.sys, c.lfc) + assert.Equal(t, c.want, got) + }) + } +} diff --git a/pkg/agent/core/metrics.go b/pkg/agent/core/metrics.go index 3cb80628b..d03669caf 100644 --- a/pkg/agent/core/metrics.go +++ b/pkg/agent/core/metrics.go @@ -18,8 +18,8 @@ import ( ) type SystemMetrics struct { - LoadAverage1Min float64 - + LoadAverage1Min float64 + LoadAverage5Min float64 MemoryUsageBytes float64 MemoryCachedBytes float64 } @@ -94,12 +94,14 @@ func (m *SystemMetrics) fromPrometheus(mfs map[string]*promtypes.MetricFamily) e } load1 := getFloat("host_load1") + load5 := getFloat("host_load5") memTotal := getFloat("host_memory_total_bytes") memAvailable := getFloat("host_memory_available_bytes") memCached := 
getFloat("host_memory_cached_bytes") tmp := SystemMetrics{ LoadAverage1Min: load1, + LoadAverage5Min: load5, // Add an extra 100 MiB to account for kernel memory usage MemoryUsageBytes: memTotal - memAvailable + 100*(1<<20), MemoryCachedBytes: memCached, diff --git a/pkg/agent/core/state_test.go b/pkg/agent/core/state_test.go index bee432816..2729d6313 100644 --- a/pkg/agent/core/state_test.go +++ b/pkg/agent/core/state_test.go @@ -41,6 +41,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { name: "BasicScaleup", systemMetrics: &core.SystemMetrics{ LoadAverage1Min: 0.30, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, }, @@ -58,6 +59,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { name: "MismatchedApprovedNoScaledown", systemMetrics: &core.SystemMetrics{ LoadAverage1Min: 0.0, // ordinarily would like to scale down + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, }, @@ -77,6 +79,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { name: "MismatchedApprovedNoScaledownButVMAtMaximum", systemMetrics: &core.SystemMetrics{ LoadAverage1Min: 0.0, // ordinarily would like to scale down + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, }, @@ -96,6 +99,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { name: "BasicLFCScaleup", systemMetrics: &core.SystemMetrics{ LoadAverage1Min: 0.0, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, }, @@ -131,6 +135,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { name: "CanScaleUpWithoutExpectedLFCMetrics", systemMetrics: &core.SystemMetrics{ LoadAverage1Min: 0.30, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, }, @@ -152,6 +157,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { name: "CanScaleToBoundsWithoutExpectedMetrics", systemMetrics: &core.SystemMetrics{ 
LoadAverage1Min: 0.30, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, }, @@ -237,6 +243,8 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) { LFCToMemoryRatio: lo.ToPtr(0.75), LFCWindowSizeMinutes: lo.ToPtr(5), LFCMinWaitBeforeDownscaleMinutes: lo.ToPtr(5), + CPUStableZoneRatio: lo.ToPtr(0.0), + CPUMixedZoneRatio: lo.ToPtr(0.0), }, // these don't really matter, because we're not using (*State).NextActions() NeonVMRetryWait: time.Second, @@ -325,6 +333,8 @@ var DefaultInitialStateConfig = helpers.InitialStateConfig{ LFCToMemoryRatio: lo.ToPtr(0.75), LFCWindowSizeMinutes: lo.ToPtr(5), LFCMinWaitBeforeDownscaleMinutes: lo.ToPtr(15), + CPUStableZoneRatio: lo.ToPtr(0.0), + CPUMixedZoneRatio: lo.ToPtr(0.0), }, NeonVMRetryWait: 5 * time.Second, PluginRequestTick: 5 * time.Second, @@ -450,6 +460,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { clockTick().AssertEquals(duration("0.2s")) lastMetrics := core.SystemMetrics{ LoadAverage1Min: 0.3, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, } @@ -549,6 +560,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) { // Set metrics back so that desired resources should now be zero lastMetrics = core.SystemMetrics{ LoadAverage1Min: 0.0, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, } @@ -652,6 +664,7 @@ func TestPeriodicPluginRequest(t *testing.T) { metrics := core.SystemMetrics{ LoadAverage1Min: 0.0, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, } @@ -742,6 +755,7 @@ func TestPartialUpscaleThenFull(t *testing.T) { clockTick() metrics := core.SystemMetrics{ LoadAverage1Min: 1.0, + LoadAverage5Min: 0.0, MemoryUsageBytes: 12345678, MemoryCachedBytes: 0.0, } @@ -886,6 +900,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) { clockTick() metrics := core.SystemMetrics{ LoadAverage1Min: 0.0, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, } @@ -1162,6 +1177,7 @@ func 
TestRequestedUpscale(t *testing.T) { clockTick() lastMetrics := core.SystemMetrics{ LoadAverage1Min: 0.0, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, } @@ -1305,11 +1321,13 @@ func TestDownscalePivotBack(t *testing.T) { initialMetrics := core.SystemMetrics{ LoadAverage1Min: 0.0, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, } newMetrics := core.SystemMetrics{ LoadAverage1Min: 0.3, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, } @@ -1547,6 +1565,7 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) { // Set metrics so the desired resources are still 2 CU metrics := core.SystemMetrics{ LoadAverage1Min: 0.3, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, } @@ -1662,6 +1681,7 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) { // Set metrics so the desired resources are still 2 CU metrics := core.SystemMetrics{ LoadAverage1Min: 0.3, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, } @@ -1762,6 +1782,7 @@ func TestFailedRequestRetry(t *testing.T) { clockTick() metrics := core.SystemMetrics{ LoadAverage1Min: 0.3, + LoadAverage5Min: 0.0, MemoryUsageBytes: 0.0, MemoryCachedBytes: 0.0, } @@ -1919,6 +1940,7 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) { // the actual metrics we got in the actual logs metrics := core.SystemMetrics{ LoadAverage1Min: 0.0, + LoadAverage5Min: 0.0, MemoryUsageBytes: 150589570, // 143.6 MiB MemoryCachedBytes: 0.0, } diff --git a/pkg/api/vminfo.go b/pkg/api/vminfo.go index 015f6d68b..5663623a0 100644 --- a/pkg/api/vminfo.go +++ b/pkg/api/vminfo.go @@ -368,6 +368,16 @@ type ScalingConfig struct { // LFCWindowSizeMinutes dictates the minimum duration we must use during internal calculations // of the rate of increase in LFC working set size. LFCWindowSizeMinutes *int `json:"lfcWindowSizeMinutes,omitempty"` + + // CPUStableZoneRatio is the ratio of the stable load zone size relative to load5. 
+ // For example, a value of 0.25 means that the stable zone will be load5±25%. + CPUStableZoneRatio *float64 `json:"cpuStableZoneRatio,omitempty"` + + // CPUMixedZoneRatio is the ratio of the mixed load zone size relative to load5. + // Since the mixed zone starts after the stable zone, the values CPUStableZoneRatio=0.25 and CPUMixedZoneRatio=0.15 + // mean that the stable zone will be from 0.75*load5 to 1.25*load5, and the mixed zone will be + // from 0.6*load5 to 0.75*load5, and from 1.25*load5 to 1.4*load5. + CPUMixedZoneRatio *float64 `json:"cpuMixedZoneRatio,omitempty"` } // WithOverrides returns a new copy of defaults, where fields set in overrides replace the ones in @@ -401,6 +411,13 @@ func (defaults ScalingConfig) WithOverrides(overrides *ScalingConfig) ScalingCon defaults.LFCMinWaitBeforeDownscaleMinutes = lo.ToPtr(*overrides.LFCMinWaitBeforeDownscaleMinutes) } + if overrides.CPUStableZoneRatio != nil { + defaults.CPUStableZoneRatio = lo.ToPtr(*overrides.CPUStableZoneRatio) + } + if overrides.CPUMixedZoneRatio != nil { + defaults.CPUMixedZoneRatio = lo.ToPtr(*overrides.CPUMixedZoneRatio) + } + return defaults } @@ -453,6 +470,8 @@ func (c *ScalingConfig) validate(requireAll bool) error { erc.Whenf(ec, c.LFCToMemoryRatio == nil, "%s is a required field", ".lfcToMemoryRatio") erc.Whenf(ec, c.LFCWindowSizeMinutes == nil, "%s is a required field", ".lfcWindowSizeMinutes") erc.Whenf(ec, c.LFCMinWaitBeforeDownscaleMinutes == nil, "%s is a required field", ".lfcMinWaitBeforeDownscaleMinutes") + erc.Whenf(ec, c.CPUStableZoneRatio == nil, "%s is a required field", ".cpuStableZoneRatio") + erc.Whenf(ec, c.CPUMixedZoneRatio == nil, "%s is a required field", ".cpuMixedZoneRatio") } // heads-up! some functions elsewhere depend on the concrete return type of this function.