Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

agent/goalcu: implement ZoneLoadAverage #1148

Merged
merged 7 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion autoscaler-agent/config_map.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ data:
"enableLFCMetrics": false,
"lfcToMemoryRatio": 0.75,
"lfcWindowSizeMinutes": 5,
"lfcMinWaitBeforeDownscaleMinutes": 5
"lfcMinWaitBeforeDownscaleMinutes": 5,
"cpuStableZoneRatio": 0,
"cpuMixedZoneRatio": 0
Omrigan marked this conversation as resolved.
Show resolved Hide resolved
}
},
"billing": {
Expand Down
24 changes: 23 additions & 1 deletion pkg/agent/core/goalcu.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/samber/lo"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"golang.org/x/exp/constraints"

"github.com/neondatabase/autoscaling/pkg/api"
)
Expand Down Expand Up @@ -65,11 +66,32 @@ func calculateCPUGoalCU(
computeUnit api.Resources,
systemMetrics SystemMetrics,
) uint32 {
goalCPUs := systemMetrics.LoadAverage1Min / *cfg.LoadAverageFractionTarget
stableThreshold := *cfg.CPUStableZoneRatio * systemMetrics.LoadAverage5Min
mixedThreshold := stableThreshold + *cfg.CPUMixedZoneRatio*systemMetrics.LoadAverage5Min
Omrigan marked this conversation as resolved.
Show resolved Hide resolved

diff := math.Abs(systemMetrics.LoadAverage1Min - systemMetrics.LoadAverage5Min)
// load1Weight is 0 when diff <= stableThreshold; otherwise it is 1 when diff >= mixedThreshold.
// If diff is strictly between the thresholds, it'll be between 0 and 1.
load1Weight := blendingFactor(diff, stableThreshold, mixedThreshold)
Omrigan marked this conversation as resolved.
Show resolved Hide resolved

blendedLoadAverage := load1Weight*systemMetrics.LoadAverage1Min + (1-load1Weight)*systemMetrics.LoadAverage5Min

goalCPUs := blendedLoadAverage / *cfg.LoadAverageFractionTarget
cpuGoalCU := uint32(math.Round(goalCPUs / computeUnit.VCPU.AsFloat64()))
return cpuGoalCU
}

// blendingFactor reports how far value lies along the interval from t1 to t2, as a
// weight in the range [0, 1].
//
// The result is 0 when value is at or below t1 (this check is made first, so it also
// wins when t1 == t2), 1 when value is at or above t2, and otherwise the position of
// value between t1 and t2, interpolated linearly.
func blendingFactor[T constraints.Float](value, t1, t2 T) T {
	switch {
	case value <= t1:
		return 0
	case value >= t2:
		return 1
	}

	// Reaching this point means t1 < value < t2, so t2-t1 is already positive; the extra
	// 1e-6 is purely defensive against a zero denominator. Note that it makes the result
	// very slightly smaller than the exact linear interpolation.
	width := t2 - t1 + 1e-6
	return (value - t1) / width
}

// For Mem:
// Goal compute unit is at the point where (Mem) * (MemoryUsageFractionTarget) == (Mem Usage)
// We can get the desired memory allocation in bytes by dividing MU by MUFT, and then convert
Expand Down
137 changes: 137 additions & 0 deletions pkg/agent/core/goalcu_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
package core

import (
"testing"

"github.com/samber/lo"
"github.com/stretchr/testify/assert"

"github.com/neondatabase/autoscaling/pkg/api"
)

// Test_calculateGoalCU is a table-driven test of calculateGoalCU.
//
// Each case starts from a shared baseline api.ScalingConfig (with both CPU zone ratios
// set to zero), optionally adjusts it, and then compares the scalingGoal returned for the
// given system metrics against the expected one. Every case passes nil LFC metrics and
// expects hasAllMetrics to be false.
func Test_calculateGoalCU(t *testing.T) {
	gib := api.Bytes(1 << 30 /* 1 Gi */)
	// NOTE(review): the per-case comments below ("1*4 = 4 CUs") imply that VCPU: 250
	// means a quarter of a CPU per compute unit — confirm against api.Resources.
	computeUnit := api.Resources{VCPU: 250, Mem: gib}

	baseConfig := api.ScalingConfig{
		LoadAverageFractionTarget:        lo.ToPtr(1.0),
		MemoryUsageFractionTarget:        lo.ToPtr(0.5),
		MemoryTotalFractionTarget:        lo.ToPtr(0.9),
		EnableLFCMetrics:                 lo.ToPtr(true),
		LFCToMemoryRatio:                 lo.ToPtr(0.75),
		LFCWindowSizeMinutes:             lo.ToPtr(5),
		LFCMinWaitBeforeDownscaleMinutes: lo.ToPtr(5),
		CPUStableZoneRatio:               lo.ToPtr(0.0),
		CPUMixedZoneRatio:                lo.ToPtr(0.0),
	}

	// Warnings are not under test here, so they are discarded.
	ignoreWarning := func(string) {}

	// cpuMetrics builds system metrics that carry only load averages; both memory
	// figures are left at zero.
	cpuMetrics := func(load1, load5 float64) *SystemMetrics {
		return &SystemMetrics{
			LoadAverage1Min:   load1,
			LoadAverage5Min:   load5,
			MemoryUsageBytes:  0,
			MemoryCachedBytes: 0,
		}
	}

	type testCase struct {
		name       string
		cfgUpdater func(*api.ScalingConfig)
		sys        *SystemMetrics
		lfc        *LFCMetrics
		want       scalingGoal
	}

	testCases := []testCase{
		{
			// No metrics at all.
			name:       "basic",
			cfgUpdater: nil,
			sys:        nil,
			lfc:        nil,
			want: scalingGoal{
				goalCU:        0,
				hasAllMetrics: false,
			},
		},
		{
			name:       "cpu-load1-1cu",
			cfgUpdater: nil,
			sys:        cpuMetrics(0.2, 0),
			lfc:        nil,
			want: scalingGoal{
				goalCU:        1,
				hasAllMetrics: false,
			},
		},
		{
			name:       "cpu-load1-4cu",
			cfgUpdater: nil,
			sys:        cpuMetrics(1, 0),
			lfc:        nil,
			want: scalingGoal{
				goalCU:        4,
				hasAllMetrics: false,
			},
		},
		{
			name: "cpu-zone-load1",
			cfgUpdater: func(cfg *api.ScalingConfig) {
				cfg.CPUStableZoneRatio = lo.ToPtr(0.5)
			},
			// The 1-minute load of 0.7 is equal to 3 CUs.
			sys: cpuMetrics(0.7, 0.0),
			lfc: nil,
			want: scalingGoal{
				goalCU:        3,
				hasAllMetrics: false,
			},
		},
		{
			name: "cpu-zone-load5",
			cfgUpdater: func(cfg *api.ScalingConfig) {
				cfg.CPUStableZoneRatio = lo.ToPtr(0.5)
			},
			// The 1-minute load (1) is ignored, because it is in the stable zone;
			// the 5-minute load of 0.7 is equal to 3 CUs.
			sys: cpuMetrics(1, 0.7),
			lfc: nil,
			want: scalingGoal{
				goalCU:        3,
				hasAllMetrics: false,
			},
		},
		{
			name: "cpu-zone-mixed",
			cfgUpdater: func(cfg *api.ScalingConfig) {
				cfg.CPUStableZoneRatio = lo.ToPtr(0.5)
				cfg.CPUMixedZoneRatio = lo.ToPtr(0.5)
			},
			// 1-minute load: 1.75*4 = 7 CUs; 5-minute load: 1*4 = 4 CUs.
			sys: cpuMetrics(1.75, 1),
			lfc: nil,
			want: scalingGoal{
				goalCU:        5, // Weighted average between 7 and 4 CUs
				hasAllMetrics: false,
			},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			// Work on a copy so that one case's adjustments never leak into another.
			cfg := baseConfig
			if adjust := tc.cfgUpdater; adjust != nil {
				adjust(&cfg)
			}

			// The second return value is not checked by this test.
			goal, _ := calculateGoalCU(ignoreWarning, cfg, computeUnit, tc.sys, tc.lfc)
			assert.Equal(t, tc.want, goal)
		})
	}
}
6 changes: 4 additions & 2 deletions pkg/agent/core/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ import (
)

type SystemMetrics struct {
LoadAverage1Min float64

LoadAverage1Min float64
LoadAverage5Min float64
MemoryUsageBytes float64
MemoryCachedBytes float64
}
Expand Down Expand Up @@ -94,12 +94,14 @@ func (m *SystemMetrics) fromPrometheus(mfs map[string]*promtypes.MetricFamily) e
}

load1 := getFloat("host_load1")
load5 := getFloat("host_load5")
memTotal := getFloat("host_memory_total_bytes")
memAvailable := getFloat("host_memory_available_bytes")
memCached := getFloat("host_memory_cached_bytes")

tmp := SystemMetrics{
LoadAverage1Min: load1,
LoadAverage5Min: load5,
// Add an extra 100 MiB to account for kernel memory usage
MemoryUsageBytes: memTotal - memAvailable + 100*(1<<20),
MemoryCachedBytes: memCached,
Expand Down
22 changes: 22 additions & 0 deletions pkg/agent/core/state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
name: "BasicScaleup",
systemMetrics: &core.SystemMetrics{
LoadAverage1Min: 0.30,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
},
Expand All @@ -58,6 +59,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
name: "MismatchedApprovedNoScaledown",
systemMetrics: &core.SystemMetrics{
LoadAverage1Min: 0.0, // ordinarily would like to scale down
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
},
Expand All @@ -77,6 +79,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
name: "MismatchedApprovedNoScaledownButVMAtMaximum",
systemMetrics: &core.SystemMetrics{
LoadAverage1Min: 0.0, // ordinarily would like to scale down
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
},
Expand All @@ -96,6 +99,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
name: "BasicLFCScaleup",
systemMetrics: &core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
},
Expand Down Expand Up @@ -131,6 +135,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
name: "CanScaleUpWithoutExpectedLFCMetrics",
systemMetrics: &core.SystemMetrics{
LoadAverage1Min: 0.30,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
},
Expand All @@ -152,6 +157,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
name: "CanScaleToBoundsWithoutExpectedMetrics",
systemMetrics: &core.SystemMetrics{
LoadAverage1Min: 0.30,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
},
Expand Down Expand Up @@ -237,6 +243,8 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
LFCToMemoryRatio: lo.ToPtr(0.75),
LFCWindowSizeMinutes: lo.ToPtr(5),
LFCMinWaitBeforeDownscaleMinutes: lo.ToPtr(5),
CPUStableZoneRatio: lo.ToPtr(0.0),
CPUMixedZoneRatio: lo.ToPtr(0.0),
},
// these don't really matter, because we're not using (*State).NextActions()
NeonVMRetryWait: time.Second,
Expand Down Expand Up @@ -325,6 +333,8 @@ var DefaultInitialStateConfig = helpers.InitialStateConfig{
LFCToMemoryRatio: lo.ToPtr(0.75),
LFCWindowSizeMinutes: lo.ToPtr(5),
LFCMinWaitBeforeDownscaleMinutes: lo.ToPtr(15),
CPUStableZoneRatio: lo.ToPtr(0.0),
CPUMixedZoneRatio: lo.ToPtr(0.0),
},
NeonVMRetryWait: 5 * time.Second,
PluginRequestTick: 5 * time.Second,
Expand Down Expand Up @@ -450,6 +460,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) {
clockTick().AssertEquals(duration("0.2s"))
lastMetrics := core.SystemMetrics{
LoadAverage1Min: 0.3,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -549,6 +560,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) {
// Set metrics back so that desired resources should now be zero
lastMetrics = core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -652,6 +664,7 @@ func TestPeriodicPluginRequest(t *testing.T) {

metrics := core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -742,6 +755,7 @@ func TestPartialUpscaleThenFull(t *testing.T) {
clockTick()
metrics := core.SystemMetrics{
LoadAverage1Min: 1.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 12345678,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -886,6 +900,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) {
clockTick()
metrics := core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -1162,6 +1177,7 @@ func TestRequestedUpscale(t *testing.T) {
clockTick()
lastMetrics := core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -1305,11 +1321,13 @@ func TestDownscalePivotBack(t *testing.T) {

initialMetrics := core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
newMetrics := core.SystemMetrics{
LoadAverage1Min: 0.3,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -1547,6 +1565,7 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) {
// Set metrics so the desired resources are still 2 CU
metrics := core.SystemMetrics{
LoadAverage1Min: 0.3,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -1662,6 +1681,7 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) {
// Set metrics so the desired resources are still 2 CU
metrics := core.SystemMetrics{
LoadAverage1Min: 0.3,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -1762,6 +1782,7 @@ func TestFailedRequestRetry(t *testing.T) {
clockTick()
metrics := core.SystemMetrics{
LoadAverage1Min: 0.3,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -1919,6 +1940,7 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) {
// the actual metrics we got in the actual logs
metrics := core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 150589570, // 143.6 MiB
MemoryCachedBytes: 0.0,
}
Expand Down
Loading
Loading