Skip to content

Commit

Permalink
Implement ZoneLoadAverage
Browse files Browse the repository at this point in the history
Signed-off-by: Oleg Vasilev <[email protected]>
  • Loading branch information
Omrigan committed Nov 18, 2024
1 parent 7d0d521 commit d436c3d
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 4 deletions.
4 changes: 3 additions & 1 deletion autoscaler-agent/config_map.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ data:
"enableLFCMetrics": false,
"lfcToMemoryRatio": 0.75,
"lfcWindowSizeMinutes": 5,
"lfcMinWaitBeforeDownscaleMinutes": 5
"lfcMinWaitBeforeDownscaleMinutes": 5,
"cpuStableZoneRatio": 0.25,
"cpuMixedZoneRatio": 0.25
}
},
"billing": {
Expand Down
21 changes: 20 additions & 1 deletion pkg/agent/core/goalcu.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/samber/lo"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"golang.org/x/exp/constraints"

"github.com/neondatabase/autoscaling/pkg/api"
)
Expand Down Expand Up @@ -65,11 +66,29 @@ func calculateCPUGoalCU(
computeUnit api.Resources,
systemMetrics SystemMetrics,
) uint32 {
goalCPUs := systemMetrics.LoadAverage1Min / *cfg.LoadAverageFractionTarget
stableThreshold := *cfg.CPUStableZoneRatio * systemMetrics.LoadAverage5Min
mixedThreshold := stableThreshold + *cfg.CPUMixedZoneRatio*systemMetrics.LoadAverage5Min

diff := math.Abs(systemMetrics.LoadAverage1Min - systemMetrics.LoadAverage5Min)
load1Weight := blendingFactor(diff, stableThreshold, mixedThreshold)

blendedLoadAverage := load1Weight*systemMetrics.LoadAverage1Min + (1-load1Weight)*systemMetrics.LoadAverage5Min

goalCPUs := blendedLoadAverage / *cfg.LoadAverageFractionTarget
cpuGoalCU := uint32(math.Round(goalCPUs / computeUnit.VCPU.AsFloat64()))
return cpuGoalCU
}

// blendingFactor maps value onto a weight in [0, 1] relative to the thresholds
// t1 and t2:
//
//   - value < t1 yields 0;
//   - value >= t2 yields 1;
//   - otherwise the weight is the linear interpolation (value-t1)/(t2-t1).
//
// When t1 >= t2 the interpolation zone is empty and the function degenerates
// to a step from 0 to 1 at t1.
func blendingFactor[T constraints.Float](value, t1, t2 T) T {
	switch {
	case value < t1:
		return 0
	case value >= t2:
		return 1
	default:
		// For non-NaN inputs, reaching this branch implies t1 <= value < t2,
		// so t2-t1 is strictly positive and the division needs no epsilon
		// guard. Dropping the epsilon keeps the weight unbiased and makes it
		// approach 1 continuously as value approaches t2.
		return (value - t1) / (t2 - t1)
	}
}

// For Mem:
// Goal compute unit is at the point where (Mem) * (MemoryUsageFractionTarget) == (Mem Usage)
// We can get the desired memory allocation in bytes by dividing MU by MUFT, and then convert
Expand Down
6 changes: 4 additions & 2 deletions pkg/agent/core/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ import (
)

// SystemMetrics holds the host-level load and memory readings that are
// extracted from the VM's Prometheus metrics by fromPrometheus.
type SystemMetrics struct {
	// LoadAverage1Min is the value of the "host_load1" metric.
	LoadAverage1Min float64
	// LoadAverage5Min is the value of the "host_load5" metric.
	LoadAverage5Min float64
	// MemoryUsageBytes is total memory minus available memory, plus an extra
	// 100 MiB that fromPrometheus adds to account for kernel memory usage.
	MemoryUsageBytes float64
	// MemoryCachedBytes is the value of the "host_memory_cached_bytes" metric.
	MemoryCachedBytes float64
}
Expand Down Expand Up @@ -94,12 +94,14 @@ func (m *SystemMetrics) fromPrometheus(mfs map[string]*promtypes.MetricFamily) e
}

load1 := getFloat("host_load1")
load5 := getFloat("host_load5")
memTotal := getFloat("host_memory_total_bytes")
memAvailable := getFloat("host_memory_available_bytes")
memCached := getFloat("host_memory_cached_bytes")

tmp := SystemMetrics{
LoadAverage1Min: load1,
LoadAverage5Min: load5,
// Add an extra 100 MiB to account for kernel memory usage
MemoryUsageBytes: memTotal - memAvailable + 100*(1<<20),
MemoryCachedBytes: memCached,
Expand Down
22 changes: 22 additions & 0 deletions pkg/agent/core/state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
name: "BasicScaleup",
systemMetrics: &core.SystemMetrics{
LoadAverage1Min: 0.30,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
},
Expand All @@ -58,6 +59,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
name: "MismatchedApprovedNoScaledown",
systemMetrics: &core.SystemMetrics{
LoadAverage1Min: 0.0, // ordinarily would like to scale down
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
},
Expand All @@ -77,6 +79,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
name: "MismatchedApprovedNoScaledownButVMAtMaximum",
systemMetrics: &core.SystemMetrics{
LoadAverage1Min: 0.0, // ordinarily would like to scale down
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
},
Expand All @@ -96,6 +99,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
name: "BasicLFCScaleup",
systemMetrics: &core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
},
Expand Down Expand Up @@ -131,6 +135,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
name: "CanScaleUpWithoutExpectedLFCMetrics",
systemMetrics: &core.SystemMetrics{
LoadAverage1Min: 0.30,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
},
Expand All @@ -152,6 +157,7 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
name: "CanScaleToBoundsWithoutExpectedMetrics",
systemMetrics: &core.SystemMetrics{
LoadAverage1Min: 0.30,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
},
Expand Down Expand Up @@ -237,6 +243,8 @@ func Test_DesiredResourcesFromMetricsOrRequestedUpscaling(t *testing.T) {
LFCToMemoryRatio: lo.ToPtr(0.75),
LFCWindowSizeMinutes: lo.ToPtr(5),
LFCMinWaitBeforeDownscaleMinutes: lo.ToPtr(5),
CPUStableZoneRatio: lo.ToPtr(0.0),
CPUMixedZoneRatio: lo.ToPtr(0.0),
},
// these don't really matter, because we're not using (*State).NextActions()
NeonVMRetryWait: time.Second,
Expand Down Expand Up @@ -325,6 +333,8 @@ var DefaultInitialStateConfig = helpers.InitialStateConfig{
LFCToMemoryRatio: lo.ToPtr(0.75),
LFCWindowSizeMinutes: lo.ToPtr(5),
LFCMinWaitBeforeDownscaleMinutes: lo.ToPtr(15),
CPUStableZoneRatio: lo.ToPtr(0.0),
CPUMixedZoneRatio: lo.ToPtr(0.0),
},
NeonVMRetryWait: 5 * time.Second,
PluginRequestTick: 5 * time.Second,
Expand Down Expand Up @@ -450,6 +460,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) {
clockTick().AssertEquals(duration("0.2s"))
lastMetrics := core.SystemMetrics{
LoadAverage1Min: 0.3,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -549,6 +560,7 @@ func TestBasicScaleUpAndDownFlow(t *testing.T) {
// Set metrics back so that desired resources should now be zero
lastMetrics = core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -652,6 +664,7 @@ func TestPeriodicPluginRequest(t *testing.T) {

metrics := core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -742,6 +755,7 @@ func TestPartialUpscaleThenFull(t *testing.T) {
clockTick()
metrics := core.SystemMetrics{
LoadAverage1Min: 1.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 12345678,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -886,6 +900,7 @@ func TestDeniedDownscalingIncreaseAndRetry(t *testing.T) {
clockTick()
metrics := core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -1162,6 +1177,7 @@ func TestRequestedUpscale(t *testing.T) {
clockTick()
lastMetrics := core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -1305,11 +1321,13 @@ func TestDownscalePivotBack(t *testing.T) {

initialMetrics := core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
newMetrics := core.SystemMetrics{
LoadAverage1Min: 0.3,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -1547,6 +1565,7 @@ func TestBoundsChangeRequiresDownsale(t *testing.T) {
// Set metrics so the desired resources are still 2 CU
metrics := core.SystemMetrics{
LoadAverage1Min: 0.3,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -1662,6 +1681,7 @@ func TestBoundsChangeRequiresUpscale(t *testing.T) {
// Set metrics so the desired resources are still 2 CU
metrics := core.SystemMetrics{
LoadAverage1Min: 0.3,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -1762,6 +1782,7 @@ func TestFailedRequestRetry(t *testing.T) {
clockTick()
metrics := core.SystemMetrics{
LoadAverage1Min: 0.3,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 0.0,
MemoryCachedBytes: 0.0,
}
Expand Down Expand Up @@ -1919,6 +1940,7 @@ func TestMetricsConcurrentUpdatedDuringDownscale(t *testing.T) {
// the actual metrics we got in the actual logs
metrics := core.SystemMetrics{
LoadAverage1Min: 0.0,
LoadAverage5Min: 0.0,
MemoryUsageBytes: 150589570, // 143.6 MiB
MemoryCachedBytes: 0.0,
}
Expand Down
19 changes: 19 additions & 0 deletions pkg/api/vminfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,16 @@ type ScalingConfig struct {
// LFCWindowSizeMinutes dictates the minimum duration we must use during internal calculations
// of the rate of increase in LFC working set size.
LFCWindowSizeMinutes *int `json:"lfcWindowSizeMinutes,omitempty"`

// CPUStableZoneRatio is the ratio of the stable load zone size relative to load5.
// For example, a value of 0.25 means that the stable zone will be load5±25%.
CPUStableZoneRatio *float64 `json:"cpuStableZoneRatio,omitempty"`

// CPUMixedZoneRatio is the ratio of the mixed load zone size relative to load5.
// Since the mixed zone starts after the stable zone, the values CPUStableZoneRatio=0.25 and
// CPUMixedZoneRatio=0.15 mean that the stable zone will be from 0.75*load5 to 1.25*load5, and
// the mixed zone will be from 0.6*load5 to 0.75*load5, and from 1.25*load5 to 1.4*load5.
CPUMixedZoneRatio *float64 `json:"cpuMixedZoneRatio,omitempty"`
}

// WithOverrides returns a new copy of defaults, where fields set in overrides replace the ones in
Expand Down Expand Up @@ -401,6 +411,13 @@ func (defaults ScalingConfig) WithOverrides(overrides *ScalingConfig) ScalingCon
defaults.LFCMinWaitBeforeDownscaleMinutes = lo.ToPtr(*overrides.LFCMinWaitBeforeDownscaleMinutes)
}

if overrides.CPUStableZoneRatio != nil {
defaults.CPUStableZoneRatio = lo.ToPtr(*overrides.CPUStableZoneRatio)
}
if overrides.CPUMixedZoneRatio != nil {
defaults.CPUMixedZoneRatio = lo.ToPtr(*overrides.CPUMixedZoneRatio)
}

return defaults
}

Expand Down Expand Up @@ -453,6 +470,8 @@ func (c *ScalingConfig) validate(requireAll bool) error {
erc.Whenf(ec, c.LFCToMemoryRatio == nil, "%s is a required field", ".lfcToMemoryRatio")
erc.Whenf(ec, c.LFCWindowSizeMinutes == nil, "%s is a required field", ".lfcWindowSizeMinutes")
erc.Whenf(ec, c.LFCMinWaitBeforeDownscaleMinutes == nil, "%s is a required field", ".lfcMinWaitBeforeDownscaleMinutes")
erc.Whenf(ec, c.CPUStableZoneRatio == nil, "%s is a required field", ".cpuStableZoneRatio")
erc.Whenf(ec, c.CPUMixedZoneRatio == nil, "%s is a required field", ".cpuMixedZoneRatio")
}

// heads-up! some functions elsewhere depend on the concrete return type of this function.
Expand Down

0 comments on commit d436c3d

Please sign in to comment.